# %%
# Flatten the esAppMod entity-linking JSON files into train.csv / test.csv,
# one row per mention with the columns: mention, entity_id, entity_name.
import json

import pandas as pd

##########################################
# %%
# Import entity information: build a lookup from entity id to entity name.

# %%
data_path = 'entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)

# Pair the two columns directly rather than materialising a Series per row
# with iterrows(); as before, the last row wins if an id is repeated.
id2label = dict(zip(entity_df['id'], entity_df['name']))

# Column order shared by both output frames. Passing it explicitly keeps the
# columns present even when a JSON file yields no rows, so the class-set
# lookups below do not fail with a KeyError on an empty frame.
columns = ["mention", "entity_id", "entity_name"]

# Load the training JSON file. JSON is UTF-8 by specification, so do not rely
# on the platform's default text encoding.
data_path = '../esAppMod/train.json'
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Each training entity carries a list of mentions; emit one row per mention.
# Only the values are needed, the keys of data["data"] are not used.
rows = []
for entity_data in data["data"].values():
    mentions = entity_data["mentions"]
    entity_id = entity_data["entity_id"]
    # Raises KeyError if the JSON references an id missing from entity.csv.
    entity_name = id2label[entity_id]
    for mention in mentions:
        rows.append(
            {
                "mention": mention,
                "entity_id": entity_id,
                "entity_name": entity_name,
            }
        )

# Create a DataFrame from the rows
train_df = pd.DataFrame(rows, columns=columns)

# Distinct entity ids seen in the training data.
train_class_set = set(train_df['entity_id'].to_list())

# %%
train_df.to_csv('train.csv', index=False)

##########################################
# %%
# Load the inference JSON file (same UTF-8 handling as above).
data_path = '../esAppMod/infer.json'
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Unlike the training file, each inference entry holds a single "mention"
# value rather than a "mentions" list, so it maps to exactly one row.
rows = []
for entity_data in data["data"].values():
    mention = entity_data["mention"]
    entity_id = entity_data["entity_id"]
    entity_name = id2label[entity_id]
    rows.append(
        {
            "mention": mention,
            "entity_id": entity_id,
            "entity_name": entity_name,
        }
    )

# Create a DataFrame from the rows
test_df = pd.DataFrame(rows, columns=columns)

# Distinct entity ids seen in the inference data.
test_class_set = set(test_df['entity_id'].to_list())

# %%
test_df.to_csv('test.csv', index=False)

# %%
# Entity ids that occur in the test set but never in the train set. An empty
# result means every test class is also present in the training data.
test_class_set - train_class_set

# %%