86 lines
2.0 KiB
Python
86 lines
2.0 KiB
Python
# %%
|
|
import json
|
|
import pandas as pd
|
|
|
|
##########################################
|
|
# %%
|
|
# import entity information
|
|
|
|
# %%
|
|
# Read the entity table; skipinitialspace drops blanks that follow a delimiter.
data_path = 'entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)

# Map each entity id to its name. Pairing the two columns directly avoids
# building a Series per row with iterrows(); when an id occurs more than once
# the last row wins, exactly as with a row-by-row assignment loop.
id2label = dict(zip(entity_df['id'], entity_df['name']))
|
|
|
|
|
|
# Load the training annotations. JSON text is UTF-8, so the encoding is given
# explicitly instead of being left to the platform default.
data_path = '../esAppMod/train.json'
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten the annotations into one row per mention. Only the values under
# "data" are needed; the keys are not used.
rows = []
for entity_data in data["data"].values():
    mentions = entity_data["mentions"]
    entity_id = entity_data["entity_id"]
    # Raises KeyError if the id is absent from the entity table.
    entity_name = id2label[entity_id]

    # Every mention of this entity shares the same id and name.
    rows.extend(
        {
            "mention": mention,
            "entity_id": entity_id,
            "entity_name": entity_name,
        }
        for mention in mentions
    )

# Naming the columns keeps the frame's schema (and the CSV header) intact even
# when no mentions were collected, so the column lookup below cannot fail.
train_df = pd.DataFrame(rows, columns=["mention", "entity_id", "entity_name"])

# Entity ids that occur in the training data.
train_class_set = set(train_df['entity_id'].to_list())

# %%
train_df.to_csv('train.csv', index=False)
|
|
##########################################
|
|
# %%
|
|
# Load the inference annotations. JSON text is UTF-8, so the encoding is given
# explicitly instead of being left to the platform default.
data_path = '../esAppMod/infer.json'
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Build one row per entry. Each value under "data" holds a single "mention"
# together with its "entity_id"; the keys are not used.
rows = []
for entity_data in data["data"].values():
    mention = entity_data["mention"]
    entity_id = entity_data["entity_id"]
    # Raises KeyError if the id is absent from the entity table.
    entity_name = id2label[entity_id]

    rows.append(
        {
            "mention": mention,
            "entity_id": entity_id,
            "entity_name": entity_name,
        }
    )

# Naming the columns keeps the frame's schema (and the CSV header) intact even
# when no entries were collected, so the column lookup below cannot fail.
test_df = pd.DataFrame(rows, columns=["mention", "entity_id", "entity_name"])

# Entity ids that occur in the test data.
test_class_set = set(test_df['entity_id'].to_list())

# %%
test_df.to_csv('test.csv', index=False)
|
|
|
|
# %%
|
|
# Entity ids that occur in the test set but not in the train set; an empty
# result shows that every test class can also be found in the train set.
test_class_set.difference(train_class_set)
|
|
|
|
# %%
|