domain_mapping/esAppMod_data_import/process_data.py

86 lines
2.0 KiB
Python
Raw Permalink Normal View History

# %%
import json
import pandas as pd
##########################################
# %%
# import entity information
# %%
# Build the entity-id -> entity-name lookup table from the entity CSV.
# skipinitialspace=True tells pandas to skip spaces that follow a delimiter.
data_path = 'entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)
# Pair the 'id' and 'name' columns directly rather than looping over
# iterrows(): iterrows() materialises a Series per row and may upcast the
# values of a row to a common dtype. As with a row-by-row assignment, a
# repeated id keeps the name from its last occurrence.
id2label = dict(zip(entity_df['id'], entity_df['name']))
# Load the training annotations. The encoding is passed explicitly so the
# file is decoded identically on every platform instead of relying on the
# locale-dependent default (assumes the file is UTF-8, the standard JSON
# encoding).
data_path = '../esAppMod/train.json'
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten the JSON into one row per mention. Every entry under data["data"]
# provides a list of "mentions" and one "entity_id"; the entry's own key is
# not used, so only the values are iterated.
rows = []
for entity_data in data["data"].values():
    entity_id = entity_data["entity_id"]
    # Raises KeyError if the id is absent from the entity table.
    entity_name = id2label[entity_id]
    rows.extend(
        {
            "mention": mention,
            "entity_id": entity_id,
            "entity_name": entity_name,
        }
        for mention in entity_data["mentions"]
    )

# Naming the columns keeps the 'entity_id' lookup below valid even when the
# JSON produced no rows at all.
train_df = pd.DataFrame(rows, columns=["mention", "entity_id", "entity_name"])
# Distinct entity ids seen in the training split.
train_class_set = set(train_df['entity_id'])
# %%
train_df.to_csv('train.csv', index=False)
##########################################
# %%
# Load the inference (test) annotations. The encoding is passed explicitly
# so the file is decoded identically on every platform instead of relying on
# the locale-dependent default (assumes the file is UTF-8, the standard JSON
# encoding).
data_path = '../esAppMod/infer.json'
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Build one row per entry. Every entry under data["data"] provides a single
# "mention" and its "entity_id"; the entry's own key is not used, so only the
# values are iterated. An id absent from the entity table raises KeyError.
rows = [
    {
        "mention": entity_data["mention"],
        "entity_id": entity_data["entity_id"],
        "entity_name": id2label[entity_data["entity_id"]],
    }
    for entity_data in data["data"].values()
]

# Naming the columns keeps the 'entity_id' lookup below valid even when the
# JSON produced no rows at all.
test_df = pd.DataFrame(rows, columns=["mention", "entity_id", "entity_name"])
# Distinct entity ids seen in the test split.
test_class_set = set(test_df['entity_id'])
# %%
test_df.to_csv('test.csv', index=False)
# %%
# Entity ids that occur in the test set but not in the train set; an empty
# result means every test class is also present in the training data.
test_class_set.difference(train_class_set)
# %%