# domain_mapping/esAppMod_data_import/process_data.py

# %%
import json
import pandas as pd

##########################################
# %%
# import entity information

# %%
# Build the entity-id -> entity-name lookup used to label every mention below.
# skipinitialspace=True drops blanks that follow a delimiter in entity.csv.
data_path = 'entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)
# Pair the two columns directly rather than walking the frame with iterrows(),
# which builds a Series per row and may coerce dtypes across columns.
# When an id occurs more than once the last row wins, as in a plain loop.
id2label = dict(zip(entity_df['id'], entity_df['name']))


# Flatten the training JSON into one CSV row per (mention, entity) pair.
data_path = '../esAppMod/train.json'
# Read as UTF-8 explicitly so parsing does not depend on the platform's
# default text encoding.
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Collected output rows: one dict per mention
rows = []

# Each record under data["data"] holds a list of mentions for a single entity;
# the record keys themselves are not needed, so iterate the values only.
for entity_data in data["data"].values():
    mentions = entity_data["mentions"]
    entity_id = entity_data["entity_id"]
    # A KeyError here means the id is missing from the entity.csv lookup.
    entity_name = id2label[entity_id]

    # Emit one row per mention, all sharing this record's entity id and name
    rows.extend(
        {
            "mention": mention,
            "entity_id": entity_id,
            "entity_name": entity_name,
        }
        for mention in mentions
    )

# Naming the columns keeps the frame (and the column lookup below) valid even
# when no rows were collected.
train_df = pd.DataFrame(rows, columns=["mention", "entity_id", "entity_name"])

# Distinct entity ids present in the training data
train_class_set = set(train_df['entity_id'].to_list())

# %%
train_df.to_csv('train.csv', index=False)
##########################################
# %%
# Flatten the inference JSON into one CSV row per record.
data_path = '../esAppMod/infer.json'
# Read as UTF-8 explicitly so parsing does not depend on the platform's
# default text encoding.
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Collected output rows: one dict per record
rows = []

# Unlike the training file, each record here carries a single "mention";
# the record keys themselves are not needed, so iterate the values only.
for entity_data in data["data"].values():
    mention = entity_data["mention"]
    entity_id = entity_data["entity_id"]
    # A KeyError here means the id is missing from the entity.csv lookup.
    entity_name = id2label[entity_id]

    rows.append(
        {
            "mention": mention,
            "entity_id": entity_id,
            "entity_name": entity_name,
        })


# Naming the columns keeps the frame (and the column lookup below) valid even
# when no rows were collected.
test_df = pd.DataFrame(rows, columns=["mention", "entity_id", "entity_name"])

# Distinct entity ids present in the test data
test_class_set = set(test_df['entity_id'].to_list())

# %%
test_df.to_csv('test.csv', index=False)

# %%
# Entity ids that occur in the test set but not in the train set.
# An empty result means every test class also appears in the training data.
test_class_set.difference(train_class_set)

# %%
First commit - added classification-based mapping for esAppMod data 2025-01-13 19:05:13 +09:00			`# %%`
			`import json`
			`import pandas as pd`

			`##########################################`
			`# %%`
			`# import entity information`

			`# %%`
			`data_path = 'entity.csv'`
			`entity_df = pd.read_csv(data_path, skipinitialspace=True)`
			`id2label = {}`
			`for _, row in entity_df.iterrows():`
			`id2label[row['id']] = row['name']`


			`# Load the JSON file`
			`data_path = '../esAppMod/train.json'`
			`with open(data_path, 'r') as file:`
			`data = json.load(file)`

			`# Initialize an empty list to store the rows`
			`rows = []`

			`# Loop through all entities in the JSON`
			`for entity_key, entity_data in data["data"].items():`
			`mentions = entity_data["mentions"]`
			`entity_id = entity_data["entity_id"]`
			`entity_name = id2label[entity_id]`

			`# Add each mention and its entity_id to the rows list`
			`for mention in mentions:`
			`rows.append(`
			`{`
			`"mention": mention,`
			`"entity_id": entity_id,`
			`"entity_name": entity_name`
			`})`

			`# Create a DataFrame from the rows`
			`train_df = pd.DataFrame(rows)`

			`train_class_set = set(train_df['entity_id'].to_list())`

			`# %%`
			`train_df.to_csv('train.csv', index=False)`
			`##########################################`
			`# %%`
			`# Load the JSON file`
			`data_path = '../esAppMod/infer.json'`
			`with open(data_path, 'r') as file:`
			`data = json.load(file)`

			`# Initialize an empty list to store the rows`
			`rows = []`

			`# Loop through all entities in the JSON`
			`for entity_key, entity_data in data["data"].items():`
			`mention = entity_data["mention"]`
			`entity_id = entity_data["entity_id"]`
			`entity_name = id2label[entity_id]`

			`# Add each mention and its entity_id to the rows list`
			`rows.append(`
			`{`
			`"mention": mention,`
			`"entity_id": entity_id,`
			`"entity_name": entity_name`
			`})`



			`# Create a DataFrame from the rows`
			`test_df = pd.DataFrame(rows)`

			`test_class_set = (set(test_df['entity_id'].to_list()))`

			`# %%`
			`test_df.to_csv('test.csv', index=False)`

			`# %%`
			`# this shows that the training data can be found in the train set`
			`test_class_set - train_class_set`

			`# %%`