2025-01-13 19:05:13 +09:00
|
|
|
# %%
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# Load the training split into train_df.
|
2025-01-15 20:09:15 +09:00
|
|
|
data_path = '../esAppMod_data_import/train.csv'
|
|
|
|
# Alternative input, currently disabled:
# data_path = '../esAppMod_data_import/parent_train.csv'
|
2025-01-13 19:05:13 +09:00
|
|
|
# skipinitialspace=True tells the parser to skip spaces after each delimiter.
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
|
|
|
|
|
|
|
|
|
|
|
# Load the test split into test_df.
|
2025-01-15 20:09:15 +09:00
|
|
|
data_path = '../esAppMod_data_import/test.csv'
|
|
|
|
# Alternative input, currently disabled:
# data_path = '../esAppMod_data_import/parent_test.csv'
|
2025-01-13 19:05:13 +09:00
|
|
|
# skipinitialspace=True tells the parser to skip spaces after each delimiter.
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
|
|
|
|
|
|
|
# Load the entity table into entity_df; its 'id' and 'name' columns are read
# when the id2label mapping is built.
|
2025-01-15 20:09:15 +09:00
|
|
|
data_path = '../esAppMod_data_import/entity.csv'
|
2025-01-13 19:05:13 +09:00
|
|
|
# skipinitialspace=True tells the parser to skip spaces after each delimiter.
entity_df = pd.read_csv(data_path, skipinitialspace=True)
|
|
|
|
# Map each entity id to its name.  Pairing the two columns directly avoids
# building a Series per row with iterrows() (slow, and it does not preserve
# column dtypes across a row).  As with the row loop, a repeated id keeps the
# name from its last occurrence.
id2label = dict(zip(entity_df['id'], entity_df['name']))
|
|
|
|
|
2025-01-14 17:34:17 +09:00
|
|
|
# Write the training rows, ordered by entity_id, to out.md as a Markdown table.
train_df.sort_values(by='entity_id').to_markdown(buf='out.md')
|
2025-01-13 19:05:13 +09:00
|
|
|
|
|
|
|
# %%
|
2025-01-15 20:09:15 +09:00
|
|
|
# Load the exported prediction results; the 'class_prediction' column is the
# one consumed further down in this cell.
data_path = '../train/class_bert_augmentation/prediction/exports/result.csv'
|
2025-01-13 19:05:13 +09:00
|
|
|
# NOTE(review): unlike the other CSV loads in this file, skipinitialspace is
# not passed here -- confirm that is intentional.
prediction_df = pd.read_csv(data_path)
|
|
|
|
|
|
|
|
# Translate every predicted class id into its entity name.  An id that is
# missing from id2label raises KeyError, as a plain dict lookup does.
predicted_entity_list = [
    id2label[class_id] for class_id in prediction_df['class_prediction']
]
prediction_df['predicted_name'] = predicted_entity_list
|
|
|
|
# Put the test rows and their predictions side by side; axis=1 joins the two
# frames column-wise, aligned on their row index.
new_df = pd.concat([test_df, prediction_df], axis=1)

# Keep only the rows whose true entity_id differs from the predicted class.
mismatch_mask = new_df['entity_id'].ne(new_df['class_prediction'])
mismatch_df = new_df[mismatch_mask]
|
2025-01-14 17:34:17 +09:00
|
|
|
# Number of mismatched rows.  This is a bare expression, so its value is only
# shown when the cell is run interactively; a plain script run prints nothing.
len(mismatch_df)
|
2025-01-13 19:05:13 +09:00
|
|
|
|
|
|
|
# %%
|
|
|
|
# print the top 10 offending classes
|
2025-01-15 20:09:15 +09:00
|
|
|
# mask1 = mismatch_df['entity_id'] != 434
|
|
|
|
# mask2 = mismatch_df['entity_id'] != 451
|
|
|
|
# mask3 = mismatch_df['entity_id'] != 452
|
|
|
|
# mask= mask1 & mask2 & mask3
|
|
|
|
# masked_df = mismatch_df[mask]
|
|
|
|
# print(masked_df['entity_id'].value_counts()[:10])
|
2025-01-13 19:05:13 +09:00
|
|
|
# Print the ten entity ids with the most mismatches.  value_counts() sorts by
# descending count and is indexed by entity_id, so take the leading rows with
# head(10), which is explicitly positional, instead of slicing with [:10] on
# an index made of numeric labels.
print(mismatch_df['entity_id'].value_counts().head(10))
|
2025-01-15 20:09:15 +09:00
|
|
|
# No class filter is applied here: masked_df is the same object as mismatch_df.
masked_df = mismatch_df
|
2025-01-13 19:05:13 +09:00
|
|
|
|
2025-01-14 17:34:17 +09:00
|
|
|
|
2025-01-13 19:05:13 +09:00
|
|
|
# %%
|
|
|
|
# Convert the whole dataframe to a Markdown string and display it
|
|
|
|
# print the mismatch_df
|
2025-01-15 20:09:15 +09:00
|
|
|
# Render the selected rows, ordered by entity_id, as a Markdown table on stdout.
print(masked_df.sort_values(by='entity_id').to_markdown())
|
2025-01-13 19:05:13 +09:00
|
|
|
|
2025-01-14 17:34:17 +09:00
|
|
|
# %%
|
|
|
|
# Save the mismatched rows, row index included, to error.csv.
mismatch_df.to_csv('error.csv', index=True)
|
2025-01-13 19:05:13 +09:00
|
|
|
|
|
|
|
# %%
|
|
|
|
# let us see the test mentions
|
2025-01-14 17:34:17 +09:00
|
|
|
# entity_id whose mismatched test rows are selected in this cell
select_value = 268
|
2025-01-13 19:05:13 +09:00
|
|
|
# Show the mismatched rows whose true entity_id equals select_value.
select_mask = mismatch_df['entity_id'].eq(select_value)
mismatch_df[select_mask]
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# let us see the train mentions
|
2025-01-15 20:09:15 +09:00
|
|
|
# entity_id whose training rows are selected in this cell
select_value = 130
|
2025-01-13 19:05:13 +09:00
|
|
|
# Show the training rows whose entity_id equals select_value.
select_mask = train_df['entity_id'].eq(select_value)
train_df[select_mask]
|
|
|
|
|
|
|
|
|
|
|
|
|