63 lines
1.5 KiB
Python
63 lines
1.5 KiB
Python
# %%
|
|
import pandas as pd
|
|
import re
|
|
|
|
# %%
|
|
# Load the training split. skipinitialspace=True makes pandas skip the
# spaces that follow each delimiter.
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)

# Load the test split.
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)

# Load the entity table.
data_path = '../esAppMod_data_import/entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)

# Lookup from entity id to entity name. The two columns are zipped directly
# instead of looping over iterrows(): iterrows() builds a Series per row and
# coerces each row to a single dtype, which is slow and can alter the id type.
# As with the original loop, a repeated id keeps the name of its last row.
id2label = dict(zip(entity_df['id'], entity_df['name']))
|
|
|
|
|
|
# %%
|
|
# Bare expression: shows the training DataFrame when this cell is run in an
# interactive, cell-based session; it has no effect in a plain script run.
train_df
|
|
# %%
|
|
|
|
def extract_acronym_mapping(names):
    """Map each parenthesised acronym to the name it was found in.

    For every string in ``names`` that contains a parenthesised run of word
    characters, e.g. ``"Java Server Pages (JSP)"``, the acronym (``"JSP"``)
    becomes a key and the name with that parenthetical removed
    (``"Java Server Pages"``) becomes its value. Only the first such
    parenthetical in a name is considered.

    Args:
        names: Iterable of entity names. Entries that are not strings (for
            example the NaN pandas uses for empty cells) are skipped instead
            of raising ``TypeError``.

    Returns:
        dict mapping acronym -> name without the parenthetical. When several
        names carry the same acronym, the one iterated last wins. Names that
        consist of nothing but the parenthetical are skipped, because they
        would yield an empty term.
    """
    # Compiled once, outside the loop.
    acronym_pattern = re.compile(r"\((\w+)\)")

    mapping = {}
    for name in names:
        if not isinstance(name, str):
            continue

        # Find acronym in parentheses
        match = acronym_pattern.search(name)
        if match is None:
            continue

        # Cut the parenthetical out at the exact position where it was found.
        # Anchoring a second pattern on the whole name only worked for names
        # made purely of word characters and spaces that end with the
        # parenthetical; any other name (hyphen, dot, trailing text) fell
        # through unchanged and kept its own "(ACR)" inside the term.
        before = name[:match.start()].rstrip()
        after = name[match.end():].lstrip()
        core_term = f"{before} {after}".strip()

        # Add to dictionary
        if core_term:
            mapping[match.group(1)] = core_term
    return mapping
|
|
|
|
# Unique entity names of the training split.
names = set(train_df['entity_name'].to_list())

# Extract mappings.
# The extractor keeps the last name it sees for an acronym, and a set of
# strings can iterate in a different order on every interpreter run (string
# hashes are randomised), so feeding it the raw set makes acronym collisions
# resolve differently from run to run. Sorting pins the result. Non-string
# entries (e.g. NaN from empty cells) are dropped first: they cannot be
# ordered against strings and carry no acronym anyway.
acronym_mapping = extract_acronym_mapping(
    sorted(name for name in names if isinstance(name, str))
)
print(acronym_mapping)
|
|
# %%
|
|
# Drop the acronym 'E'. pop() with a default replaces `del` so the cell can be
# re-run: after the first run every key is lowercase and 'E' no longer exists,
# which made `del` raise KeyError.
acronym_mapping.pop('E', None)  # too many false matches

# Case-fold both sides of the mapping.
acronym_mapping = {key.lower(): value.lower() for key, value in acronym_mapping.items()}

# Word-bounded regex patterns for both replacement directions. re.escape keeps
# any regex metacharacter inside an acronym or term (parentheses, '+', '.')
# literal instead of letting it change the meaning of the pattern; for plain
# words the escaped pattern matches exactly the same text as before.
abbrev_to_term = {rf'\b{re.escape(key)}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{re.escape(value)}\b': key for key, value in acronym_mapping.items()}
|
|
|
|
|
|
# %%
|
|
# Bare expression: shows the acronym-pattern -> term dictionary when this cell
# is run interactively; it has no effect in a plain script run.
abbrev_to_term
|
|
# %%
|
|
# Bare expression: shows the term-pattern -> acronym dictionary when this cell
# is run interactively; it has no effect in a plain script run.
term_to_abbrev
|
|
|
|
# %%
|
|
# Bare expression: shows the lowercased acronym -> term dictionary when this
# cell is run interactively; it has no effect in a plain script run.
acronym_mapping
|
|
# %%
|