domain_mapping/analysis/label_acronym.py

63 lines
1.5 KiB
Python

# %%
import pandas as pd
import re
# %%
# Training split.
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# Test split.
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)
# Entity table, plus a lookup from each row's 'id' to its 'name'.
data_path = '../esAppMod_data_import/entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)
id2label = {row['id']: row['name'] for _, row in entity_df.iterrows()}
# %%
# Display the training frame for inspection.
train_df
# %%
def extract_acronym_mapping(names):
    """Build an acronym -> expanded-term mapping from entity names.

    A name contributes an entry when it contains a parenthesised token made
    only of word characters, e.g. ``"Amazon Web Services (AWS)"``. The acronym
    is the token inside the first such parentheses and the term is the text
    that precedes it, with surrounding whitespace removed.

    Args:
        names: Iterable of entity names. Non-string items (for example NaN
            values from empty CSV cells) are ignored.

    Returns:
        A dict mapping each acronym to its term. When several names share an
        acronym, the one iterated last wins.
    """
    # Compile once instead of looking the pattern up on every iteration.
    acronym_re = re.compile(r"\((\w+)\)")
    mapping = {}
    for name in names:
        # re.search raises TypeError on non-strings such as float NaN.
        if not isinstance(name, str):
            continue
        match = acronym_re.search(name)
        if match is None:
            continue
        # Take the text before the parentheses directly. The previous
        # whole-string substitution silently left "(ACR)" inside the term
        # whenever the name held punctuation or trailing text.
        core_term = name[:match.start()].strip()
        if not core_term:
            # Nothing precedes the acronym, so there is no term to map.
            continue
        mapping[match.group(1)] = core_term
    return mapping
# Distinct entity names found in the training split.
names = set(train_df['entity_name'].tolist())
# Derive the acronym lookup from those names and print it for inspection.
acronym_mapping = extract_acronym_mapping(names)
print(acronym_mapping)
# %%
# 'E' causes too many false matches; pop() with a default does not raise
# KeyError when the data contains no such acronym.
acronym_mapping.pop('E', None)
# Lowercase both sides of the mapping.
acronym_mapping = {key.lower(): value.lower() for key, value in acronym_mapping.items()}
# Word-bounded regex patterns in both directions. re.escape keeps characters
# such as '.', '+' or parentheses in a key/term from being read as regex syntax.
abbrev_to_term = {rf'\b{re.escape(key)}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{re.escape(value)}\b': key for key, value in acronym_mapping.items()}
# %%
abbrev_to_term
# %%
term_to_abbrev
# %%
acronym_mapping
# %%