# %%
import re

import pandas as pd

# %%
# import training file
data_path = '../esAppMod_data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)
# import test file
data_path = '../esAppMod_data_import/test.csv'
test_df = pd.read_csv(data_path, skipinitialspace=True)
# import entity file
data_path = '../esAppMod_data_import/entity.csv'
entity_df = pd.read_csv(data_path, skipinitialspace=True)

# Map each entity id to its name. Zipping the two columns avoids the
# per-row Series construction (and dtype upcasting) of iterrows().
id2label = dict(zip(entity_df['id'], entity_df['name']))

# %%
train_df


# %%
def extract_acronym_mapping(names):
    """Build an acronym -> full-term mapping from names like "Term (ACR)".

    For every name that contains a parenthesised single-word acronym, the
    acronym becomes the key and the name with that parenthetical removed
    becomes the value.

    Args:
        names: Iterable of candidate names. Non-string entries (for example
            NaN values coming from empty CSV cells) are ignored.

    Returns:
        dict mapping each acronym to its core term. If several names share
        an acronym, the one iterated last wins. Names consisting of nothing
        but the parenthetical are skipped, since they carry no term.
    """
    acronym_pattern = re.compile(r"\((\w+)\)")
    mapping = {}
    for name in names:
        if not isinstance(name, str):
            continue
        # Find acronym in parentheses
        match = acronym_pattern.search(name)
        if match is None:
            continue
        # Cut the "(ACR)" span out of the name. Slicing around the match
        # (rather than anchoring a second regex on the whole name) also
        # handles names with punctuation or trailing text, which otherwise
        # kept the parenthetical inside the "core" term.
        before = name[:match.start()].rstrip()
        after = name[match.end():].lstrip()
        core_term = f"{before} {after}".strip()
        if core_term:
            # Add to dictionary
            mapping[match.group(1)] = core_term
    return mapping


names = set(train_df['entity_name'].to_list())
# Extract mappings
acronym_mapping = extract_acronym_mapping(names)
print(acronym_mapping)

# %%
# 'E' produces too many false matches; pop() tolerates it being absent.
acronym_mapping.pop('E', None)
acronym_mapping = {key.lower(): value.lower() for key, value in acronym_mapping.items()}
# Word-bounded regex patterns for substitution in either direction. The
# interpolated text is escaped so that regex metacharacters in an acronym
# or term (".", "+", "(", ...) are matched literally.
abbrev_to_term = {rf'\b{re.escape(key)}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{re.escape(value)}\b': key for key, value in acronym_mapping.items()}

# %%
abbrev_to_term

# %%
term_to_abbrev

# %%
acronym_mapping

# %%