# domain_mapping/biomedical_data_import/original_data_processing.py

# %%
from collections import defaultdict

# %%
# Dataset folder under ../biomedical/ to process ('bc2gm' here, and the other 3 names).
data_name = "bc2gm"
# File name of the '||'-separated dictionary inside the dataset folder.
train_path = "test_dictionary.txt"
# Sub-directory of the dataset folder that holds the test '.concept' file.
test_path = "processed_test_refined"

# %%
# Build the dictionary vocabulary. Each non-blank line is split on '||':
# field 0 is used as the CUI key and field 1 (lower-cased) as one of its
# mentions, so `vocab` maps CUI -> set of lower-cased mentions.
vocab = defaultdict(set)
# Explicit encoding so parsing does not depend on the platform locale
# (assumes the data files are UTF-8 -- TODO confirm).
with open(f'../biomedical/{data_name}/{train_path}', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing with IndexError on the missing second field.
            continue
        term_list = line.split('||')
        vocab[term_list[0]].add(term_list[1].lower())

# Assign each CUI a dense integer id in first-seen order, with the inverse map.
cui_to_id = {cui: entity_id for entity_id, cui in enumerate(vocab)}
id_to_cui = {entity_id: cui for cui, entity_id in cui_to_id.items()}
# Same mention sets as `vocab`, keyed by integer id instead of CUI.
vocab_entity_id_mentions = {
    cui_to_id[cui]: mentions for cui, mentions in vocab.items()
}

# Flatten into two parallel lists: vocab_mentions[i] belongs to entity id
# vocab_ids[i]. Mentions are sorted because set iteration order for strings
# varies between interpreter runs (hash randomisation), which would make the
# list positions irreproducible.
vocab_mentions, vocab_ids = [], []
for entity_id, mentions in vocab_entity_id_mentions.items():
    ordered_mentions = sorted(mentions)
    vocab_mentions.extend(ordered_mentions)
    vocab_ids.extend([entity_id] * len(ordered_mentions))

# %%
# Load the test queries from the single file '0.concept'. Each non-blank line
# is split on '||': the last field is stored as the CUI and the second-to-last
# field (lower-cased) as the mention. The two lists are index-aligned.
test_mentions, test_cuis = [], []
# Explicit encoding so parsing does not depend on the platform locale
# (assumes the data files are UTF-8 -- TODO confirm).
with open(f'../biomedical/{data_name}/{test_path}/0.concept', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing with IndexError on the missing mention field.
            continue
        term_list = line.split('||')
        test_cuis.append(term_list[-1])
        test_mentions.append(term_list[-2].lower())