# domain_mapping/biomedical_data_import/original_data_processing.py
# %%
from collections import defaultdict
# %%
# Which biomedical dataset to process; swap for the other three dataset names as needed.
data_name = 'bc2gm' # and the other 3 names
# Paths relative to ../biomedical/<data_name>/ :
#   train_path — dictionary file, one "CUI||mention" pair per line
#   test_path  — directory holding the processed test split (0.concept)
train_path = 'test_dictionary.txt'
test_path = 'processed_test_refined'
# %%
# Build the training vocabulary: CUI -> set of lowercased surface mentions.
# Each dictionary line is expected to be "CUI||mention".
vocab = defaultdict(set)
with open(f'../biomedical/{data_name}/{train_path}', encoding='utf-8') as f:
    for line in f:
        term_list = line.strip().split('||')
        # Guard against blank or malformed lines, which would otherwise
        # raise IndexError on term_list[1].
        if len(term_list) < 2:
            continue
        vocab[term_list[0]].add(term_list[1].lower())
# Assign a stable integer id to each CUI (in vocab insertion order) and keep
# both directions of the mapping. Avoid shadowing the builtin `id`.
cui_to_id = {cui: idx for idx, cui in enumerate(vocab)}
id_to_cui = {idx: cui for cui, idx in cui_to_id.items()}
# Re-key the mention sets by integer entity id instead of CUI string.
vocab_entity_id_mentions = {cui_to_id[cui]: mentions for cui, mentions in vocab.items()}
# Flatten into two parallel lists: vocab_mentions[i] belongs to entity vocab_ids[i].
vocab_mentions, vocab_ids = [], []
for entity_id, mentions in vocab_entity_id_mentions.items():
    vocab_mentions.extend(mentions)
    vocab_ids.extend([entity_id] * len(mentions))
# %%
# Load the test split: each line of 0.concept is "...||mention||CUI"
# (mention is the second-to-last field, gold CUI the last).
test_mentions, test_cuis = [], []
with open(f'../biomedical/{data_name}/{test_path}/0.concept', encoding='utf-8') as f:
    for line in f:
        term_list = line.strip().split('||')
        # Skip blank/malformed lines; term_list[-2] would raise IndexError
        # on a line with fewer than two fields.
        if len(term_list) < 2:
            continue
        test_cuis.append(term_list[-1])
        test_mentions.append(term_list[-2].lower())