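# Loads a '||'-delimited concept dictionary and a test-set `.concept` file
# from ../biomedical/<data_name>/ into in-memory mention / identifier lists.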
# %%
from collections import defaultdict

# %%
# Dataset selection and file locations. All paths used by this script are
# resolved relative to ../biomedical/<data_name>/.
data_name = 'bc2gm'  # and the other 3 names
# NOTE(review): the variable is called `train_path` but it points at the
# *test* dictionary file -- confirm this is intended.
train_path = 'test_dictionary.txt'
# Directory (under the dataset folder) that holds the `.concept` test files.
test_path = 'processed_test_refined'

# %%
# Build the concept dictionary. Every non-blank line of the dictionary file is
# split on '||': field 0 is the concept identifier (`cui`) and field 1 is one
# of its mention strings. Mentions are lower-cased and de-duplicated per
# concept by storing them in a set.
vocab = defaultdict(set)
# An explicit encoding keeps the read independent of the platform default.
with open(f'../biomedical/{data_name}/{train_path}', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line has no '||' field to index.
            continue
        term_list = line.split('||')
        vocab[term_list[0]].add(term_list[1].lower())

# Give each concept a dense integer id in first-seen order and keep both
# directions of the mapping, plus id -> set of mentions.
# The loop variable is `entity_id` rather than `id` so the builtin `id()` is
# not shadowed at module level for the rest of the session.
cui_to_id, id_to_cui = {}, {}
vocab_entity_id_mentions = {}
for entity_id, (cui, mentions) in enumerate(vocab.items()):
    cui_to_id[cui] = entity_id
    id_to_cui[entity_id] = cui
    vocab_entity_id_mentions[entity_id] = mentions

# Flatten into two parallel lists: vocab_mentions[i] is a mention string and
# vocab_ids[i] is the integer id of the concept it belongs to. The order of
# mentions within one concept follows set iteration order, but the two lists
# always stay aligned with each other.
vocab_mentions, vocab_ids = [], []
for entity_id, mentions in vocab_entity_id_mentions.items():
    vocab_mentions.extend(mentions)
    vocab_ids.extend([entity_id] * len(mentions))

# %%
# Load the test queries from the `0.concept` file. Every non-blank line is
# split on '||': the last field is kept verbatim in `test_cuis` and the
# second-to-last field is lower-cased into `test_mentions`. The two lists are
# parallel (same index = same line of the file).
test_mentions, test_cuis = [], []
# An explicit encoding keeps the read independent of the platform default.
with open(
    f'../biomedical/{data_name}/{test_path}/0.concept', encoding='utf-8'
) as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line splits to a single field, so indexing [-2] would
            # raise IndexError after `test_cuis` had already been appended.
            continue
        term_list = line.split('||')
        test_cuis.append(term_list[-1])
        test_mentions.append(term_list[-2].lower())