# %%
from collections import defaultdict

# %%
data_name = 'bc2gm'  # and the other 3 names
train_path = 'test_dictionary.txt'
test_path = 'processed_test_refined'

# %%
# Build the dictionary vocabulary: CUI -> set of lower-cased mentions.
# Every dictionary line is split on '||'; field 0 is used as the CUI and
# field 1 as the mention text.
vocab = defaultdict(set)
# Explicit encoding so decoding does not depend on the platform locale.
with open(f'../biomedical/{data_name}/{train_path}', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line carries no entry and would otherwise raise
            # IndexError on the field lookup below.
            continue
        term_list = line.split('||')
        vocab[term_list[0]].add(term_list[1].lower())

# Give every CUI an integer id in the order it was first seen in the
# dictionary, and keep the mapping in both directions.
# `entity_id` is used rather than `id` so the builtin `id` stays usable in
# later cells.
cui_to_id, id_to_cui = {}, {}
for entity_id, cui in enumerate(vocab):
    cui_to_id[cui] = entity_id
    id_to_cui[entity_id] = cui

# Same mention sets as `vocab` (shared objects, not copies), keyed by
# integer id.
vocab_entity_id_mentions = {
    cui_to_id[cui]: mentions for cui, mentions in vocab.items()
}

# Flatten into two parallel lists: vocab_mentions[i] is a mention string and
# vocab_ids[i] is the integer id of the entity it belongs to.
vocab_mentions, vocab_ids = [], []
for entity_id, mentions in vocab_entity_id_mentions.items():
    vocab_mentions.extend(mentions)
    vocab_ids.extend([entity_id] * len(mentions))

# %%
# Load the test queries. Every line is split on '||'; the last field is
# taken as the gold CUI and the second-to-last as the mention, which is
# lower-cased to match the dictionary side.
test_mentions, test_cuis = [], []
with open(f'../biomedical/{data_name}/{test_path}/0.concept', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of failing on the [-2] lookup.
            continue
        term_list = line.split('||')
        test_cuis.append(term_list[-1])
        test_mentions.append(term_list[-2].lower())