# domain_mapping/biomedical_data_import/original_data_processing.py
# %%
from collections import defaultdict
# %%
# Which biomedical dataset to process; swap for the other three dataset names as needed.
data_name = 'bc2gm' # and the other 3 names
# Paths relative to ../biomedical/<data_name>/ :
#   train_path — dictionary file, one "CUI||mention" pair per line
#   test_path  — directory holding the processed test split (0.concept)
train_path = 'test_dictionary.txt'
test_path = 'processed_test_refined'
# %%
# Build the training vocabulary: CUI -> set of lowercased surface mentions.
# Each dictionary line is expected to be "CUI||mention".
vocab = defaultdict(set)
with open(f'../biomedical/{data_name}/{train_path}', encoding='utf-8') as f:
    for line in f:
        term_list = line.strip().split('||')
        # Guard against blank or malformed lines, which would otherwise
        # raise IndexError on term_list[1].
        if len(term_list) < 2:
            continue
        vocab[term_list[0]].add(term_list[1].lower())
# Assign a stable integer id to each CUI (in vocab insertion order) and keep
# both directions of the mapping. Avoid shadowing the builtin `id`.
cui_to_id = {cui: idx for idx, cui in enumerate(vocab)}
id_to_cui = {idx: cui for cui, idx in cui_to_id.items()}
# Re-key the mention sets by integer entity id instead of CUI string.
vocab_entity_id_mentions = {cui_to_id[cui]: mentions for cui, mentions in vocab.items()}
# Flatten into two parallel lists: vocab_mentions[i] belongs to entity vocab_ids[i].
vocab_mentions, vocab_ids = [], []
for entity_id, mentions in vocab_entity_id_mentions.items():
    vocab_mentions.extend(mentions)
    vocab_ids.extend([entity_id] * len(mentions))
# %%
# Load the test split: each line of 0.concept is "...||mention||CUI"
# (mention is the second-to-last field, gold CUI the last).
test_mentions, test_cuis = [], []
with open(f'../biomedical/{data_name}/{test_path}/0.concept', encoding='utf-8') as f:
    for line in f:
        term_list = line.strip().split('||')
        # Skip blank/malformed lines; term_list[-2] would raise IndexError
        # on a line with fewer than two fields.
        if len(term_list) < 2:
            continue
        test_cuis.append(term_list[-1])
        test_mentions.append(term_list[-2].lower())