# (removed HTML-extraction residue: line-count / file-size / language banner text)
# %%
# stdlib
import multiprocessing

# third-party
import pandas as pd
from tqdm import tqdm  # NOTE(review): not used below — kept for interactive sessions; confirm before removing
# %%
#########################
# we first process training data
def process_train_to_csv(data_path, output):
    """Read a '||'-separated (entity_id, mention) file and write a tidy CSV.

    Each input line holds an entity id and a mention separated by '||'.
    Rows whose mention is missing (pandas NaN or the literal string 'NaN')
    are dropped.  An entity_id field may itself contain several ids joined
    by a single '|'; such a row is expanded into one row per id, all
    sharing the original mention.

    Args:
        data_path: path to the '||'-separated input file (no header row).
        output: path of the CSV to write (columns: entity_id, mention).
    """
    # data_path = '../esAppMod_data_import/parent_train.csv'
    # r'\|\|' is a regex (hence engine='python'): a literal '||' separator.
    # The original used f'\|\|' — a pointless f-string whose '\|' escape is
    # deprecated; the raw string is the correct spelling of the same pattern.
    input_df = pd.read_csv(data_path, sep=r'\|\|', engine='python',
                           skipinitialspace=True, header=None)
    input_df = input_df.rename(columns={0: 'entity_id', 1: 'mention'})

    df = input_df.copy()
    drop_idx = []   # rows to remove in a single drop after the loop
    new_rows = []   # expanded (entity_id, mention) rows

    for idx, row in df.iterrows():
        mention = row['mention']

        # omit missing mentions (NaN or the literal string 'NaN')
        if mention == 'NaN' or pd.isna(mention):
            drop_idx.append(idx)
            continue

        # handle possible multiple '|'-joined ids in the entity_id field;
        # str() guards against pandas having inferred a numeric column
        entity_id = str(row['entity_id'])
        if '|' in entity_id:
            drop_idx.append(idx)
            for new_index in entity_id.split('|'):
                new_rows.append({'entity_id': new_index, 'mention': mention})

    # one drop avoids re-allocating the frame on every matching row
    df = df.drop(index=drop_idx)
    df_new = pd.DataFrame(new_rows, columns=df.columns)
    df = pd.concat([df, df_new], ignore_index=True)
    df = df.reset_index(drop=True)

    df.to_csv(output, index=False)
# %%
# (data_path, output) pairs for the training dictionaries
name_list = [
    ('../biomedical/bc2gm/test_dictionary.txt', 'bc2gm_train.csv'),
    ('../biomedical/bc5cdr-chemical/test_dictionary.txt', 'bc5cdr-chemical_train.csv'),
    ('../biomedical/bc5cdr-disease/test_dictionary.txt', 'bc5cdr-disease_train.csv'),
    ('../biomedical/ncbi/test_dictionary.txt', 'ncbi_train.csv'),
]

if __name__ == "__main__":
    # Process the four dictionaries in parallel; starmap unpacks each
    # (data_path, output) tuple into process_train_to_csv's arguments:
    # an iterable of [(1, 2), (3, 4)] results in [func(1, 2), func(3, 4)].
    num_workers = 4  # set number of cpus to use
    with multiprocessing.Pool(num_workers) as pool:
        pool.starmap(process_train_to_csv, name_list)
# %%
#################################################
# process test data

def is_int_string(s):
    """Return True when *s* parses as a base-10 integer, else False."""
    try:
        int(s)
    except ValueError:
        return False
    return True
def process_test_to_csv(data_path, output):
    """Read a '||'-separated .concept test file and write a tidy CSV.

    Each input line has five '||'-separated fields; the first three are
    discarded, the fourth is the mention and the fifth the entity id.
    An entity_id field may contain several ids joined by a single '|';
    such a row is expanded into one row per id, all sharing the mention.

    Args:
        data_path: path to the '||'-separated input file (no header row).
        output: path of the CSV to write (columns: mention, entity_id).
    """
    # data_path = '../esAppMod_data_import/parent_train.csv'
    # r'\|\|' is a regex (hence engine='python'): a literal '||' separator.
    # The original used f'\|\|' — a pointless f-string whose '\|' escape is
    # deprecated; the raw string is the correct spelling of the same pattern.
    input_df = pd.read_csv(data_path, sep=r'\|\|', engine='python',
                           skipinitialspace=True, header=None)
    input_df = input_df.drop(columns=[0, 1, 2])
    input_df = input_df.rename(columns={3: 'mention', 4: 'entity_id'})

    df = input_df.copy()
    drop_idx = []   # rows to remove in a single drop after the loop
    new_rows = []   # expanded (entity_id, mention) rows

    for idx, row in df.iterrows():
        # handle possible multiple '|'-joined ids in the entity_id field;
        # str() guards against pandas having inferred a numeric column
        entity_id = str(row['entity_id'])
        if '|' in entity_id:
            mention = row['mention']
            drop_idx.append(idx)
            for new_index in entity_id.split('|'):
                new_rows.append({'entity_id': new_index, 'mention': mention})

    # one drop avoids re-allocating the frame on every matching row
    df = df.drop(index=drop_idx)
    df_new = pd.DataFrame(new_rows, columns=df.columns)
    df = pd.concat([df, df_new], ignore_index=True)
    df = df.reset_index(drop=True)

    # NOTE(review): the original computed df['entity_id'].isna() and discarded
    # the result (a no-op); rows with missing entity ids are still kept, so
    # removing that line preserves behavior exactly.
    df.to_csv(output, index=False)
# %%
# (data_path, output) pairs for the test .concept files
name_list = [
    ('../biomedical/bc2gm/processed_test_refined/0.concept', 'bc2gm_test.csv'),
    ('../biomedical/bc5cdr-chemical/processed_test_refined/0.concept', 'bc5cdr-chemical_test.csv'),
    ('../biomedical/bc5cdr-disease/processed_test_refined/0.concept', 'bc5cdr-disease_test.csv'),
    ('../biomedical/ncbi/processed_test_refined/0.concept', 'ncbi_test.csv'),
]

if __name__ == "__main__":
    # Process the four test files in parallel; starmap unpacks each
    # (data_path, output) tuple into process_test_to_csv's arguments:
    # an iterable of [(1, 2), (3, 4)] results in [func(1, 2), func(3, 4)].
    num_workers = 4  # set number of cpus to use
    with multiprocessing.Pool(num_workers) as pool:
        pool.starmap(process_test_to_csv, name_list)

# %%