# %%
import multiprocessing

import pandas as pd

# NOTE(review): removed unused `from tqdm import tqdm` import.


# %%
#########################
# we first process training data
def process_train_to_csv(data_path, output):
    """Convert a '||'-separated (entity_id, mention) dictionary file to CSV.

    Cleaning steps:
      * rows whose mention is missing (NaN, or the literal string 'NaN')
        are dropped;
      * rows whose entity_id holds several ids joined by '|' are expanded
        into one row per id, appended after the surviving rows.

    Parameters:
        data_path: path to the '||'-separated input file (no header).
        output:    path of the CSV written (columns: entity_id, mention).
    """
    input_df = pd.read_csv(
        data_path,
        sep=r'\|\|',          # literal '||'; raw string avoids invalid-escape warning
        engine='python',      # regex separators require the python engine
        skipinitialspace=True,
        header=None,
    )
    input_df = input_df.rename(columns={0: 'entity_id', 1: 'mention'})

    df = input_df.copy()
    new_rows = []
    for idx, row in input_df.iterrows():
        index = row['entity_id']
        mention = row['mention']

        # omit missing mentions (pandas also parses a literal 'NaN' as NaN)
        if mention == 'NaN' or pd.isna(mention):
            df = df.drop(index=[idx])
            continue

        # expand '|'-joined multi-id rows into one row per id;
        # str() guards against a purely numeric column parsed as ints
        if '|' in str(index):
            df = df.drop(index=[idx])
            for new_index in str(index).split('|'):
                new_rows.append({'entity_id': new_index, 'mention': mention})

    df_new = pd.DataFrame(new_rows, columns=df.columns)
    df = pd.concat([df, df_new], ignore_index=True)
    df = df.reset_index(drop=True)
    df.to_csv(output, index=False)


# %%
# (data_path, output) pairs for the training dictionaries
name_list = [
    ('../biomedical/bc2gm/test_dictionary.txt', 'bc2gm_train.csv'),
    ('../biomedical/bc5cdr-chemical/test_dictionary.txt', 'bc5cdr-chemical_train.csv'),
    ('../biomedical/bc5cdr-disease/test_dictionary.txt', 'bc5cdr-disease_train.csv'),
    ('../biomedical/ncbi/test_dictionary.txt', 'ncbi_train.csv'),
]

# for data_path, output in name_list:
#     process_train_to_csv(data_path, output)

if __name__ == "__main__":
    # Create a pool of workers
    num_workers = 4  # set number of cpus to use
    with multiprocessing.Pool(num_workers) as pool:
        # starmap: an iterable of [(1, 2), (3, 4)] results in [func(1, 2), func(3, 4)]
        pool.starmap(process_train_to_csv, name_list)


# %%
#################################################
# process test data
def is_int_string(s):
    """Return True if `s` parses as an int, False otherwise.

    NOTE(review): currently unused by this script; kept for interactive use.
    """
    try:
        int(s)
        return True
    except ValueError:
        return False


def process_test_to_csv(data_path, output):
    """Convert a '||'-separated concept file to a (mention, entity_id) CSV.

    Input columns 0-2 are discarded; column 3 is the mention and column 4
    the entity id.  Rows whose entity_id holds several ids joined by '|'
    are expanded into one row per id, appended after the surviving rows.

    Parameters:
        data_path: path to the '||'-separated input file (no header).
        output:    path of the CSV written (columns: mention, entity_id).
    """
    input_df = pd.read_csv(
        data_path,
        sep=r'\|\|',          # literal '||'; raw string avoids invalid-escape warning
        engine='python',
        skipinitialspace=True,
        header=None,
    )
    input_df = input_df.drop(columns=[0, 1, 2])
    input_df = input_df.rename(columns={3: 'mention', 4: 'entity_id'})

    df = input_df.copy()
    new_rows = []
    for idx, row in input_df.iterrows():
        index = row['entity_id']
        # expand '|'-joined multi-id rows into one row per id;
        # str() guards against a purely numeric column parsed as ints
        if '|' in str(index):
            mention = row['mention']
            df = df.drop(index=[idx])
            for new_index in str(index).split('|'):
                new_rows.append({'entity_id': new_index, 'mention': mention})

    df_new = pd.DataFrame(new_rows, columns=df.columns)
    df = pd.concat([df, df_new], ignore_index=True)
    df = df.reset_index(drop=True)
    # NOTE(review): removed a stray `df['entity_id'].isna()` whose result was discarded
    df.to_csv(output, index=False)


# %%
# (data_path, output) pairs for the test concept files
name_list = [
    ('../biomedical/bc2gm/processed_test_refined/0.concept', 'bc2gm_test.csv'),
    ('../biomedical/bc5cdr-chemical/processed_test_refined/0.concept', 'bc5cdr-chemical_test.csv'),
    ('../biomedical/bc5cdr-disease/processed_test_refined/0.concept', 'bc5cdr-disease_test.csv'),
    ('../biomedical/ncbi/processed_test_refined/0.concept', 'ncbi_test.csv'),
]

# for data_path, output in name_list:
#     process_test_to_csv(data_path, output)

if __name__ == "__main__":
    # Create a pool of workers
    num_workers = 4  # set number of cpus to use
    with multiprocessing.Pool(num_workers) as pool:
        # starmap: an iterable of [(1, 2), (3, 4)] results in [func(1, 2), func(3, 4)]
        pool.starmap(process_test_to_csv, name_list)

# %%

# %%