domain_mapping/biomedical_data_import/process_to_df.py

135 lines
4.1 KiB
Python
Raw Permalink Normal View History

# %%
import pandas as pd
from tqdm import tqdm
import multiprocessing
# %%
#########################
# we first process training data
def process_train_to_csv(data_path, output):
    """Convert a ``||``-delimited dictionary file into an ``entity_id,mention`` CSV.

    The input is read without a header row; column 0 is named ``entity_id``
    and column 1 is named ``mention``.  Rows whose mention is missing (or the
    literal text ``NaN``) are dropped.  A row whose ``entity_id`` holds several
    ids joined by ``|`` is replaced by one row per id; those expanded rows are
    written after all rows that needed no expansion.

    Every field is read as text.  Without that, a file whose ids are all
    numeric would be parsed as integers, and the ``|`` membership test would
    raise ``TypeError`` (and leading zeros in ids would be lost).

    Args:
        data_path: Path of the ``||``-delimited input file.
        output: Path of the CSV file to write (with header, without index).
    """
    raw = pd.read_csv(
        data_path,
        sep=r'\|\|',  # regex for a literal "||"; a single "|" stays inside the field
        engine='python',
        skipinitialspace=True,
        header=None,
        dtype=str,
    )
    df = raw.rename(columns={0: 'entity_id', 1: 'mention'})

    # Omit rows that carry no mention.
    has_mention = df['mention'].notna() & df['mention'].ne('NaN')
    df = df[has_mention]

    # Rows whose id field lists several ids separated by "|".
    # na=False keeps rows with a missing id untouched instead of failing.
    is_multi = df['entity_id'].str.contains('|', regex=False, na=False)
    multi = df[is_multi]

    # One output row per individual id; any extra columns are left empty.
    new_rows = [
        {'entity_id': single_id, 'mention': mention}
        for ids, mention in zip(multi['entity_id'], multi['mention'])
        for single_id in ids.split('|')
    ]
    df_new = pd.DataFrame(new_rows, columns=df.columns)

    result = pd.concat([df[~is_multi], df_new], ignore_index=True)
    result.to_csv(output, index=False)
# %%
# (source dictionary file, destination CSV) pairs handed to process_train_to_csv.
name_list = [
    (
        '../biomedical/bc2gm/test_dictionary.txt',
        'bc2gm_train.csv',
    ),
    (
        '../biomedical/bc5cdr-chemical/test_dictionary.txt',
        'bc5cdr-chemical_train.csv',
    ),
    (
        '../biomedical/bc5cdr-disease/test_dictionary.txt',
        'bc5cdr-disease_train.csv',
    ),
    (
        '../biomedical/ncbi/test_dictionary.txt',
        'ncbi_train.csv',
    ),
]

# For debugging, the same work can be done sequentially by looping over
# name_list and calling process_train_to_csv(data_path, output) on each pair.
if __name__ == "__main__":
    # Number of worker processes to spread the conversions over.
    num_workers = 4
    pool = multiprocessing.Pool(processes=num_workers)
    with pool:
        # starmap unpacks each (data_path, output) tuple into positional
        # arguments: [(1, 2), (3, 4)] -> [func(1, 2), func(3, 4)].
        pool.starmap(process_train_to_csv, name_list)
# %%
#################################################
# process test data
def is_int_string(s):
    """Return True if ``int(s)`` succeeds, False otherwise.

    The check follows the parsing rules of the built-in ``int``, so
    surrounding whitespace and a leading sign are accepted.  Values that
    ``int`` cannot handle at all (for example ``None`` or an infinite float)
    yield False instead of propagating an exception.

    Args:
        s: The value to test, normally a string.

    Returns:
        bool: Whether ``s`` can be converted with ``int``.
    """
    try:
        int(s)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type such as None; OverflowError: infinite float.
        return False
    return True
def process_test_to_csv(data_path, output):
    """Convert a ``||``-delimited concept file into a ``mention,entity_id`` CSV.

    The input is read without a header row.  Columns 0-2 are discarded,
    column 3 is named ``mention`` and column 4 is named ``entity_id``.  A row
    whose ``entity_id`` holds several ids joined by ``|`` is replaced by one
    row per id; those expanded rows are written after all rows that needed no
    expansion.  Rows with a missing ``entity_id`` are kept unchanged.

    Every field is read as text.  Without that, a file whose ids are all
    numeric would be parsed as integers, and the ``|`` membership test would
    raise ``TypeError``.

    Args:
        data_path: Path of the ``||``-delimited input file.
        output: Path of the CSV file to write (with header, without index).
    """
    raw = pd.read_csv(
        data_path,
        sep=r'\|\|',  # regex for a literal "||"; a single "|" stays inside the field
        engine='python',
        skipinitialspace=True,
        header=None,
        dtype=str,
    )
    df = raw.drop(columns=[0, 1, 2]).rename(columns={3: 'mention', 4: 'entity_id'})

    # Rows whose id field lists several ids separated by "|".
    # na=False keeps rows with a missing id untouched instead of failing.
    is_multi = df['entity_id'].str.contains('|', regex=False, na=False)
    multi = df[is_multi]

    # One output row per individual id; any extra columns are left empty.
    new_rows = [
        {'entity_id': single_id, 'mention': mention}
        for ids, mention in zip(multi['entity_id'], multi['mention'])
        for single_id in ids.split('|')
    ]
    df_new = pd.DataFrame(new_rows, columns=df.columns)

    result = pd.concat([df[~is_multi], df_new], ignore_index=True)
    result.to_csv(output, index=False)
# %%
# (source concept file, destination CSV) pairs handed to process_test_to_csv.
name_list = [
    (
        '../biomedical/bc2gm/processed_test_refined/0.concept',
        'bc2gm_test.csv',
    ),
    (
        '../biomedical/bc5cdr-chemical/processed_test_refined/0.concept',
        'bc5cdr-chemical_test.csv',
    ),
    (
        '../biomedical/bc5cdr-disease/processed_test_refined/0.concept',
        'bc5cdr-disease_test.csv',
    ),
    (
        '../biomedical/ncbi/processed_test_refined/0.concept',
        'ncbi_test.csv',
    ),
]

# For debugging, the same work can be done sequentially by looping over
# name_list and calling process_test_to_csv(data_path, output) on each pair.
if __name__ == "__main__":
    # Number of worker processes to spread the conversions over.
    num_workers = 4
    pool = multiprocessing.Pool(processes=num_workers)
    with pool:
        # starmap unpacks each (data_path, output) tuple into positional
        # arguments: [(1, 2), (3, 4)] -> [func(1, 2), func(3, 4)].
        pool.starmap(process_test_to_csv, name_list)
# %%
# %%