135 lines
4.1 KiB
Python
135 lines
4.1 KiB
Python
# %%
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
import multiprocessing
|
|
|
|
# %%
|
|
#########################
|
|
# we first process training data
|
|
def process_train_to_csv(data_path, output):
    """Convert a double-pipe separated dictionary file into a two-column CSV.

    Each line of ``data_path`` is split on the literal double pipe (``||``);
    the first field becomes ``entity_id`` and the second becomes ``mention``.

    * Rows whose mention is missing (or the literal string ``'NaN'``) are
      dropped.
    * Rows whose ``entity_id`` holds several ids joined by a single ``|`` are
      replaced by one row per id, each carrying the same mention. These
      expanded rows are appended after all single-id rows.

    Args:
        data_path: Path of the input dictionary file.
        output: Path of the CSV file to write (header included, no index).
    """
    # Raw string: the separator is a regular expression matching a literal
    # "||". dtype=str keeps identifiers as text, so purely numeric id columns
    # are not converted to numbers (which would break the '|' handling below).
    df = pd.read_csv(
        data_path,
        sep=r'\|\|',
        engine='python',
        skipinitialspace=True,
        header=None,
        dtype=str,
    )
    df = df.rename(columns={0: 'entity_id', 1: 'mention'})

    # Omit rows without a usable mention.
    has_mention = df['mention'].notna() & (df['mention'] != 'NaN')
    df = df[has_mention]

    # Rows with a missing entity_id count as single-id rows and are kept.
    has_multiple_ids = df['entity_id'].str.contains('|', regex=False, na=False)
    single_id_rows = df[~has_multiple_ids]
    multi_id_rows = df[has_multiple_ids]

    # One output row per id for every multi-id row.
    expanded_rows = [
        {'entity_id': entity_id, 'mention': mention}
        for joined_ids, mention in zip(
            multi_id_rows['entity_id'], multi_id_rows['mention']
        )
        for entity_id in joined_ids.split('|')
    ]

    frames = [single_id_rows]
    if expanded_rows:
        frames.append(pd.DataFrame(expanded_rows, columns=df.columns))

    result = pd.concat(frames, ignore_index=True)
    result.to_csv(output, index=False)
|
|
|
|
|
|
# %%
|
|
# (input dictionary file, output CSV) pairs passed to process_train_to_csv.
name_list = [
    ('../biomedical/bc2gm/test_dictionary.txt', 'bc2gm_train.csv'),
    ('../biomedical/bc5cdr-chemical/test_dictionary.txt', 'bc5cdr-chemical_train.csv'),
    ('../biomedical/bc5cdr-disease/test_dictionary.txt', 'bc5cdr-disease_train.csv'),
    ('../biomedical/ncbi/test_dictionary.txt', 'ncbi_train.csv'),
]

# Serial alternative to the pool below:
#     for data_path, output in name_list:
#         process_train_to_csv(data_path, output)

if __name__ == "__main__":
    # Run the conversions in a pool of worker processes.
    num_workers = 4  # number of CPUs to use
    with multiprocessing.Pool(processes=num_workers) as pool:
        # starmap unpacks every tuple into positional arguments:
        # [(1, 2), (3, 4)] results in [func(1, 2), func(3, 4)].
        pool.starmap(process_train_to_csv, name_list)
|
|
|
|
|
|
# %%
|
|
#################################################
|
|
# process test data
|
|
|
|
def is_int_string(s):
    """Return True if ``int(s)`` succeeds, False otherwise.

    Values that ``int`` rejects with either ``ValueError`` (e.g. ``'abc'``)
    or ``TypeError`` (e.g. ``None``) yield False instead of raising.
    Note that anything ``int`` accepts counts as an integer here, so a float
    such as ``1.5`` also returns True.
    """
    try:
        int(s)
    except (TypeError, ValueError):
        return False
    return True
|
|
|
|
def process_test_to_csv(data_path, output):
    """Convert a double-pipe separated concept file into a two-column CSV.

    Each line of ``data_path`` is split on the literal double pipe (``||``).
    The first three fields are discarded, the fourth becomes ``mention`` and
    the fifth becomes ``entity_id``.

    * Rows without an ``entity_id`` are dropped.
    * Rows whose ``entity_id`` holds several ids joined by a single ``|`` are
      replaced by one row per id, each carrying the same mention. These
      expanded rows are appended after all single-id rows.

    Args:
        data_path: Path of the input concept file.
        output: Path of the CSV file to write (header included, no index).
    """
    # Raw string: the separator is a regular expression matching a literal
    # "||". dtype=str keeps identifiers as text, so purely numeric id columns
    # are not converted to numbers (which would break the '|' handling below).
    df = pd.read_csv(
        data_path,
        sep=r'\|\|',
        engine='python',
        skipinitialspace=True,
        header=None,
        dtype=str,
    )
    df = df.drop(columns=[0, 1, 2])
    df = df.rename(columns={3: 'mention', 4: 'entity_id'})

    # Cleanup: the original isna() call discarded its result, so it filtered
    # nothing. Rows with no entity id are dropped here.
    # NOTE(review): assumed intent - confirm dropping is the desired behavior.
    df = df[df['entity_id'].notna()]

    has_multiple_ids = df['entity_id'].str.contains('|', regex=False, na=False)
    single_id_rows = df[~has_multiple_ids]
    multi_id_rows = df[has_multiple_ids]

    # One output row per id for every multi-id row.
    expanded_rows = [
        {'entity_id': entity_id, 'mention': mention}
        for joined_ids, mention in zip(
            multi_id_rows['entity_id'], multi_id_rows['mention']
        )
        for entity_id in joined_ids.split('|')
    ]

    frames = [single_id_rows]
    if expanded_rows:
        frames.append(pd.DataFrame(expanded_rows, columns=df.columns))

    result = pd.concat(frames, ignore_index=True)
    result.to_csv(output, index=False)
|
|
|
|
|
|
# %%
|
|
# (input concept file, output CSV) pairs passed to process_test_to_csv.
name_list = [
    ('../biomedical/bc2gm/processed_test_refined/0.concept', 'bc2gm_test.csv'),
    ('../biomedical/bc5cdr-chemical/processed_test_refined/0.concept', 'bc5cdr-chemical_test.csv'),
    ('../biomedical/bc5cdr-disease/processed_test_refined/0.concept', 'bc5cdr-disease_test.csv'),
    ('../biomedical/ncbi/processed_test_refined/0.concept', 'ncbi_test.csv'),
]

# Serial alternative to the pool below:
#     for data_path, output in name_list:
#         process_test_to_csv(data_path, output)

if __name__ == "__main__":
    # Run the conversions in a pool of worker processes.
    num_workers = 4  # number of CPUs to use
    with multiprocessing.Pool(processes=num_workers) as pool:
        # starmap unpacks every tuple into positional arguments:
        # [(1, 2), (3, 4)] results in [func(1, 2), func(3, 4)].
        pool.starmap(process_test_to_csv, name_list)
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# %%
|