59 lines
1002 B
Python
59 lines
1002 B
Python
|
# %%
|
||
|
import pandas as pd
|
||
|
|
||
|
# %%
|
||
|
#############################
# How much data
# Other dataset that can be inspected the same way:
#   '../biomedical_data_import/bc2gm_test.csv'
data_path = '../biomedical_data_import/bc5cdr-chemical_train.csv'
df = pd.read_csv(data_path)
# Number of rows in the loaded table (the first entry of the frame's shape).
df.shape[0]
|
||
|
|
||
|
# %%
|
||
|
|
||
|
# %%
|
||
|
# bc2gm:
|
||
|
# train: 288939
|
||
|
# test: 1034
|
||
|
|
||
|
# %%
|
||
|
################################
# check for NA values
# Select the rows whose 'mention' value is missing, using a boolean mask.
df.loc[df['mention'].isna()]
|
||
|
|
||
|
|
||
|
|
||
|
# %%
|
||
|
##############################
# how many labels?
# Reload from the bc2gm test split; this rebinds the `df` loaded earlier.
data_path = '../biomedical_data_import/bc2gm_test.csv'
df = pd.read_csv(data_path)

# Distinct entity ids in sorted order. sorted() accepts any iterable, so the
# set does not have to be copied into a list first.
id_list = sorted(set(df['entity_id'].to_list()))

# %%
# Number of distinct entity ids.
len(id_list)
|
||
|
|
||
|
# %%
|
||
|
# Print every entry of id_list that is not an int.
# The loop variable is named entity_id so the builtin id() is not shadowed
# for the rest of the session.
for entity_id in id_list:
    if not isinstance(entity_id, int):
        print(entity_id)
|
||
|
# %%
|
||
|
# bc2gm:
|
||
|
# 61641 labels - far more than expected
|
||
|
|
||
|
# %%
|
||
|
###############################
# max length
# Length of the longest mention. Missing values are dropped first because
# pandas represents them as float NaN, on which len() raises TypeError;
# default=0 keeps the result at 0 when no mentions remain.
max_length = max(
    (len(mention) for mention in df['mention'].dropna()),
    default=0,
)
print(max_length)
|
||
|
|
||
|
# %%
|