# domain_mapping/analysis_biomedical/data_properties.py
#
# Exploratory cell-based script: dataset size, missing mentions,
# number of distinct entity ids, and maximum mention length.

# %%
import pandas as pd
# %%
#############################
# How much data
# data_path = '../biomedical_data_import/bc2gm_test.csv'
# data_path = '../biomedical_data_import/bc2gm_test.csv'
# Load the dataset under inspection; the trailing expression shows its
# row count when the cell is run interactively.
data_path = '../biomedical_data_import/bc5cdr-chemical_train.csv'
df = pd.read_csv(data_path)
df.shape[0]
# %%
# %%
# bc2gm:
# train: 288939
# test: 1034
# %%
################################
# check for NA values
# Select the rows whose 'mention' value is missing; the result is displayed
# when the cell is run interactively.
missing_mention = df['mention'].isna()
df[missing_mention]
# %%
##############################
# how many labels?
# Reload the bc2gm test split and collect its distinct entity ids, sorted.
data_path = '../biomedical_data_import/bc2gm_test.csv'
df = pd.read_csv(data_path)
distinct_ids = set(df['entity_id'].to_list())
id_list = sorted(distinct_ids)
# %%
# Number of distinct labels.
len(id_list)
# %%
# Print every entity id that is not an int, to spot unexpected label values.
# The loop variable is named entity_id so the builtin `id` is not shadowed
# for the cells that follow.
for entity_id in id_list:
    if not isinstance(entity_id, int):
        print(entity_id)
# %%
# bc2gm:
# 61641 distinct entity ids - far more labels than expected
# %%
###############################
# max length
# Longest mention, measured with len(). Missing mentions (NaN) are skipped
# because len() raises TypeError on them; an empty column yields 0.
max_length = max(
    (len(mention) for mention in df['mention'].dropna()),
    default=0,
)
print(max_length)
# %%