# %%
# Exploratory cells that report basic statistics of the biomedical
# entity-linking CSVs: row count, missing mentions, number of distinct
# entity ids, and the longest mention string.
import pandas as pd

# %%
#############################
# How much data
# Alternative dataset (swap in to inspect it instead):
# data_path = '../biomedical_data_import/bc2gm_test.csv'
data_path = '../biomedical_data_import/bc5cdr-chemical_train.csv'
df = pd.read_csv(data_path)
len(df)

# %%

# %%
# bc2gm row counts (recorded from an earlier run):
# train: 288939
# test: 1034

# %%
################################
# check for NA values
# Displays the rows whose 'mention' cell is missing.
df[df['mention'].isna()]

# %%
##############################
# how many labels?
data_path = '../biomedical_data_import/bc2gm_test.csv'
df = pd.read_csv(data_path)
# Sort by (type name, value): identical order to a plain sort when all ids
# share one type, but does not raise TypeError when ints and strings are mixed.
id_list = sorted(
    set(df['entity_id'].to_list()),
    key=lambda value: (type(value).__name__, value),
)

# %%
len(id_list)

# %%
# Print every entity id that is not a plain int.
for entity_id in id_list:
    if not isinstance(entity_id, int):
        print(entity_id)

# %%
# bc2gm (recorded from an earlier run):
# 61641 distinct entity ids - far more than expected

# %%
###############################
# max length
# Missing mentions are skipped (len() of NaN would raise); default=0 keeps
# the original result for an empty column.
max_length = max(
    (len(str(mention)) for mention in df['mention'].dropna()),
    default=0,
)
print(max_length)

# %%