# %%
"""Compute BERT sub-token lengths for each gene mention in the BC2GM training set.

Loads the BC2GM training CSV, tokenizes the `mention` column with the
bert-base-uncased tokenizer, and appends a `token_length` column holding the
number of WordPiece tokens per mention.
"""
from transformers import AutoTokenizer
import pandas as pd

data_path = '../biomedical_data_import/bc2gm_train.csv'

# BUG FIX: pd.DataFrame(data_path) wraps the path *string* in a one-cell frame;
# read_csv actually parses the file into rows/columns.
df = pd.read_csv(data_path)

# Load the tokenizer (e.g., BERT tokenizer)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# %%
# Calculate token lengths.
# NOTE(review): assumes the CSV has a 'mention' column of strings — confirm
# against the export script that produced bc2gm_train.csv.
df['token_length'] = df['mention'].apply(lambda x: len(tokenizer.tokenize(x)))

# Display the dataset with token lengths
print(df)