domain_mapping/analysis_biomedical/measure_tokenization_length.py

# %%
from transformers import AutoTokenizer
import pandas as pd
# Load the BC2GM training split (a CSV with a 'mention' column of entity strings)
data_path = '../biomedical_data_import/bc2gm_train.csv'
df = pd.read_csv(data_path)
# Load the tokenizer (e.g., BERT tokenizer)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# %%
# Count subword tokens per mention; cast to str to guard against NaN entries
df['token_length'] = df['mention'].astype(str).apply(lambda x: len(tokenizer.tokenize(x)))
# Display the dataset with token lengths
print(df)
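
# %%
# Optional sanity check (a minimal sketch, using the 'token_length' column
# computed above): summarize the distribution of subword counts per mention
# to see typical and worst-case tokenization lengths.
print(df['token_length'].describe())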