# NOTE: removed pasted file-listing metadata ("17 lines / 421 B / Python") — copy artifact, not code.
# %%
"""Compute BERT WordPiece token lengths for gene mentions in the BC2GM training set."""

import pandas as pd
from transformers import AutoTokenizer

# Path to the BC2GM (BioCreative II Gene Mention) training data.
data_path = '../biomedical_data_import/bc2gm_train.csv'

# BUG FIX: the original called pd.DataFrame(data_path), which passes the path
# *string* to the DataFrame constructor and raises
# "ValueError: DataFrame constructor not properly called!".
# read_csv actually parses the file into a DataFrame.
df = pd.read_csv(data_path)

# Load the tokenizer (e.g., BERT tokenizer)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# %%
# Calculate token lengths: number of WordPiece sub-tokens per mention string.
# NOTE(review): assumes the CSV has a 'mention' column of non-null strings — verify
# against the export in ../biomedical_data_import/.
df['token_length'] = df['mention'].apply(lambda x: len(tokenizer.tokenize(x)))

# Display the dataset with token lengths
print(df)