# domain_mapping/reference_code/dynamic_dataset_generation.py

# why?
# the existing Hugging Face `datasets` library does not offer the flexibility
# to change the training data between epochs
# this code example illustrates how regenerating the dataset (re-sampling and
# re-augmenting) lets us change the training data between epochs
# %%
from torch.utils.data import Dataset, DataLoader
# from datasets import load_from_disk
import os
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import re
import random
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
from functools import partial
# import matplotlib.pyplot as plt
torch.set_float32_matmul_precision('high')
def set_seed(seed):
"""
Set the random seed for reproducibility.
"""
random.seed(seed) # Python random module
np.random.seed(seed) # NumPy random
torch.manual_seed(seed) # PyTorch CPU
torch.cuda.manual_seed(seed) # PyTorch GPU
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
set_seed(42)
# %%
# PARAMETERS
SAMPLES=5
# %%
# import training file
data_path = '../../esAppMod_data_import/train.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# build the label mappings from the actual entity ids rather than a predefined pattern
entity_ids = df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx
# %%
# we want to sample n samples from each class
# sample_size_per_class refers to the number of samples per class
def sample_from_df(df, sample_size_per_class=5):
    sampled_df = (df.groupby("entity_id")[['entity_id', 'mention']]  # explicitly select the columns we need
                  .apply(lambda x: x.sample(n=min(sample_size_per_class, len(x))))
                  .reset_index(drop=True))
    return sampled_df
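# %%
# Illustrative check (not part of the original pipeline): draw a couple of
# mentions per class to see what sample_from_df returns. The sample size of 2
# used here is arbitrary.
print(sample_from_df(df, sample_size_per_class=2).head(10))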
# %%
# augment whole dataset
# for now, we just return the same df
def augment_data(df):
    return df
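# %%
# A hedged sketch of what a future augment_data() implementation might look
# like: randomly perturbing the casing of the 'mention' strings. This is only
# an illustration of where augmentation would slot in, not the project's
# actual augmentation strategy; the function name is hypothetical.
def example_case_augmentation(df):
    df = df.copy()
    df['mention'] = df['mention'].apply(
        lambda s: s.upper() if random.random() < 0.5 else s.lower()
    )
    return df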
# %%
class DynamicDataset(Dataset):
    def __init__(self, df, sample_size_per_class, tokenizer):
        """
        Args:
            df (pd.DataFrame): Original DataFrame with class (id) and data columns.
            sample_size_per_class (int): Number of samples to draw per class for each epoch.
            tokenizer: Hugging Face tokenizer used to encode the 'mention' column.
        """
        self.df = df
        self.sample_size_per_class = sample_size_per_class
        self.tokenizer = tokenizer
        self.current_data = None
        self.regenerate_data()  # Generate the initial dataset
    def regenerate_data(self):
        """
        Generate a new sampled dataset for the current epoch.

        Each call to this method rebuilds self.current_data, so we can:
        - re-sample the dataframe for a new set of n samples per class
        - generate fresh augmentations
        This effectively lets us re-sample and re-augment at the start of each epoch.
        """
        # Sample `sample_size_per_class` rows per class
        sampled_df = sample_from_df(self.df, self.sample_size_per_class)
        # perform future augmentations here
        sampled_df = augment_data(sampled_df)
        # Batch tokenize the entire column of text data at once
        tokenized_batch = self.tokenizer(
            sampled_df["mention"].to_list(),  # Pass all text data at once
            truncation=True,
            # return_tensors="pt" is disabled because pt requires equal-length tensors
        )
        # Store the tokenized data with labels.
        # We convert to torch tensors so that the subsequent 'pad_sequence'
        # and 'stack' operations in the collate function can work.
        self.current_data = [
            {
                "input_ids": torch.tensor(tokenized_batch["input_ids"][i]),
                "attention_mask": torch.tensor(tokenized_batch["attention_mask"][i]),
                # map the raw entity_id to its contiguous class index so the
                # label matches the model's output dimension
                "labels": torch.tensor(label2id[sampled_df.iloc[i]["entity_id"]])
            }
            for i in range(len(sampled_df))
        ]
    def __len__(self):
        return len(self.current_data)

    def __getitem__(self, idx):
        return self.current_data[idx]
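# %%
# How regeneration ties into training: the Trainer-based loop is presumably set
# up later in this file (Trainer and TrainingArguments are imported above). As
# a minimal sketch, assuming the standard HF Trainer, a TrainerCallback like
# the one below could call regenerate_data() at the start of every epoch so
# each epoch sees a freshly sampled/augmented dataset. The callback class name
# here is illustrative, not necessarily the original code.
from transformers import TrainerCallback

class RegenerateDatasetCallback(TrainerCallback):
    def __init__(self, dataset):
        self.dataset = dataset

    def on_epoch_begin(self, args, state, control, **kwargs):
        # re-sample and re-augment before the epoch starts
        self.dataset.regenerate_data()
        return control

# usage (illustrative): Trainer(..., callbacks=[RegenerateDatasetCallback(dynamic_dataset)])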
# %%
# Dynamic dataset
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", clean_up_tokenization_spaces=False)
lean_df = df.drop(columns=['entity_name'])
dynamic_dataset = DynamicDataset(df=lean_df, sample_size_per_class=10, tokenizer=tokenizer)
# %%
# custom tokenization
# %%
# Example usage of dynamic dataset
sample = dynamic_dataset[0]
print(sample)
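# %%
# Illustrative: calling regenerate_data() draws a fresh per-class sample, so
# item 0 will generally differ from the previous cell's output.
dynamic_dataset.regenerate_data()
print(dynamic_dataset[0])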
# %%
def custom_collate_fn(batch):
    # Dynamically pad tensors to the longest sequence in the batch
    input_ids = [item["input_ids"] for item in batch]
    attention_masks = [item["attention_mask"] for item in batch]
    labels = torch.stack([item["labels"] for item in batch])
    # Pad inputs to the same length
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
    attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels
    }
dataloader = DataLoader(
    dynamic_dataset,
    batch_size=32,
    collate_fn=custom_collate_fn
)
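# %%
# Quick sanity check (illustrative): pull one batch and confirm the collate
# function padded everything in the batch to a common length.
batch = next(iter(dataloader))
print(batch["input_ids"].shape, batch["attention_mask"].shape, batch["labels"].shape)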
# %%