# domain_mapping/reference_code/dynamic_dataset_generation.py

# why?
# the existing Hugging Face `datasets` library does not offer the flexibility
# to change the training data between epochs
# this code example illustrates how regenerating the dataset (re-sampling and
# re-augmenting) lets us change the training data between epochs
# %%
from torch.utils.data import Dataset, DataLoader
# from datasets import load_from_disk
import os
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import re
import random
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
from functools import partial
# import matplotlib.pyplot as plt
torch.set_float32_matmul_precision('high')
def set_seed(seed):
"""
Set the random seed for reproducibility.
"""
random.seed(seed) # Python random module
np.random.seed(seed) # NumPy random
torch.manual_seed(seed) # PyTorch CPU
torch.cuda.manual_seed(seed) # PyTorch GPU
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
set_seed(42)
# %%
# PARAMETERS
SAMPLES=5
# %%
# import training file
data_path = '../../esAppMod_data_import/train.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# build the label mappings from the actual entity ids rather than a predefined pattern
entity_ids = df['entity_id'].to_list()
target_id_list = sorted(list(set(entity_ids)))
id2label = {}
label2id = {}
for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx
# %%
# we want to sample n samples from each class
# sample_size_per_class refers to the number of samples per class
def sample_from_df(df, sample_size_per_class=5):
    sampled_df = (df.groupby("entity_id")[['entity_id', 'mention']]  # explicitly select the columns we need
                  .apply(lambda x: x.sample(n=min(sample_size_per_class, len(x))))
                  .reset_index(drop=True))
    return sampled_df
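# %%
# Illustrative check (not part of the original pipeline): draw a couple of
# mentions per class to see what sample_from_df returns. The sample size of 2
# used here is arbitrary.
print(sample_from_df(df, sample_size_per_class=2).head(10))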
# %%
# augment whole dataset
# for now, we just return the same df
def augment_data(df):
    return df
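# %%
# A hedged sketch of what a future augment_data() implementation might look
# like: randomly perturbing the casing of the 'mention' strings. This is only
# an illustration of where augmentation would slot in, not the project's
# actual augmentation strategy; the function name is hypothetical.
def example_case_augmentation(df):
    df = df.copy()
    df['mention'] = df['mention'].apply(
        lambda s: s.upper() if random.random() < 0.5 else s.lower()
    )
    return df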
# %%
class DynamicDataset(Dataset):
    def __init__(self, df, sample_size_per_class, tokenizer):
        """
        Args:
            df (pd.DataFrame): Original DataFrame with class (id) and data columns.
            sample_size_per_class (int): Number of samples to draw per class for each epoch.
            tokenizer: Hugging Face tokenizer used to encode the 'mention' column.
        """
        self.df = df
        self.sample_size_per_class = sample_size_per_class
        self.tokenizer = tokenizer
        self.current_data = None
        self.regenerate_data()  # Generate the initial dataset
    def regenerate_data(self):
        """
        Generate a new sampled dataset for the current epoch.

        Each call to this method rebuilds self.current_data, so we can:
        - re-sample the dataframe for a new set of n samples per class
        - generate fresh augmentations
        This effectively lets us re-sample and re-augment at the start of each epoch.
        """
        # Sample `sample_size_per_class` rows per class
        sampled_df = sample_from_df(self.df, self.sample_size_per_class)
        # perform future augmentations here
        sampled_df = augment_data(sampled_df)
        # Batch tokenize the entire column of text data at once
        tokenized_batch = self.tokenizer(
            sampled_df["mention"].to_list(),  # Pass all text data at once
            truncation=True,
            # return_tensors="pt" is disabled because pt requires equal-length tensors
        )
        # Store the tokenized data with labels.
        # We convert to torch tensors so that the subsequent 'pad_sequence'
        # and 'stack' operations in the collate function can work.
        self.current_data = [
            {
                "input_ids": torch.tensor(tokenized_batch["input_ids"][i]),
                "attention_mask": torch.tensor(tokenized_batch["attention_mask"][i]),
                # map the raw entity_id to its contiguous class index so the
                # label matches the model's output dimension
                "labels": torch.tensor(label2id[sampled_df.iloc[i]["entity_id"]])
            }
            for i in range(len(sampled_df))
        ]
    def __len__(self):
        return len(self.current_data)

    def __getitem__(self, idx):
        return self.current_data[idx]
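# %%
# How regeneration ties into training: the Trainer-based loop is presumably set
# up later in this file (Trainer and TrainingArguments are imported above). As
# a minimal sketch, assuming the standard HF Trainer, a TrainerCallback like
# the one below could call regenerate_data() at the start of every epoch so
# each epoch sees a freshly sampled/augmented dataset. The callback class name
# here is illustrative, not necessarily the original code.
from transformers import TrainerCallback

class RegenerateDatasetCallback(TrainerCallback):
    def __init__(self, dataset):
        self.dataset = dataset

    def on_epoch_begin(self, args, state, control, **kwargs):
        # re-sample and re-augment before the epoch starts
        self.dataset.regenerate_data()
        return control

# usage (illustrative): Trainer(..., callbacks=[RegenerateDatasetCallback(dynamic_dataset)])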
# %%
# Dynamic dataset
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", clean_up_tokenization_spaces=False)
lean_df = df.drop(columns=['entity_name'])
dynamic_dataset = DynamicDataset(df=lean_df, sample_size_per_class=10, tokenizer=tokenizer)
# %%
# custom tokenization
# %%
# Example usage of dynamic dataset
sample = dynamic_dataset[0]
print(sample)
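# %%
# Illustrative: calling regenerate_data() draws a fresh per-class sample, so
# item 0 will generally differ from the previous cell's output.
dynamic_dataset.regenerate_data()
print(dynamic_dataset[0])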
# %%
def custom_collate_fn(batch):
    # Dynamically pad tensors to the longest sequence in the batch
    input_ids = [item["input_ids"] for item in batch]
    attention_masks = [item["attention_mask"] for item in batch]
    labels = torch.stack([item["labels"] for item in batch])
    # Pad inputs to the same length
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
    attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels
    }
dataloader = DataLoader(
    dynamic_dataset,
    batch_size=32,
    collate_fn=custom_collate_fn
)
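# %%
# Quick sanity check (illustrative): pull one batch and confirm the collate
# function padded everything in the batch to a common length.
batch = next(iter(dataloader))
print(batch["input_ids"].shape, batch["attention_mask"].shape, batch["labels"].shape)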
# %%