Implemented dynamic data re-sampling at each epoch

2025-01-16 19:41:03 +09:00 · 2025-01-16 19:41:03 +09:00 · b6cf2d4416
parent 5312cfa06f
commit b6cf2d4416
80 changed files with 7050 additions and 14 deletions
--- a/analysis/error_analysis_esAppMod.py
+++ b/analysis/error_analysis_esAppMod.py
--- a/analysis_biomedical/data_properties.py
+++ b/analysis_biomedical/data_properties.py
@ -0,0 +1,58 @@
 # %%
 import pandas as pd
 # %%
 #############################
 # How much data
 # Alternative input: '../biomedical_data_import/bc2gm_test.csv'
 data_path = '../biomedical_data_import/bc5cdr-chemical_train.csv'
 df = pd.read_csv(data_path)
 len(df)
 # %%
 # %%
 # bc2gm:
 # train: 288939
 # test: 1034
 # %%
 ################################
 # check for NA values
 df[df['mention'].isna()]
 # %%
 ##############################
 # how many labels?
 data_path = '../biomedical_data_import/bc2gm_test.csv'
 df = pd.read_csv(data_path)
 id_list = sorted(set(df['entity_id'].to_list()))
 # %%
 len(id_list)
 # %%
 # print every entity id that is not an integer
 # (loop variable renamed so the builtin `id` is no longer shadowed)
 for entity_id in id_list:
    if not isinstance(entity_id, int):
        print(entity_id)
 # %%
 # bc2gm:
 # 61641 distinct entity ids - a very large label space
 # %%
 ###############################
 # max length
 # Mentions that pandas parsed as NaN are floats and have no len(); skip them
 # instead of raising TypeError. default=0 keeps the original result when no
 # string mention is present.
 max_length = max(
    (len(mention) for mention in df['mention'] if isinstance(mention, str)),
    default=0,
 )
 print(max_length)
 # %%
--- a/analysis_biomedical/measure_tokenization_length.py
+++ b/analysis_biomedical/measure_tokenization_length.py
@ -0,0 +1,17 @@
 # %%
 from transformers import AutoTokenizer
 import pandas as pd
 data_path = '../biomedical_data_import/bc2gm_train.csv'
 # The CSV has to be parsed with read_csv; passing the path string to the
 # DataFrame constructor raises instead of loading the file.
 df = pd.read_csv(data_path)
 # Load the tokenizer (e.g., BERT tokenizer)
 tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
 # %%
 # Calculate token lengths
 # str(x) guards against mentions that read_csv parsed as NaN (float), which
 # the tokenizer cannot handle.
 df['token_length'] = df['mention'].apply(lambda x: len(tokenizer.tokenize(str(x))))
 # Display the dataset with token lengths
 print(df)
--- a/biomedical_data_import/.gitignore
+++ b/biomedical_data_import/.gitignore
@ -0,0 +1 @@
 *.csv
--- a/biomedical_data_import/original_data_processing.py
+++ b/biomedical_data_import/original_data_processing.py
@ -0,0 +1,36 @@
 # %%
 from collections import defaultdict
 # %%
 data_name = 'bc2gm' # and the other 3 names
 train_path = 'test_dictionary.txt'
 test_path = 'processed_test_refined'
 # %%
 # Build cui -> {lower-cased mention names} from the '||'-separated dictionary file.
 vocab = defaultdict(set)
 with open(f'../biomedical/{data_name}/{train_path}') as dictionary_file:
    for raw_line in dictionary_file:
        fields = raw_line.strip().split('||')
        vocab[fields[0]].add(fields[1].lower())
 # Consecutive integer ids follow the order in which each cui first appeared.
 id_to_cui = dict(enumerate(vocab))
 cui_to_id = {cui: position for position, cui in id_to_cui.items()}
 vocab_entity_id_mentions = {cui_to_id[cui]: names for cui, names in vocab.items()}
 # Flatten into two parallel lists: one mention and its integer id per entry.
 vocab_mentions, vocab_ids = [], []
 for entity_index, names in vocab_entity_id_mentions.items():
    vocab_mentions.extend(names)
    vocab_ids.extend([entity_index] * len(names))
 # %%
 # Read the test concepts: the last '||' field is the cui, the one before it the mention.
 test_mentions, test_cuis = [], []
 with open(f'../biomedical/{data_name}/{test_path}/0.concept') as concept_file:
    for raw_line in concept_file:
        fields = raw_line.strip().split('||')
        test_cuis.append(fields[-1])
        test_mentions.append(fields[-2].lower())
--- a/biomedical_data_import/process_to_df.py
+++ b/biomedical_data_import/process_to_df.py
@ -0,0 +1,134 @@
 # %%
 import pandas as pd
 from tqdm import tqdm
 import multiprocessing
 # %%
 #########################
 # we first process training data
 def process_train_to_csv(data_path, output):
    """Convert an ``entity_id||mention`` dictionary file into a two-column CSV.

    Rows whose mention is missing are removed. Rows whose entity id holds
    several ids joined by ``|`` are removed and re-added at the end of the
    table as one row per id, each carrying the same mention.

    Args:
        data_path: path of the ``||``-separated input file (no header row).
        output: path of the CSV file to write (columns ``entity_id``, ``mention``).
    """
    # Raw string: '\|' is a regex escape, not a valid Python string escape.
    input_df = pd.read_csv(data_path, sep=r'\|\|', engine='python', skipinitialspace=True, header=None)
    input_df = input_df.rename(columns={0: 'entity_id', 1: 'mention'})
    # Collect the labels to drop and drop once after the loop; dropping inside
    # the loop copies the whole frame for every affected row.
    rows_to_drop = []
    new_rows = []
    for idx, row in input_df.iterrows():
        index = row['entity_id']
        mention = row['mention']
        # omit nan values
        if mention == 'NaN' or pd.isna(mention):
            rows_to_drop.append(idx)
            continue
        # Only string ids can contain '|'; numeric ids (the column was parsed
        # as numbers) are kept as they are instead of raising TypeError.
        if isinstance(index, str) and '|' in index:
            rows_to_drop.append(idx)
            new_rows.extend(
                {'entity_id': new_index, 'mention': mention}
                for new_index in index.split('|')
            )
    df = input_df.drop(index=rows_to_drop)
    # Skip the concat when nothing was split, so an empty frame cannot
    # influence the resulting column dtypes.
    if new_rows:
        df = pd.concat([df, pd.DataFrame(new_rows, columns=df.columns)], ignore_index=True)
    df = df.reset_index(drop=True)
    df.to_csv(output, index=False)
 # %%
 # (source dictionary file, output CSV) pairs, one per corpus
 name_list = [
    ('../biomedical/bc2gm/test_dictionary.txt', 'bc2gm_train.csv'),
    ('../biomedical/bc5cdr-chemical/test_dictionary.txt', 'bc5cdr-chemical_train.csv'),
    ('../biomedical/bc5cdr-disease/test_dictionary.txt', 'bc5cdr-disease_train.csv'),
    ('../biomedical/ncbi/test_dictionary.txt', 'ncbi_train.csv'),
 ]
 # Sequential alternative:
 # for data_path, output in name_list:
 #     process_train_to_csv(data_path, output)
 if __name__ == "__main__":
    num_workers = 4  # number of worker processes
    with multiprocessing.Pool(processes=num_workers) as worker_pool:
        # starmap unpacks each (data_path, output) tuple into positional
        # arguments: [(1, 2), (3, 4)] -> [func(1, 2), func(3, 4)]
        worker_pool.starmap(process_train_to_csv, name_list)
 # %%
 #################################################
 # process test data
 def is_int_string(s):
    """Return True when ``int(s)`` succeeds, False otherwise.

    Values that ``int`` rejects by type (for example ``None``) are reported
    as False instead of letting the TypeError propagate.
    """
    try:
        int(s)
    except (ValueError, TypeError):
        return False
    return True
 def process_test_to_csv(data_path, output):
    """Convert a ``||``-separated test concept file into a ``mention, entity_id`` CSV.

    The first three fields of every line are discarded; field 3 becomes
    ``mention`` and field 4 becomes ``entity_id``. Rows whose entity id holds
    several ids joined by ``|`` are removed and re-added at the end of the
    table as one row per id, each carrying the same mention.

    Args:
        data_path: path of the ``||``-separated input file (no header row).
        output: path of the CSV file to write.
    """
    # Raw string: '\|' is a regex escape, not a valid Python string escape.
    input_df = pd.read_csv(data_path, sep=r'\|\|', engine='python', skipinitialspace=True, header=None)
    input_df = input_df.drop(columns=[0, 1, 2])
    input_df = input_df.rename(columns={3: 'mention', 4: 'entity_id'})
    # Collect the labels to drop and drop once after the loop; dropping inside
    # the loop copies the whole frame for every affected row.
    rows_to_drop = []
    new_rows = []
    for idx, row in input_df.iterrows():
        index = row['entity_id']
        # Only string ids can contain '|'; numeric or missing ids are kept as
        # they are instead of raising TypeError.
        if isinstance(index, str) and '|' in index:
            mention = row['mention']
            rows_to_drop.append(idx)
            new_rows.extend(
                {'entity_id': new_index, 'mention': mention}
                for new_index in index.split('|')
            )
    df = input_df.drop(index=rows_to_drop)
    # Skip the concat when nothing was split, so an empty frame cannot
    # influence the resulting column dtypes.
    if new_rows:
        df = pd.concat([df, pd.DataFrame(new_rows, columns=df.columns)], ignore_index=True)
    df = df.reset_index(drop=True)
    # NOTE(review): the original ended with a bare `df['entity_id'].isna()`
    # whose result was discarded; if rows with a missing entity id should be
    # removed, add an explicit dropna(subset=['entity_id']) here.
    df.to_csv(output, index=False)
 # %%
 # (source concept file, output CSV) pairs, one per corpus
 name_list = [
    ('../biomedical/bc2gm/processed_test_refined/0.concept', 'bc2gm_test.csv'),
    ('../biomedical/bc5cdr-chemical/processed_test_refined/0.concept', 'bc5cdr-chemical_test.csv'),
    ('../biomedical/bc5cdr-disease/processed_test_refined/0.concept', 'bc5cdr-disease_test.csv'),
    ('../biomedical/ncbi/processed_test_refined/0.concept', 'ncbi_test.csv'),
 ]
 # Sequential alternative:
 # for data_path, output in name_list:
 #     process_test_to_csv(data_path, output)
 if __name__ == "__main__":
    num_workers = 4  # number of worker processes
    with multiprocessing.Pool(processes=num_workers) as worker_pool:
        # starmap unpacks each (data_path, output) tuple into positional
        # arguments: [(1, 2), (3, 4)] -> [func(1, 2), func(3, 4)]
        worker_pool.starmap(process_test_to_csv, name_list)
 # %%
 # %%
--- a/biomedical_train/bc2gm/augmentation/.gitignore
+++ b/biomedical_train/bc2gm/augmentation/.gitignore
--- a/biomedical_train/bc2gm/augmentation/dynamic_train.py
+++ b/biomedical_train/bc2gm/augmentation/dynamic_train.py
@ -0,0 +1,388 @@
 # %%
 from torch.utils.data import Dataset, DataLoader
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments,
    TrainerCallback
 )
 import evaluate
 import numpy as np
 import pandas as pd
 import math
 from functools import partial
 import warnings
 warnings.filterwarnings("ignore", message='Was asked to gather along dimension 0')
 warnings.filterwarnings("ignore", message='FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated.')
 # import matplotlib.pyplot as plt
 torch.set_float32_matmul_precision('high')
 def set_seed(seed):
    """
    Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators and
    put cuDNN into deterministic mode.
    """
    # cuDNN: deterministic kernels, autotuner off
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    seeders = (
        random.seed,                 # Python random module
        np.random.seed,              # NumPy
        torch.manual_seed,           # PyTorch CPU
        torch.cuda.manual_seed,      # PyTorch current GPU
        torch.cuda.manual_seed_all,  # PyTorch all GPUs
    )
    for apply_seed in seeders:
        apply_seed(seed)
 set_seed(42)
 # %%
 # PARAMETERS
 SAMPLES=20
 SHUFFLES=5
 AMPLIFY_FACTOR=5
 # %%
 ###################################################
 # import code
 # import training file
 data_path = '../../esAppMod_data_import/train.csv'
 df = pd.read_csv(data_path, skipinitialspace=True)
 # The label space is taken from the entity ids that actually occur in the data.
 entity_ids = df['entity_id'].to_list()
 target_id_list = sorted(set(entity_ids))
 # consecutive class index <-> original entity id
 id2label = dict(enumerate(target_id_list))
 label2id = {label: position for position, label in id2label.items()}
 df["training_id"] = df["entity_id"].map(label2id)
 # %%
 ##############################################################
 # augmentation code
 # basic preprocessing
 def preprocess_text(text):
    """Lower-case ``text`` and collapse every whitespace run into one space."""
    lowered = text.lower()
    # standardize spacing, then trim both ends
    return re.sub(r'\s+', ' ', lowered).strip()
 def generate_random_shuffles(text, n):
    """Return ``n`` strings, each a random reordering of the words of ``text``."""
    tokens = text.split()
    variants = []
    for _ in range(n):
        permuted = list(tokens)  # fresh copy so `tokens` keeps its order
        random.shuffle(permuted)
        variants.append(" ".join(permuted))
    return variants
 def shuffle_text(text, n_shuffles=SHUFFLES):
    """Return ``text`` followed by ``n_shuffles`` random word-order variants of it."""
    # the original text always comes first
    return [text, *generate_random_shuffles(text, n_shuffles)]
 def corrupt_word(word):
    """Return ``word`` with one random character deleted or one adjacent pair swapped.

    Words of length 0 or 1 are returned unchanged.
    """
    size = len(word)
    if size <= 1:
        return word
    action = random.choice(["delete", "swap"])
    if action == "delete":
        # drop the character at a random position
        cut = random.randint(0, size - 1)
        return word[:cut] + word[cut + 1:]
    # "swap": exchange the characters at positions left and left + 1
    left = random.randint(0, size - 2)
    return word[:left] + word[left + 1] + word[left] + word[left + 2:]
 def corrupt_string(sentence, corruption_probability=0.01):
    """Pass each word through ``corrupt_word`` with probability ``corruption_probability``.

    The words are re-joined with single spaces.
    """
    rebuilt = []
    for token in sentence.split():
        # one random draw per word decides whether it gets corrupted
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        rebuilt.append(token)
    return " ".join(rebuilt)
 # %%
 def create_example(index, mention):
    """Bundle a class index and a mention string into one training record."""
    return dict(training_id=index, mention=mention)
 # augment whole dataset
 def augment_data(df):
    """Expand every row of ``df`` into the original mention plus augmented variants.

    For each row the output contains the preprocessed mention, its shuffled
    word orders, one randomly corrupted copy and one punctuation-stripped
    copy (each variant only when it differs from the preprocessed mention),
    and, for mentions of at most two words, ``AMPLIFY_FACTOR`` extra copies
    of the preprocessed mention.

    Args:
        df: DataFrame with ``training_id`` and ``mention`` columns.

    Returns:
        DataFrame with ``training_id`` and ``mention`` columns.
    """
    output_list = []
    for _, row in df.iterrows():
        index = row['training_id']
        parent_desc = preprocess_text(row['mention'])
        # basic example
        output_list.append(create_example(index, parent_desc))
        # shuffled strings
        for desc in shuffle_text(parent_desc, n_shuffles=SHUFFLES):
            if desc != parent_desc:
                output_list.append(create_example(index, desc))
        # corrupted string
        corrupted = corrupt_string(parent_desc, corruption_probability=0.1)
        if corrupted != parent_desc:
            output_list.append(create_example(index, corrupted))
        # punctuation replaced by spaces (keeps word characters and whitespace)
        stripped = re.sub(r'[^\w\s]', ' ', parent_desc)
        if stripped != parent_desc:
            output_list.append(create_example(index, stripped))
        # short sequence amplifier
        # Short sequences are rare and are usually not changed by shuffling,
        # so extra copies are added. The copies use the preprocessed mention
        # itself; the previous code reused the leftover punctuation-stripped
        # variable here.
        if len(parent_desc.split()) <= 2:
            output_list.extend(
                create_example(index, parent_desc) for _ in range(AMPLIFY_FACTOR)
            )
    # Explicit columns keep the schema stable even when df has no rows.
    return pd.DataFrame(output_list, columns=['training_id', 'mention'])
 ###############################################################
 # regeneration code
 # %%
 # we want to sample n samples from each class
 # sample_size refers to the number of samples per class
 def sample_from_df(df, sample_size_per_class=5):
    """Draw up to ``sample_size_per_class`` random rows from every ``training_id`` group.

    Groups smaller than the requested size contribute all of their rows.
    The result keeps only the ``training_id`` and ``mention`` columns and
    has a fresh 0..n-1 index.
    """
    def _draw(group):
        # never ask for more rows than the group holds
        return group.sample(n=min(sample_size_per_class, len(group)))

    per_class = df.groupby("training_id")[['training_id', 'mention']]  # explicit column names
    sampled_df = per_class.apply(_draw)
    return sampled_df.reset_index(drop=True)
 # %%
 class DynamicDataset(Dataset):
    """Dataset whose contents are re-sampled, re-augmented and re-tokenized on demand."""

    def __init__(self, df, sample_size_per_class, tokenizer):
        """
        Args:
            df (pd.DataFrame): Original DataFrame with ``training_id`` and ``mention`` columns.
            sample_size_per_class (int): Number of samples to draw per class for each regeneration.
            tokenizer: Callable applied to the list of mentions; its result must
                provide ``input_ids`` and ``attention_mask`` lists.
        """
        self.df = df
        self.sample_size_per_class = sample_size_per_class
        self.tokenizer = tokenizer
        self.current_data = None
        self.regenerate_data()  # Generate the initial dataset

    def regenerate_data(self):
        """
        Rebuild ``current_data`` from a fresh per-class sample of ``self.df``.

        Each call re-samples the DataFrame, re-applies the random
        augmentations and tokenizes the result, so calling it at the start
        of an epoch gives that epoch a new training set.
        """
        # Sample `sample_size_per_class` rows per class, then augment
        sampled_df = sample_from_df(self.df, self.sample_size_per_class)
        sampled_df = augment_data(sampled_df)
        # Batch tokenize the entire column of data; no return_tensors="pt"
        # because that would require sequences of equal length
        tokenized_batch = self.tokenizer(
            sampled_df["mention"].to_list(),
            truncation=True,
        )
        # Pull the label column out once instead of materializing a row
        # object per sample with .iloc[i]
        labels = sampled_df["training_id"].to_list()
        self.current_data = [
            {
                "input_ids": torch.tensor(ids),
                "attention_mask": torch.tensor(mask),
                "labels": torch.tensor(label),
            }
            for ids, mask, label in zip(
                tokenized_batch["input_ids"],
                tokenized_batch["attention_mask"],
                labels,
            )
        ]

    def __len__(self):
        """Number of examples in the currently generated data."""
        return len(self.current_data)

    def __getitem__(self, idx):
        """Return the tokenized example stored at position ``idx``."""
        return self.current_data[idx]
 # %%
 class RegenerateDatasetCallback(TrainerCallback):
    """Trainer callback that rebuilds the wrapped dataset at the start of every epoch."""

    def __init__(self, dataset):
        # any object exposing regenerate_data()
        self.dataset = dataset

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Announce the upcoming epoch and regenerate the dataset."""
        # NOTE(review): the regenerated dataset may differ in length from the
        # previous one - confirm the Trainer's dataloader tolerates that.
        message = f"Epoch {int(math.ceil(state.epoch + 1))}: Regenerating dataset"
        print(message)
        self.dataset.regenerate_data()
 # %%
 def custom_collate_fn(batch):
    """Collate variable-length examples into one batch of padded tensors.

    ``input_ids`` and ``attention_mask`` are padded to the longest sequence
    in the batch using ``pad_sequence``'s default padding value; ``labels``
    are stacked into a single tensor.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    stacked_labels = torch.stack([sample["labels"] for sample in batch])
    return {
        "input_ids": pad([sample["input_ids"] for sample in batch], batch_first=True),
        "attention_mask": pad([sample["attention_mask"] for sample in batch], batch_first=True),
        "labels": stacked_labels,
    }
 ##########################################################################
 # training code
 # %%
 def train():
    """Fine-tune a DistilBERT sequence classifier on the dynamically regenerated dataset.

    Reads the module-level ``df``, ``target_id_list``, ``id2label`` and
    ``label2id``; builds a ``DynamicDataset`` plus the callback that
    regenerates it at every epoch start, and runs ``Trainer.train()``.
    Checkpoints are written under ``checkpoint``.
    """
    save_path = f'checkpoint'
    # prepare tokenizer
    model_checkpoint = "distilbert/distilbert-base-uncased"
    # model_checkpoint = 'google-bert/bert-base-cased'
    # model_checkpoint = 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, clean_up_tokenization_spaces=True)
    # make the dataset
    # drop the 'entity_name' column (the module-level df must contain it)
    lean_df = df.drop(columns=['entity_name'])
    dynamic_dataset = DynamicDataset(df = lean_df, sample_size_per_class=10, tokenizer=tokenizer)
    # create the regeneration callback
    regeneration_callback = RegenerateDatasetCallback(dynamic_dataset)
    # compute metrics
    # NOTE(review): eval_strategy is "no" below and no eval dataset is passed,
    # so this metric is presumably never invoked during training - confirm.
    metric = evaluate.load("accuracy")
    def compute_metrics(eval_preds):
        # preds are per-class scores; argmax over axis 1 gives the class index
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)
    # %%
    # one output label per distinct entity id
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    model.resize_token_embeddings(len(tokenizer))
    # model = torch.compile(model, backend="inductor", dynamic=True)
    # %%
    # Trainer
    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        save_strategy="steps",
        save_steps=500,
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        # per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=120,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    # custom_collate_fn pads each batch; the regeneration callback swaps in
    # freshly sampled data at every epoch start
    trainer = Trainer(
        model,
        training_args,
        train_dataset=dynamic_dataset,
        tokenizer=tokenizer,
        data_collator=custom_collate_fn,
        compute_metrics=compute_metrics,
        callbacks=[regeneration_callback]
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)
    trainer.train()
 # execute training
 train()
 # %%
--- a/biomedical_train/bc2gm/augmentation/prediction/.gitignore
+++ b/biomedical_train/bc2gm/augmentation/prediction/.gitignore
--- a/biomedical_train/bc2gm/augmentation/prediction/output.txt
+++ b/biomedical_train/bc2gm/augmentation/prediction/output.txt
@ -1,6 +1,6 @@
 *******************************************************************************
-Accuracy: 0.80197
+Accuracy: 0.80655
-F1 Score: 0.81948
+F1 Score: 0.82821
-Precision: 0.88067
+Precision: 0.87847
-Recall: 0.80197
+Recall: 0.80655
--- a/biomedical_train/bc2gm/augmentation/prediction/predict.py
+++ b/biomedical_train/bc2gm/augmentation/prediction/predict.py
@ -0,0 +1,236 @@
 # %%
 # from datasets import load_from_disk
 import os
 import glob
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 32
 # %%
 # construct the target id list
 # The label space comes from the training CSV so the class indices agree
 # with the label maps built by the training script.
 data_path = '../../../biomedical_data_import/bc2gm_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 target_id_list = sorted(set(entity_ids))
 # %%
 # consecutive class index <-> original entity id
 id2label = dict(enumerate(target_id_list))
 label2id = {label: position for position, label in id2label.items()}
 # introduce pre-processing functions
 def preprocess_text(text):
    """Lower-case ``text`` and collapse every whitespace run into one space."""
    lowered = text.lower()
    # (disabled) substitute digits with '#':
    # lowered = re.sub(r'\d+', '#', lowered)
    # standardize spacing, then trim both ends
    return re.sub(r'\s+', ' ', lowered).strip()
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """Turn every row of ``df`` into a ``{'text', 'labels'}`` record.

    ``text`` is the preprocessed ``mention``; ``labels`` is the class index
    that ``label2id`` assigns to the row's ``entity_id`` (a KeyError is
    raised for ids missing from ``label2id``).
    """
    return [
        {
            'text': preprocess_text(row['mention']),
            'labels': label2id[row['entity_id']],  # ensure labels starts from 0
        }
        for _, row in df.iterrows()
    ]
 def create_dataset():
    """Load the bc2gm test CSV and wrap it in a DatasetDict with a single 'test' split."""
    test_df = pd.read_csv('../../../biomedical_data_import/bc2gm_test.csv', skipinitialspace=True)
    records = process_df_to_dict(test_df)
    return DatasetDict({'test': Dataset.from_list(records)})
 # %%
 def test():
    """Run the saved checkpoint over the bc2gm test split and record the results.

    Appends accuracy and weighted F1 / precision / recall to ``output.txt``
    and writes the predicted entity ids to ``exports/result.csv``.

    Raises:
        FileNotFoundError: when no ``checkpoint-*`` directory exists under
            ``../checkpoint``.
    """
    test_dataset = create_dataset()
    # Locate the checkpoint written by training.
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    checkpoint_directory = f'../checkpoint'
    pattern = 'checkpoint-*'
    checkpoint_candidates = glob.glob(os.path.join(checkpoint_directory, pattern))
    if not checkpoint_candidates:
        # explicit error instead of a bare IndexError on [0]
        raise FileNotFoundError(
            f"no '{pattern}' directory found under '{checkpoint_directory}'"
        )
    model_checkpoint = checkpoint_candidates[0]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # %%
    # given a batch of dataset entries, run the text through the tokenizer
    def preprocess_function(example):
        return tokenizer(
            example['text'],
            truncation=True,
        )
    # map applies the function to batches of rows of each split
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    # print datasets['test'] columns
    for column, dtype in datasets['test'].features.items():
        print(f"Column: {column}, Type: {dtype}")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()
    device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    pred_labels = []
    actual_labels = []
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    dataloader = DataLoader(
        datasets['test'],
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=data_collator)
    for batch in tqdm(dataloader):
        # Keep the true labels as plain ints, matching pred_labels.
        actual_labels.extend(batch['labels'].tolist())
        # Move to GPU if available
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids,
                attention_mask).logits
        pred_labels.extend(logits.argmax(dim=1).tolist())
    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    y_true = actual_labels
    y_pred = pred_labels
    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    with open("output.txt", "a") as f:
        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)
    # export result: map predicted class indices back to entity ids
    label_list = [id2label[class_index] for class_index in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })
    # to_csv does not create missing directories
    os.makedirs("exports", exist_ok=True)
    df.to_csv(f"exports/result.csv", index=False)
 # %%
 # reset file before writing to it
 with open("output.txt", "w") as f:
    print('', file=f)
 # Run the evaluation only after the reset handle is closed. While it stayed
 # open, its buffered newline was flushed at offset 0 after test() had already
 # appended the report, overwriting the report's first character.
 test()
--- a/biomedical_train/bc2gm/augmentation/train.py
+++ b/biomedical_train/bc2gm/augmentation/train.py
@ -0,0 +1,367 @@
 # %%
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 torch.set_float32_matmul_precision('high')
 # %%
 def set_seed(seed):
    """
    Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators and
    put cuDNN into deterministic mode.
    """
    # cuDNN: deterministic kernels, autotuner off
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    seeders = (
        random.seed,                 # Python random module
        np.random.seed,              # NumPy
        torch.manual_seed,           # PyTorch CPU
        torch.cuda.manual_seed,      # PyTorch current GPU
        torch.cuda.manual_seed_all,  # PyTorch all GPUs
    )
    for apply_seed in seeders:
        apply_seed(seed)
 set_seed(42)
 SHUFFLES=0  # 0 shuffles means it does not re-sample
 # %%
 # We want to map the entity_id to a consecutive set of id's
 # import training file
 data_path = '../../../biomedical_data_import/bc2gm_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 # The label space is taken from the entity ids that actually occur in the data.
 entity_ids = train_df['entity_id'].to_list()
 target_id_list = sorted(set(entity_ids))
 # %%
 # consecutive class index <-> original entity id
 id2label = dict(enumerate(target_id_list))
 label2id = {label: position for position, label in id2label.items()}
 # %%
 # introduce pre-processing functions
 def preprocess_text(text):
    """Lower-case ``text`` and collapse every whitespace run into one space."""
    lowered = text.lower()
    # (disabled) substitute digits with '#':
    # lowered = re.sub(r'\d+', '#', lowered)
    # standardize spacing, then trim both ends
    return re.sub(r'\s+', ' ', lowered).strip()
 def generate_random_shuffles(text, n):
    """
    Produce n random word-order permutations of the input text.
    Args:
        text (str): Text whose whitespace-separated words are shuffled.
        n (int): How many shuffled variants to produce.
    Returns:
        list: n strings, each holding the same words in a random order.
    """
    tokens = text.split()
    variants = []
    for _ in range(n):
        permuted = list(tokens)  # fresh copy so `tokens` keeps its order
        random.shuffle(permuted)
        variants.append(" ".join(permuted))
    return variants
 # generate n more shuffled examples
 def shuffle_text(text, n_shuffles=SHUFFLES):
    """
    Return the input string followed by random word-order variants of it.
    Args:
        text (str): The input string.
        n_shuffles (int): Number of random shuffles to generate.
    Returns:
        list: The original string, then n_shuffles shuffled strings.
    """
    # the original text always comes first
    return [text, *generate_random_shuffles(text, n_shuffles)]
 ######################################
 # augmentation by text corruption
 def corrupt_word(word):
    """Return ``word`` with one random character deleted or one adjacent pair swapped.

    Words of length 0 or 1 are returned unchanged.
    """
    size = len(word)
    if size <= 1:
        return word
    action = random.choice(["delete", "swap"])
    if action == "delete":
        # drop the character at a random position
        cut = random.randint(0, size - 1)
        return word[:cut] + word[cut + 1:]
    # "swap": exchange the characters at positions left and left + 1
    left = random.randint(0, size - 2)
    return word[:left] + word[left + 1] + word[left] + word[left + 2:]
 def corrupt_string(sentence, corruption_probability=0.01):
    """Pass each word through ``corrupt_word`` with probability ``corruption_probability``.

    The words are re-joined with single spaces.
    """
    rebuilt = []
    for token in sentence.split():
        # one random draw per word decides whether it gets corrupted
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        rebuilt.append(token)
    return " ".join(rebuilt)
 #############################################################
 # Data Run code here
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """Turn every row of ``df`` into ``{'text', 'label'}`` training records.

    Each row yields its preprocessed ``mention`` and, in addition, every
    shuffled variant that differs from it (``SHUFFLES`` variants are
    generated per row). ``label`` is the class index that ``label2id``
    assigns to the row's ``entity_id``.

    Disabled augmentations (previously present as commented-out code):
    oversampling of mentions shorter than three words, random character
    corruption, and replacing non-alphanumeric characters with spaces.
    """
    output_list = []
    for _, row in df.iterrows():
        index = row['entity_id']
        parent_desc = row['mention']
        if isinstance(parent_desc, float):
            # float mentions (e.g. NaN from the CSV parser) are reported
            # and then used in their string form
            print(parent_desc)
            parent_desc = f'{parent_desc}'
        parent_desc = preprocess_text(parent_desc)
        label = label2id[index]  # ensure labels starts from 0
        # unaugmented data
        output_list.append({'text': parent_desc, 'label': label})
        # shuffled strings that differ from the original
        for desc in shuffle_text(parent_desc, n_shuffles=SHUFFLES):
            if desc != parent_desc:
                output_list.append({'text': desc, 'label': label})
    return output_list
 def create_dataset():
    """Read the bc2gm training CSV and return a DatasetDict holding one 'train' split."""
    # training split
    train_csv = '../../../biomedical_data_import/bc2gm_train.csv'
    records = process_df_to_dict(pd.read_csv(train_csv, skipinitialspace=True))
    return DatasetDict({'train': Dataset.from_list(records)})
 # %%
 #########################################
 # training function
 def train():
    """
    Fine-tune DistilBERT as a sequence classifier over the entity-id classes.
    Builds the dataset, tokenizes it, and runs the Hugging Face Trainer with
    evaluation switched off; checkpoints are written under 'checkpoint'.
    """
    output_dir = 'checkpoint'
    raw_datasets = create_dataset()
    # base model / tokenizer
    # alternatives tried: 'google-bert/bert-base-cased', 'prajjwal1/bert-small'
    model_checkpoint = "distilbert/distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        return_tensors="pt",
        clean_up_tokenization_spaces=True,
    )
    def preprocess_function(example):
        # tokenize a batch of texts; truncation only, padding is left to the collator
        return tokenizer(example['text'], truncation=True)
    # batched map over each split; the raw strings are dropped afterwards
    tokenized_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    # pads every batch to its own longest sequence
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    accuracy = evaluate.load("accuracy")
    def compute_metrics(eval_preds):
        logits, references = eval_preds
        return accuracy.compute(predictions=np.argmax(logits, axis=1), references=references)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id,
    )
    # keep the embedding matrix in sync with the tokenizer vocabulary size
    model.resize_token_embeddings(len(tokenizer))
    # model = torch.compile(model, backend="inductor", dynamic=True)
    trainer_settings = dict(
        output_dir=output_dir,
        eval_strategy="no",  # switch to "epoch" to evaluate during training
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-3,
        per_device_train_batch_size=512,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    training_args = TrainingArguments(**trainer_settings)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # to resume instead of starting fresh:
    # trainer.train(resume_from_checkpoint='default_40_1/checkpoint-5600')
    trainer.train()
 # execute training
 train()
 # %%
--- a/biomedical_train/bc2gm/simple/.gitignore
+++ b/biomedical_train/bc2gm/simple/.gitignore
--- a/biomedical_train/bc2gm/simple/dynamic_train.py
+++ b/biomedical_train/bc2gm/simple/dynamic_train.py
@ -0,0 +1,280 @@
 # %%
 from torch.utils.data import Dataset, DataLoader
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments,
    TrainerCallback
 )
 import evaluate
 import numpy as np
 import pandas as pd
 from functools import partial
 import warnings
 warnings.filterwarnings("ignore", message='Was asked to gather along dimension 0')
 warnings.filterwarnings("ignore", message='FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated.')
 # import matplotlib.pyplot as plt
 torch.set_float32_matmul_precision('high')
 def set_seed(seed):
    """
    Seed every random number generator this script relies on.
    Covers Python's `random`, NumPy and PyTorch (CPU and CUDA), and puts
    cuDNN into deterministic mode with benchmark autotuning turned off.
    """
    # CPU-side generators: Python, NumPy, PyTorch
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # CUDA generators: current device, then every device
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # trade cuDNN speed for run-to-run reproducibility
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
 set_seed(42)
 # %%
 # PARAMETERS
 # rows drawn per class each time the dataset is (re)generated
 SAMPLES = 20
 # %%
 ###################################################
 # load the training file
 data_path = '../../../biomedical_data_import/bc2gm_train.csv'
 df = pd.read_csv(data_path, skipinitialspace=True)
 # the label space is built from the entity ids actually present in the data
 entity_ids = df['entity_id'].to_list()
 target_id_list = sorted(set(entity_ids))
 # consecutive class index <-> original entity id
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: class_idx for class_idx, entity in id2label.items()}
 # class index used as the training label
 df["training_id"] = df["entity_id"].map(label2id)
 ###############################################################
 # regeneration code
 # %%
 # we want to sample n samples from each class
 # sample_size refers to the number of samples per class
 def sample_from_df(df, sample_size_per_class=5):
    """
    Randomly draw up to `sample_size_per_class` rows from every class.
    Classes are the distinct values of 'training_id'; a class with fewer
    rows than requested contributes all of its rows. Only the 'training_id'
    and 'mention' columns are selected, and the result gets a fresh index.
    """
    def _draw(group):
        # never ask for more rows than the class holds
        return group.sample(n=min(sample_size_per_class, len(group)))
    # columns are named explicitly, grouping column included
    per_class = df.groupby("training_id")[['training_id', 'mention']]
    sampled_df = per_class.apply(_draw)
    return sampled_df.reset_index(drop=True)
 # %%
 # augment whole dataset
 # for now, we just return the same df
 def augment_data(df):
    """Augmentation hook for the sampled dataframe; currently a pass-through that returns `df` itself."""
    augmented = df
    return augmented
 # %%
 class DynamicDataset(Dataset):
    """
    Map-style dataset whose contents can be re-drawn during training.
    Each call to `regenerate_data` samples rows per class from the source
    dataframe, runs the augmentation hook, tokenizes the mentions and
    replaces `current_data` with the new list of examples.
    """
    def __init__(self, df, sample_size_per_class, tokenizer):
        """
        Args:
            df (pd.DataFrame): Source data; `regenerate_data` reads its
                'mention' and 'training_id' columns.
            sample_size_per_class (int): Number of samples to draw per class
                on every regeneration.
            tokenizer: Callable tokenizer returning 'input_ids' and
                'attention_mask' lists for a list of strings.
        """
        self.df = df
        self.sample_size_per_class = sample_size_per_class
        self.tokenizer = tokenizer
        self.current_data = None
        self.regenerate_data()  # build the initial dataset
    def regenerate_data(self):
        """
        Re-sample, re-augment and re-tokenize, replacing `current_data`.
        Intended to be called between epochs so that the model sees a fresh
        draw (and fresh augmentations) of the training data.
        """
        # sample `sample_size_per_class` rows per class
        sampled_df = sample_from_df(self.df, self.sample_size_per_class)
        # augmentation hook
        sampled_df = augment_data(sampled_df)
        # The tokenizer only accepts strings. Cells that are not strings
        # (e.g. a float NaN) are stringified, like train.py does with f'{...}'.
        texts = sampled_df["mention"].astype(str).to_list()
        # pull the label column once instead of building a row Series per example
        labels = sampled_df["training_id"].to_list()
        # batch tokenize all mentions at once; no return_tensors because the
        # sequences have different lengths until the collator pads them
        tokenized_batch = self.tokenizer(
            texts,
            truncation=True,
        )
        self.current_data = [
            {
                "input_ids": torch.tensor(input_ids),
                "attention_mask": torch.tensor(attention_mask),
                "labels": torch.tensor(label),
            }
            for input_ids, attention_mask, label in zip(
                tokenized_batch["input_ids"],
                tokenized_batch["attention_mask"],
                labels,
            )
        ]
    def __len__(self):
        """Number of examples in the current draw."""
        return len(self.current_data)
    def __getitem__(self, idx):
        """Return the tokenized example at position `idx` of the current draw."""
        return self.current_data[idx]
 # %%
 class RegenerateDatasetCallback(TrainerCallback):
    """Trainer callback that asks a dataset to regenerate itself every few epochs."""
    def __init__(self, dataset, every_n_epochs=2):
        """
        Args:
            dataset: The dataset instance that supports regeneration
                (must expose `regenerate_data()`).
            every_n_epochs (int): Number of epochs to wait before regenerating the dataset.
        Raises:
            ValueError: If `every_n_epochs` is smaller than 1.
        """
        if every_n_epochs < 1:
            raise ValueError("every_n_epochs must be a positive integer")
        self.dataset = dataset
        self.every_n_epochs = every_n_epochs
    def on_epoch_begin(self, args, state, control, **kwargs):
        """Regenerate the dataset when the epoch about to start is a multiple of `every_n_epochs`."""
        # state.epoch is a float progress counter and may be unset. Reduce it
        # to the count of completed epochs first: an exact modulo on the raw
        # float would silently never match if it carried rounding error or a
        # fractional part (e.g. after resuming mid-epoch).
        completed_epochs = 0 if state.epoch is None else int(state.epoch + 1e-6)
        upcoming_epoch = completed_epochs + 1
        if upcoming_epoch % self.every_n_epochs == 0:
            print(f"Epoch {upcoming_epoch}: Regenerating dataset...")
            self.dataset.regenerate_data()
 # %%
 def custom_collate_fn(batch):
    """
    Collate tokenized examples into one batch of tensors.
    'input_ids' and 'attention_mask' are right-padded to the longest
    sequence in the batch (pad_sequence's default padding value), and the
    per-example 'labels' tensors are stacked into a single tensor.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    id_seqs, mask_seqs, targets = [], [], []
    for sample in batch:
        id_seqs.append(sample["input_ids"])
        mask_seqs.append(sample["attention_mask"])
        targets.append(sample["labels"])
    stacked_labels = torch.stack(targets)
    return {
        "input_ids": pad(id_seqs, batch_first=True),
        "attention_mask": pad(mask_seqs, batch_first=True),
        "labels": stacked_labels,
    }
 ##########################################################################
 # training code
 # %%
 def train():
    """
    Fine-tune DistilBERT on a dataset that is re-sampled during training.
    Wraps the module-level dataframe in a DynamicDataset, registers a
    callback that regenerates it every 2 epochs, and runs the Hugging Face
    Trainer with evaluation switched off; checkpoints go under 'checkpoint'.
    """
    output_dir = 'checkpoint'
    # base model / tokenizer
    # alternatives tried: 'google-bert/bert-base-cased', 'prajjwal1/bert-small'
    model_checkpoint = "distilbert/distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        clean_up_tokenization_spaces=True,
    )
    # dataset that can be re-drawn, and the callback that triggers the re-draw
    # lean_df = df.drop(columns=['entity_name'])
    dynamic_dataset = DynamicDataset(
        df=df,
        sample_size_per_class=SAMPLES,
        tokenizer=tokenizer,
    )
    regeneration_callback = RegenerateDatasetCallback(dynamic_dataset, every_n_epochs=2)
    accuracy = evaluate.load("accuracy")
    def compute_metrics(eval_preds):
        logits, references = eval_preds
        return accuracy.compute(predictions=np.argmax(logits, axis=1), references=references)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id,
    )
    model.resize_token_embeddings(len(tokenizer))
    # model = torch.compile(model, backend="inductor", dynamic=True)
    trainer_settings = dict(
        output_dir=output_dir,
        eval_strategy="no",  # switch to "epoch" to evaluate during training
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-4,
        per_device_train_batch_size=256,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=200,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    training_args = TrainingArguments(**trainer_settings)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dynamic_dataset,
        tokenizer=tokenizer,
        data_collator=custom_collate_fn,
        compute_metrics=compute_metrics,
        callbacks=[regeneration_callback],
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # to resume instead of starting fresh:
    # trainer.train(resume_from_checkpoint='default_40_1/checkpoint-5600')
    trainer.train()
 # execute training
 train()
 # %%
--- a/biomedical_train/bc2gm/simple/prediction/.gitignore
+++ b/biomedical_train/bc2gm/simple/prediction/.gitignore
--- a/biomedical_train/bc2gm/simple/prediction/output.txt
+++ b/biomedical_train/bc2gm/simple/prediction/output.txt
@ -0,0 +1,6 @@
 *******************************************************************************
 Accuracy: 0.15093
 F1 Score: 0.14063
 Precision: 0.15594
 Recall: 0.15093
--- a/biomedical_train/bc2gm/simple/prediction/predict.py
+++ b/biomedical_train/bc2gm/simple/prediction/predict.py
@ -0,0 +1,246 @@
 # %%
 # from datasets import load_from_disk
 import os
 import glob
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 # number of test examples per inference batch
 BATCH_SIZE = 32
 # %%
 # rebuild the label space from the training file so class indices match training
 data_path = '../../../../biomedical_data_import/bc2gm_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 target_id_list = sorted(set(entity_ids))
 # %%
 # consecutive class index <-> original entity id
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: class_idx for class_idx, entity in id2label.items()}
 # introduce pre-processing functions
 def preprocess_text(text):
    """Normalize a mention: lowercase it, collapse whitespace runs to one space, trim the ends."""
    # 1. make everything lowercase
    lowered = text.lower()
    # (digit substitution is disabled: re.sub(r'\d+', '#', text))
    # 2. standardize spacing
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()
 def is_int_string(s):
    """
    Return True when `int(s)` succeeds, False otherwise.
    Values that int() rejects with a different exception type - None and
    other unsupported types (TypeError), infinities (OverflowError) - are
    reported as False as well instead of propagating the exception.
    """
    try:
        int(s)
    except (ValueError, TypeError, OverflowError):
        return False
    return True
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """
    Turn the test dataframe into a list of {'text', 'labels'} records.
    Rows whose 'entity_id' cannot be read as an integer are skipped. 'text'
    is the preprocessed 'mention'; 'labels' is the consecutive class index
    looked up from the integer entity id.
    Raises:
        KeyError: If an entity id is not part of `label2id`.
    """
    output_list = []
    for _, row in df.iterrows():
        row_id = row['entity_id']
        # rows without an integer entity id cannot be mapped to a class
        if not is_int_string(row_id):
            continue
        row_id = int(row_id)
        desc = row['mention']
        # Non-string cells (e.g. a float NaN) have no .lower(); stringify them
        # the way the training script does, so training and evaluation
        # preprocess such mentions identically.
        if not isinstance(desc, str):
            desc = f'{desc}'
        desc = preprocess_text(desc)
        element = {
            'text' : desc,
            'labels': label2id[row_id], # ensure labels starts from 0
        }
        output_list.append(element)
    return output_list
 def create_dataset():
    """Read the bc2gm test CSV and return a DatasetDict holding one 'test' split."""
    # test split
    test_csv = '../../../../biomedical_data_import/bc2gm_test.csv'
    records = process_df_to_dict(pd.read_csv(test_csv, skipinitialspace=True))
    return DatasetDict({'test': Dataset.from_list(records)})
 # %%
 def test():
    """
    Evaluate the saved checkpoint on the test split.
    Loads the first 'checkpoint-*' directory found under '../checkpoint',
    runs batched inference, appends accuracy / weighted F1 / precision /
    recall to 'output.txt' and writes the predicted entity ids to
    'exports/result.csv'.
    Raises:
        FileNotFoundError: If no checkpoint directory matches the pattern.
    """
    test_dataset = create_dataset()
    # locate the checkpoint
    checkpoint_directory = '../checkpoint'
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # training is configured to keep only 1 checkpoint
    pattern = 'checkpoint-*'
    checkpoint_glob = os.path.join(checkpoint_directory, pattern)
    matches = glob.glob(checkpoint_glob)
    if not matches:
        # fail with an explicit message instead of an IndexError on [0]
        raise FileNotFoundError(f"no checkpoint found matching {checkpoint_glob}")
    model_checkpoint = matches[0]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # given a batch of dataset entries, run the texts through the tokenizer
    def preprocess_function(example):
        texts = example['text']
        model_inputs = tokenizer(
            texts,
            truncation=True,
        )
        return model_inputs
    # map applies the function to batches of rows of each split
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    # print datasets['test'] columns
    column_info = datasets['test'].features
    for column, dtype in column_info.items():
        print(f"Column: {column}, Type: {dtype}")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # keep the embedding matrix in sync with the tokenizer vocabulary size
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()
    device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    pred_labels = []
    actual_labels = []
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    dataloader = DataLoader(
        datasets['test'],
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=data_collator)
    for batch in tqdm(dataloader):
        # keep the true labels as plain ints, same representation as the predictions
        actual_labels.extend(batch['labels'].tolist())
        # move inputs to the inference device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # inference in batches, no gradients needed
        with torch.no_grad():
            logits = model(
                input_ids,
                attention_mask).logits
        pred_labels.extend(logits.argmax(dim=1).tolist())
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    y_true = actual_labels
    y_pred = pred_labels
    # compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    with open("output.txt", "a") as f:
        print('*' * 80, file=f)
        # print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)
    # export result: map class indices back to the original entity ids
    label_list = [id2label[pred_id] for pred_id in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })
    # make sure the export directory exists before writing into it
    os.makedirs("exports", exist_ok=True)
    df.to_csv("exports/result.csv", index=False)
 # %%
 # reset file before writing to it
 with open("output.txt", "w") as f:
    print('', file=f)
 # Run the evaluation only after the reset handle is closed: test() re-opens
 # output.txt in append mode, and a write still buffered in the "w" handle
 # would otherwise be flushed at offset 0 on top of the appended results.
 test()
--- a/biomedical_train/bc2gm/simple/train.py
+++ b/biomedical_train/bc2gm/simple/train.py
@ -0,0 +1,368 @@
 # %%
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 torch.set_float32_matmul_precision('high')
 # %%
 def set_seed(seed):
    """
    Seed every random number generator this script relies on.
    Covers Python's `random`, NumPy and PyTorch (CPU and CUDA), and puts
    cuDNN into deterministic mode with benchmark autotuning turned off.
    """
    # CPU-side generators: Python, NumPy, PyTorch
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # CUDA generators: current device, then every device
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # trade cuDNN speed for run-to-run reproducibility
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
 set_seed(42)
 # number of shuffled variants generated per mention; 0 disables shuffling
 SHUFFLES = 0
 # %%
 # We want to map the entity_id to a consecutive set of ids
 # import training file
 # This script lives in biomedical_train/bc2gm/simple/, so the repo-level data
 # directory is three levels up (the same path dynamic_train.py uses).
 data_path = '../../../biomedical_data_import/bc2gm_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 # the label space is built from the entity ids actually present in the data
 entity_ids = train_df['entity_id'].to_list()
 target_id_list = sorted(set(entity_ids))
 # %%
 # consecutive class index <-> original entity id
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: class_idx for class_idx, entity in id2label.items()}
 # %%
 # introduce pre-processing functions
 def preprocess_text(text):
    """Normalize a mention: lowercase it, collapse whitespace runs to one space, trim the ends."""
    # 1. make everything lowercase
    lowered = text.lower()
    # (digit substitution is disabled: re.sub(r'\d+', '#', text))
    # 2. standardize spacing
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()
 def generate_random_shuffles(text, n):
    """
    Build `n` variants of `text` with its words in random order.
    Args:
        text (str): The input text; words are its whitespace-separated tokens.
        n (int): How many shuffled variants to produce.
    Returns:
        list: `n` strings, each containing the words of `text` re-joined
        with single spaces after an independent random shuffle.
    """
    tokens = text.split()
    def _one_shuffle():
        # shuffle a copy so the token list stays intact for the next variant
        candidate = list(tokens)
        random.shuffle(candidate)
        return " ".join(candidate)
    return [_one_shuffle() for _ in range(n)]
 # generate n more shuffled examples
 def shuffle_text(text, n_shuffles=SHUFFLES):
    """
    Return the text together with `n_shuffles` word-shuffled variants of it.
    Args:
        text (str): A single input string.
        n_shuffles (int): Number of random shuffles to generate.
    Returns:
        list: The original string first, followed by the shuffled variants.
    """
    # original first, shuffled variants after it
    return [text, *generate_random_shuffles(text, n_shuffles)]
 ######################################
 # augmentation by text corruption
 def corrupt_word(word):
    """
    Apply one random character-level corruption to `word`.
    Words of length 0 or 1 are returned unchanged. Otherwise either one
    character is deleted or two adjacent characters are swapped, chosen at
    random.
    """
    if len(word) <= 1:
        return word
    corruption_type = random.choice(["delete", "swap"])
    if corruption_type == "delete":
        # drop the character at a random position
        cut = random.randint(0, len(word) - 1)
        return word[:cut] + word[cut + 1:]
    if corruption_type == "swap":
        # exchange a random character with its right neighbour
        # (the guard above ensures there are at least two characters)
        left = random.randint(0, len(word) - 2)
        chars = list(word)
        chars[left], chars[left + 1] = chars[left + 1], chars[left]
        return "".join(chars)
    return word
 def corrupt_string(sentence, corruption_probability=0.01):
    """
    Return `sentence` with some of its whitespace-separated words corrupted.
    Each word is independently handed to `corrupt_word` with probability
    `corruption_probability`; the words are re-joined with single spaces.
    """
    result = []
    for token in sentence.split():
        # one random draw per word decides whether that word is corrupted
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        result.append(token)
    return " ".join(result)
 #############################################################
 # Data Run code here
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """
    Turn the dataframe into a list of {'text', 'label'} training records.
    For every row the preprocessed 'mention' is emitted once unchanged,
    followed by any shuffled variants that differ from it. 'label' is the
    consecutive class index looked up from the row's 'entity_id'.
    """
    records = []
    for _, row in df.iterrows():
        entity_id = row['entity_id']
        mention = row['mention']
        # float mentions (presumably NaN cells - confirm) are logged and stringified
        if isinstance(mention, float):
            print(mention)
            mention = f'{mention}'
        mention = preprocess_text(mention)
        class_index = label2id[entity_id]  # labels are consecutive, starting from 0
        # the unaugmented mention always comes first
        records.append({'text': mention, 'label': class_index})
        # shuffled variants; shuffle_text also returns the original, which is
        # filtered out together with any shuffle identical to it
        records.extend(
            {'text': variant, 'label': class_index}
            for variant in shuffle_text(mention, n_shuffles=SHUFFLES)
            if variant != mention
        )
        # Disabled experiments, kept as notes:
        # - oversampling: append 10 extra copies of mentions shorter than 3 words,
        #   so rare short sequences are not drowned out by augmented longer ones
        # - corruption: corrupt_string(mention, corruption_probability=0.1)
        # - punctuation stripping: re.sub(r'[^\w\s]', ' ', mention)
    return records
 def create_dataset():
    """Read the bc2gm training CSV and return a DatasetDict holding one 'train' split."""
    # This script lives in biomedical_train/bc2gm/simple/, so the repo-level
    # data directory is three levels up (the same path dynamic_train.py uses).
    data_path = '../../../biomedical_data_import/bc2gm_train.csv'
    train_df = pd.read_csv(data_path, skipinitialspace=True)
    combined_data = DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df)),
    })
    return combined_data
 # %%
 #########################################
 # training function
 def train():
    """
    Fine-tune DistilBERT as a sequence classifier over the entity-id classes.
    Builds the dataset, tokenizes it, and runs the Hugging Face Trainer with
    evaluation switched off; checkpoints are written under 'checkpoint'.
    """
    output_dir = 'checkpoint'
    raw_datasets = create_dataset()
    # base model / tokenizer
    # alternatives tried: 'google-bert/bert-base-cased', 'prajjwal1/bert-small'
    model_checkpoint = "distilbert/distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        return_tensors="pt",
        clean_up_tokenization_spaces=True,
    )
    # max_length = 120
    def preprocess_function(example):
        # tokenize a batch of texts; truncation only, padding is left to the collator
        return tokenizer(example['text'], truncation=True)
    # batched map over each split; the raw strings are dropped afterwards
    tokenized_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    # pads every batch to its own longest sequence
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    accuracy = evaluate.load("accuracy")
    def compute_metrics(eval_preds):
        logits, references = eval_preds
        return accuracy.compute(predictions=np.argmax(logits, axis=1), references=references)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id,
    )
    # keep the embedding matrix in sync with the tokenizer vocabulary size
    model.resize_token_embeddings(len(tokenizer))
    # model = torch.compile(model, backend="inductor", dynamic=True)
    trainer_settings = dict(
        output_dir=output_dir,
        eval_strategy="no",  # switch to "epoch" to evaluate during training
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-3,
        per_device_train_batch_size=512,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    training_args = TrainingArguments(**trainer_settings)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # to resume instead of starting fresh:
    # trainer.train(resume_from_checkpoint='default_40_1/checkpoint-5600')
    trainer.train()
 # execute training
 train()
 # %%
--- a/biomedical_train/bc5cdr-chemical/augmentation/.gitignore
+++ b/biomedical_train/bc5cdr-chemical/augmentation/.gitignore
--- a/biomedical_train/bc5cdr-chemical/augmentation/prediction/.gitignore
+++ b/biomedical_train/bc5cdr-chemical/augmentation/prediction/.gitignore
--- a/biomedical_train/bc5cdr-chemical/augmentation/prediction/output.txt
+++ b/biomedical_train/bc5cdr-chemical/augmentation/prediction/output.txt
@ -0,0 +1 @@
--- a/biomedical_train/bc5cdr-chemical/augmentation/prediction/predict.py
+++ b/biomedical_train/bc5cdr-chemical/augmentation/prediction/predict.py
@ -0,0 +1,236 @@
 # %%
 # from datasets import load_from_disk
 import os
 import glob
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 256
 # %%
 # construct the target id list
 # This script sits in biomedical_train/bc5cdr-chemical/augmentation/prediction/,
 # i.e. four directories below the repository root that holds
 # biomedical_data_import (assumes the script is run from its own directory).
 data_path = '../../../../biomedical_data_import/bc5cdr-chemical_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 # sorting makes the entity_id -> class index assignment reproducible
 target_id_list = sorted(set(entity_ids))
 # %%
 # id2label: class index -> entity_id ; label2id: entity_id -> class index
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: position for position, entity in id2label.items()}
 # introduce pre-processing functions
 def preprocess_text(text):
    """Normalize a mention string: lower-case it, collapse every run of
    whitespace into one space and strip leading/trailing whitespace."""
    lowered = text.lower()
    # digit masking (re.sub(r'\d+', '#', ...)) is intentionally left disabled
    return re.sub(r'\s+', ' ', lowered).strip()
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """Convert a mention dataframe into a list of classifier examples.
    Args:
        df: dataframe with a 'mention' column (input text) and an
            'entity_id' column (gold entity identifier).
    Returns:
        list of {'text': normalized mention, 'labels': class index} dicts,
        one per row, in row order.
    Raises:
        KeyError: if a row's entity_id is not a key of the module-level
            label2id mapping.
    """
    output_list = []
    for _, row in df.iterrows():
        desc = row['mention']
        # pandas reads cells such as "NA" or an empty field as float NaN;
        # stringify them the way the training script does, otherwise
        # preprocess_text() fails on a non-string value
        if isinstance(desc, float):
            desc = f'{desc}'
        desc = preprocess_text(desc)
        row_id = row['entity_id']
        output_list.append({
            'text' : desc,
            'labels': label2id[row_id], # ensure labels starts from 0
        })
    return output_list
 def create_dataset():
    """Load the bc5cdr-chemical test csv and wrap it in a DatasetDict.
    Returns:
        DatasetDict with a single 'test' split built by process_df_to_dict().
    """
    # test split; four levels up from
    # biomedical_train/bc5cdr-chemical/augmentation/prediction/ is the
    # repository root (assumes the script is run from its own directory)
    data_path = '../../../../biomedical_data_import/bc5cdr-chemical_test.csv'
    test_df = pd.read_csv(data_path, skipinitialspace=True)
    combined_data = DatasetDict({
        'test': Dataset.from_list(process_df_to_dict(test_df)),
    })
    return combined_data
 # %%
 def test():
    """Evaluate the trained classifier on the test split and export results.
    Loads the first 'checkpoint-*' directory found under '../checkpoint',
    predicts a class for every test mention, appends accuracy and weighted
    F1 / precision / recall to 'output.txt' and writes the predicted entity
    ids to 'exports/result.csv'.
    Raises:
        FileNotFoundError: if no 'checkpoint-*' directory exists under
            '../checkpoint'.
    """
    test_dataset = create_dataset()
    # prepare tokenizer
    checkpoint_directory = '../checkpoint'
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # training is expected to leave a single checkpoint behind
    pattern = 'checkpoint-*'
    matches = glob.glob(os.path.join(checkpoint_directory, pattern))
    if not matches:
        # fail with an explicit message instead of a bare IndexError
        raise FileNotFoundError(
            f"no '{pattern}' directory found in '{checkpoint_directory}'; run training first")
    model_checkpoint = matches[0]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # given a batch of dataset entries, run the text through the tokenizer
    def preprocess_function(example):
        # the 'labels' column already exists in the dataset, so only the
        # text has to be tokenized here
        return tokenizer(
            example['text'],
            truncation=True,
        )
    # map applies the function to batches of rows of every split
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    # print datasets['test'] columns
    for column, dtype in datasets['test'].features.items():
        print(f"Column: {column}, Type: {dtype}")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # keep the embedding matrix in sync with the tokenizer vocabulary
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    pred_labels = []
    actual_labels = []
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    dataloader = DataLoader(
        datasets['test'],
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=data_collator)
    for batch in tqdm(dataloader):
        # store gold labels as plain ints rather than 0-d tensors
        actual_labels.extend(batch['labels'].tolist())
        # Move to GPU if available
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids,
                attention_mask).logits
        pred_labels.extend(logits.argmax(dim=1).tolist())
    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    y_true = actual_labels
    y_pred = pred_labels
    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    with open("output.txt", "a") as f:
        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)
    # export result: map predicted class indices back to entity ids
    label_list = [id2label[class_id] for class_id in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })
    # create the export directory if it does not exist yet, otherwise
    # to_csv fails on a fresh checkout
    os.makedirs("exports", exist_ok=True)
    df.to_csv("exports/result.csv", index=False)
 # %%
 # reset file before writing to it
 # The handle must be closed before test() runs: test() re-opens output.txt in
 # append mode, and a newline still buffered in this "w" handle would be
 # flushed at offset 0 afterwards, overwriting the first byte of the results.
 with open("output.txt", "w") as f:
    print('', file=f)
 test()
--- a/biomedical_train/bc5cdr-chemical/augmentation/train.py
+++ b/biomedical_train/bc5cdr-chemical/augmentation/train.py
@ -0,0 +1,368 @@
 # %%
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 torch.set_float32_matmul_precision('high')
 # %%
 def set_seed(seed):
    """
    Seed every random number generator used here and make cuDNN deterministic.
    Args:
        seed (int): value passed to each generator.
    """
    # Python random, NumPy, PyTorch CPU, PyTorch current GPU, PyTorch all GPUs
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True  # deterministic kernels
    cudnn.benchmark = False  # no auto-tuning, for reproducibility
 set_seed(42)
 SHUFFLES=0  # 0 shuffles means it does not re-sample
 # %%
 # We want to map the entity_id to a consecutive set of id's
 # import training file
 # This script sits in biomedical_train/bc5cdr-chemical/augmentation/, i.e.
 # three directories below the repository root that holds
 # biomedical_data_import (assumes the script is run from its own directory).
 data_path = '../../../biomedical_data_import/bc5cdr-chemical_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 # rather than use pattern, we use the real thing and property
 entity_ids = train_df['entity_id'].to_list()
 # sorting makes the entity_id -> class index assignment reproducible
 target_id_list = sorted(list(set(entity_ids)))
 # %%
 # id2label: class index -> entity_id ; label2id: entity_id -> class index
 id2label = {}
 label2id = {}
 for idx, val in enumerate(target_id_list):
    id2label[idx] = val
    label2id[val] = idx
 # %%
 # introduce pre-processing functions
 def preprocess_text(text):
    """Normalize a mention string: lower-case it, collapse every run of
    whitespace into one space and strip leading/trailing whitespace."""
    lowered = text.lower()
    # digit masking (re.sub(r'\d+', '#', ...)) is intentionally left disabled
    return re.sub(r'\s+', ' ', lowered).strip()
 def generate_random_shuffles(text, n):
    """
    Build n copies of the input text, each with its words in random order.
    Args:
        text (str): The input text; words are separated by whitespace.
        n (int): How many shuffled copies to produce.
    Returns:
        list: n strings, each holding the same words joined by single spaces.
    """
    tokens = text.split()
    def _shuffled_copy():
        # shuffle a fresh copy so `tokens` keeps the original word order
        candidate = list(tokens)
        random.shuffle(candidate)
        return " ".join(candidate)
    return [_shuffled_copy() for _ in range(n)]
 # generate n more shuffled examples
 def shuffle_text(text, n_shuffles=SHUFFLES):
    """
    Return the input string followed by n_shuffles word-shuffled copies of it.
    Args:
        text (str): A single input string.
        n_shuffles (int): Number of random shuffles to generate for the string.
    Returns:
        list: The original string first, then the shuffled variations.
    """
    # original text always comes first; shuffles are appended after it
    return [text, *generate_random_shuffles(text, n_shuffles)]
 ######################################
 # augmentation by text corruption
 def corrupt_word(word):
    """Return the word with one random character-level corruption applied.
    Either deletes one character or swaps two adjacent characters; words of
    length 0 or 1 are returned unchanged."""
    length = len(word)
    if length <= 1:
        return word
    if random.choice(["delete", "swap"]) == "delete":
        # drop the character at a random position
        drop_at = random.randint(0, length - 1)
        return word[:drop_at] + word[drop_at + 1:]
    # "swap": exchange the character at `left` with its right neighbour
    left = random.randint(0, length - 2)
    return word[:left] + word[left + 1] + word[left] + word[left + 2:]
 def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each whitespace-separated word independently with the given
    probability and return the words re-joined with single spaces."""
    result = []
    for token in sentence.split():
        # one random draw per word decides whether it gets corrupted
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        result.append(token)
    return " ".join(result)
 #############################################################
 # Data Run code here
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """Turn a mention dataframe into a list of training examples.
    For every row the normalized mention is emitted first, followed by the
    word-shuffled variants from shuffle_text() that differ from it. Each
    example is {'text': str, 'labels': class index from label2id}.
    """
    # NOTE: other augmentations tried here are currently disabled:
    #   - adding 10 extra copies of mentions shorter than 3 words
    #   - corrupt_string(parent_desc, corruption_probability=0.1)
    #   - re.sub(r'[^\w\s]', ' ', parent_desc) to drop non-alphanumerics
    examples = []
    for _, record in df.iterrows():
        entity = record['entity_id']
        mention = record['mention']
        if isinstance(mention, float):
            # non-string mention: show it, then fall back to its string form
            print(mention)
            mention = f'{mention}'
        mention = preprocess_text(mention)
        class_index = label2id[entity]  # ensure labels starts from 0
        # unaugmented data
        examples.append({'text': mention, 'labels': class_index})
        # shuffled strings; copies identical to the mention are skipped
        examples.extend(
            {'text': variant, 'labels': class_index}
            for variant in shuffle_text(mention, n_shuffles=SHUFFLES)
            if variant != mention
        )
    return examples
 def create_dataset():
    """Load the bc5cdr-chemical training csv and wrap it in a DatasetDict.
    Returns:
        DatasetDict with a single 'train' split built by process_df_to_dict().
    """
    # The label2id mapping is built from the *_train.csv split, so the
    # training examples must come from that split too; otherwise
    # process_df_to_dict() can hit entity ids without a class index.
    # Three levels up from biomedical_train/bc5cdr-chemical/augmentation/ is
    # the repository root (assumes the script is run from its own directory).
    data_path = '../../../biomedical_data_import/bc5cdr-chemical_train.csv'
    train_df = pd.read_csv(data_path, skipinitialspace=True)
    combined_data = DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df)),
    })
    return combined_data
 # %%
 #########################################
 # training function
 def train():
    """Fine-tune a sequence classifier on the dataset from create_dataset().
    Tokenizes the 'train' split, builds an AutoModelForSequenceClassification
    with one output per entry of target_id_list and runs Trainer.train().
    Trainer output goes to the 'checkpoint' directory, logs to
    'tensorboard-log'. No evaluation is run during training
    (eval_strategy="no", no eval dataset is passed).
    """
    save_path = f'checkpoint'
    split_datasets = create_dataset()
    # prepare tokenizer
    model_checkpoint = "distilbert/distilbert-base-uncased"
    # model_checkpoint = 'google-bert/bert-base-cased'
    # model_checkpoint = 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # max_length = 120
    # given a batch of dataset entries, run the text through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # only the text is tokenized here; the 'labels' column created by
        # process_df_to_dict() is left untouched
        model_inputs = tokenizer(
            input,
            truncation=True, # enable truncation for efficiency
        )
        return model_inputs
    # map applies the function to batches of rows (batched=True)
    # using 8 worker processes
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text", # we only need the tokenization, not the original strings
    )
    # %%
    # create data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # %%
    # compute metrics
    # NOTE: compute_metrics is handed to the Trainer below, but with
    # eval_strategy="no" no evaluation pass is scheduled during training
    metric = evaluate.load("accuracy")
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # highest-scoring class per example
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)
    # %%
    # id2label and label2id are the module-level maps built from the training csv
    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # keeps the embedding matrix size equal to the tokenizer vocabulary size
    # NOTE(review): no tokens are added to the tokenizer in this function
    model.resize_token_embeddings(len(tokenizer))
    # model = torch.compile(model, backend="inductor", dynamic=True)
    # %%
    # Trainer
    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-3,
        per_device_train_batch_size=512,
        # per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator, # data_collator performs dynamic padding
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)
    trainer.train()
 # execute training (runs whenever this file is executed or imported)
 train()
 # %%
--- a/biomedical_train/bc5cdr-chemical/simple/.gitignore
+++ b/biomedical_train/bc5cdr-chemical/simple/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/biomedical_train/bc5cdr-chemical/simple/prediction/.gitignore
+++ b/biomedical_train/bc5cdr-chemical/simple/prediction/.gitignore
@ -0,0 +1 @@
 exports
--- a/biomedical_train/bc5cdr-chemical/simple/prediction/output.txt
+++ b/biomedical_train/bc5cdr-chemical/simple/prediction/output.txt
@ -0,0 +1,6 @@
 *******************************************************************************
 Accuracy: 0.04872
 F1 Score: 0.04283
 Precision: 0.04903
 Recall: 0.04872
--- a/biomedical_train/bc5cdr-chemical/simple/prediction/predict.py
+++ b/biomedical_train/bc5cdr-chemical/simple/prediction/predict.py
@ -0,0 +1,234 @@
 # %%
 # from datasets import load_from_disk
 import os
 import glob
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 32
 # %%
 # build the list of target entity ids from the training split
 data_path = '../../../../biomedical_data_import/bc5cdr-chemical_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 # unique ids in sorted order, so class indices are reproducible
 target_id_list = sorted(set(entity_ids))
 # %%
 # id2label: class index -> entity_id ; label2id: entity_id -> class index
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: position for position, entity in id2label.items()}
 # introduce pre-processing functions
 def preprocess_text(text):
    """Normalize a mention string: lower-case it, collapse every run of
    whitespace into one space and strip leading/trailing whitespace."""
    lowered = text.lower()
    # digit masking (re.sub(r'\d+', '#', ...)) is intentionally left disabled
    return re.sub(r'\s+', ' ', lowered).strip()
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """Convert a mention dataframe into a list of classifier examples.
    Args:
        df: dataframe with a 'mention' column (input text) and an
            'entity_id' column (gold entity identifier).
    Returns:
        list of {'text': normalized mention, 'labels': class index} dicts,
        one per row, in row order.
    Raises:
        KeyError: if a row's entity_id is not a key of the module-level
            label2id mapping.
    """
    output_list = []
    for _, row in df.iterrows():
        desc = row['mention']
        # pandas reads cells such as "NA" or an empty field as float NaN;
        # stringify them the way the training script does, otherwise
        # preprocess_text() fails on a non-string value
        if isinstance(desc, float):
            desc = f'{desc}'
        desc = preprocess_text(desc)
        row_id = row['entity_id']
        output_list.append({
            'text' : desc,
            'labels': label2id[row_id], # ensure labels starts from 0
        })
    return output_list
 def create_dataset():
    """Read the bc5cdr-chemical test csv and return it as a DatasetDict
    holding a single 'test' split built by process_df_to_dict()."""
    # test split
    test_df = pd.read_csv(
        '../../../../biomedical_data_import/bc5cdr-chemical_test.csv',
        skipinitialspace=True,
    )
    examples = process_df_to_dict(test_df)
    return DatasetDict({'test': Dataset.from_list(examples)})
 # %%
 def test():
    """Evaluate the trained classifier on the test split and export results.
    Loads the first 'checkpoint-*' directory found under '../checkpoint',
    predicts a class for every test mention, appends accuracy and weighted
    F1 / precision / recall to 'output.txt' and writes the predicted entity
    ids to 'exports/result.csv'.
    Raises:
        FileNotFoundError: if no 'checkpoint-*' directory exists under
            '../checkpoint'.
    """
    test_dataset = create_dataset()
    # prepare tokenizer
    checkpoint_directory = '../checkpoint'
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # training is expected to leave a single checkpoint behind
    pattern = 'checkpoint-*'
    matches = glob.glob(os.path.join(checkpoint_directory, pattern))
    if not matches:
        # fail with an explicit message instead of a bare IndexError
        raise FileNotFoundError(
            f"no '{pattern}' directory found in '{checkpoint_directory}'; run training first")
    model_checkpoint = matches[0]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # given a batch of dataset entries, run the text through the tokenizer
    def preprocess_function(example):
        # the 'labels' column already exists in the dataset, so only the
        # text has to be tokenized here
        return tokenizer(
            example['text'],
            truncation=True,
        )
    # map applies the function to batches of rows of every split
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    # print datasets['test'] columns
    for column, dtype in datasets['test'].features.items():
        print(f"Column: {column}, Type: {dtype}")
    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # keep the embedding matrix in sync with the tokenizer vocabulary
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()
    # NOTE(review): 'cuda:3' needs four visible GPUs (CUDA_VISIBLE_DEVICES
    # is set to "0,1,2,3" at the top of this script)
    device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    pred_labels = []
    actual_labels = []
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    dataloader = DataLoader(
        datasets['test'],
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=data_collator)
    for batch in tqdm(dataloader):
        # store gold labels as plain ints rather than 0-d tensors
        actual_labels.extend(batch['labels'].tolist())
        # Move to GPU if available
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids,
                attention_mask).logits
        pred_labels.extend(logits.argmax(dim=1).tolist())
    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    y_true = actual_labels
    y_pred = pred_labels
    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    with open("output.txt", "a") as f:
        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)
    # export result: map predicted class indices back to entity ids
    label_list = [id2label[class_id] for class_id in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })
    # create the export directory if it does not exist yet, otherwise
    # to_csv fails on a fresh checkout
    os.makedirs("exports", exist_ok=True)
    df.to_csv("exports/result.csv", index=False)
 # %%
 # reset file before writing to it
 # The handle must be closed before test() runs: test() re-opens output.txt in
 # append mode, and a newline still buffered in this "w" handle would be
 # flushed at offset 0 afterwards, overwriting the first byte of the results.
 with open("output.txt", "w") as f:
    print('', file=f)
 test()
--- a/biomedical_train/bc5cdr-chemical/simple/train.py
+++ b/biomedical_train/bc5cdr-chemical/simple/train.py
@ -0,0 +1,367 @@
 # %%
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 torch.set_float32_matmul_precision('high')
 # %%
 def set_seed(seed):
    """
    Seed every random number generator used here and make cuDNN deterministic.
    Args:
        seed (int): value passed to each generator.
    """
    # Python random, NumPy, PyTorch CPU, PyTorch current GPU, PyTorch all GPUs
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True  # deterministic kernels
    cudnn.benchmark = False  # no auto-tuning, for reproducibility
 set_seed(42)
 SHUFFLES = 0  # shuffled copies added per mention; 0 means no re-sampling
 # %%
 # Map every entity_id of the training file to a consecutive class index
 data_path = '../../../biomedical_data_import/bc5cdr-chemical_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 # the label space is taken from the ids actually present in the data
 entity_ids = train_df['entity_id'].to_list()
 target_id_list = sorted(set(entity_ids))
 # %%
 # id2label: class index -> entity_id ; label2id: entity_id -> class index
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: position for position, entity in id2label.items()}
 # %%
 # introduce pre-processing functions
 def preprocess_text(text):
    """Normalize a mention string: lower-case it, collapse every run of
    whitespace into one space and strip leading/trailing whitespace."""
    lowered = text.lower()
    # digit masking (re.sub(r'\d+', '#', ...)) is intentionally left disabled
    return re.sub(r'\s+', ' ', lowered).strip()
 def generate_random_shuffles(text, n):
    """
    Build n copies of the input text, each with its words in random order.
    Args:
        text (str): The input text; words are separated by whitespace.
        n (int): How many shuffled copies to produce.
    Returns:
        list: n strings, each holding the same words joined by single spaces.
    """
    tokens = text.split()
    def _shuffled_copy():
        # shuffle a fresh copy so `tokens` keeps the original word order
        candidate = list(tokens)
        random.shuffle(candidate)
        return " ".join(candidate)
    return [_shuffled_copy() for _ in range(n)]
 # generate n more shuffled examples
 def shuffle_text(text, n_shuffles=SHUFFLES):
    """
    Return the input string followed by n_shuffles word-shuffled copies of it.
    Args:
        text (str): A single input string.
        n_shuffles (int): Number of random shuffles to generate for the string.
    Returns:
        list: The original string first, then the shuffled variations.
    """
    # original text always comes first; shuffles are appended after it
    return [text, *generate_random_shuffles(text, n_shuffles)]
 ######################################
 # augmentation by text corruption
 def corrupt_word(word):
    """Return the word with one random character-level corruption applied.
    Either deletes one character or swaps two adjacent characters; words of
    length 0 or 1 are returned unchanged."""
    length = len(word)
    if length <= 1:
        return word
    if random.choice(["delete", "swap"]) == "delete":
        # drop the character at a random position
        drop_at = random.randint(0, length - 1)
        return word[:drop_at] + word[drop_at + 1:]
    # "swap": exchange the character at `left` with its right neighbour
    left = random.randint(0, length - 2)
    return word[:left] + word[left + 1] + word[left] + word[left + 2:]
 def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each whitespace-separated word independently with the given
    probability and return the words re-joined with single spaces."""
    result = []
    for token in sentence.split():
        # one random draw per word decides whether it gets corrupted
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        result.append(token)
    return " ".join(result)
 #############################################################
 # Data Run code here
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """Turn a mention dataframe into a list of training examples.
    For every row the normalized mention is emitted first, followed by the
    word-shuffled variants from shuffle_text() that differ from it. Each
    example is {'text': str, 'labels': class index from label2id}.
    """
    # NOTE: other augmentations tried here are currently disabled:
    #   - adding 10 extra copies of mentions shorter than 3 words
    #   - corrupt_string(parent_desc, corruption_probability=0.1)
    #   - re.sub(r'[^\w\s]', ' ', parent_desc) to drop non-alphanumerics
    examples = []
    for _, record in df.iterrows():
        entity = record['entity_id']
        mention = record['mention']
        if isinstance(mention, float):
            # non-string mention: show it, then fall back to its string form
            print(mention)
            mention = f'{mention}'
        mention = preprocess_text(mention)
        class_index = label2id[entity]  # ensure labels starts from 0
        # unaugmented data
        examples.append({'text': mention, 'labels': class_index})
        # shuffled strings; copies identical to the mention are skipped
        examples.extend(
            {'text': variant, 'labels': class_index}
            for variant in shuffle_text(mention, n_shuffles=SHUFFLES)
            if variant != mention
        )
    return examples
 def create_dataset():
    """Read the bc5cdr-chemical training csv and return it as a DatasetDict
    holding a single 'train' split built by process_df_to_dict()."""
    # training split
    train_df = pd.read_csv(
        '../../../biomedical_data_import/bc5cdr-chemical_train.csv',
        skipinitialspace=True,
    )
    examples = process_df_to_dict(train_df)
    return DatasetDict({'train': Dataset.from_list(examples)})
 # %%
 #########################################
 # training function
 def train():
    """Fine-tune a sequence classifier on the dataset from create_dataset().
    Tokenizes the 'train' split, builds an AutoModelForSequenceClassification
    with one output per entry of target_id_list and runs Trainer.train().
    Trainer output goes to the 'checkpoint' directory, logs to
    'tensorboard-log'. No evaluation is run during training
    (eval_strategy="no", no eval dataset is passed).
    """
    save_path = f'checkpoint'
    split_datasets = create_dataset()
    # prepare tokenizer
    model_checkpoint = "distilbert/distilbert-base-uncased"
    # model_checkpoint = 'google-bert/bert-base-cased'
    # model_checkpoint = 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # max_length = 120
    # given a batch of dataset entries, run the text through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # only the text is tokenized here; the 'labels' column created by
        # process_df_to_dict() is left untouched
        model_inputs = tokenizer(
            input,
            truncation=True, # enable truncation for efficiency
        )
        return model_inputs
    # map applies the function to batches of rows (batched=True)
    # using 8 worker processes
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text", # we only need the tokenization, not the original strings
    )
    # %%
    # create data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # %%
    # compute metrics
    # NOTE: compute_metrics is handed to the Trainer below, but with
    # eval_strategy="no" no evaluation pass is scheduled during training
    metric = evaluate.load("accuracy")
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # highest-scoring class per example
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)
    # %%
    # id2label and label2id are the module-level maps built from the training csv
    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # keeps the embedding matrix size equal to the tokenizer vocabulary size
    # NOTE(review): no tokens are added to the tokenizer in this function
    model.resize_token_embeddings(len(tokenizer))
    # model = torch.compile(model, backend="inductor", dynamic=True)
    # %%
    # Trainer
    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        # per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator, # data_collator performs dynamic padding
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)
    trainer.train()
 # execute training (runs whenever this file is executed or imported)
 train()
--- a/biomedical_train/bc5cdr-disease/.gitignore
+++ b/biomedical_train/bc5cdr-disease/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/biomedical_train/bc5cdr-disease/prediction/.gitignore
+++ b/biomedical_train/bc5cdr-disease/prediction/.gitignore
@ -0,0 +1 @@
 exports
--- a/biomedical_train/bc5cdr-disease/prediction/output.txt
+++ b/biomedical_train/bc5cdr-disease/prediction/output.txt
@ -0,0 +1 @@
--- a/biomedical_train/bc5cdr-disease/prediction/predict.py
+++ b/biomedical_train/bc5cdr-disease/prediction/predict.py
@ -0,0 +1,236 @@
 # %%
 # from datasets import load_from_disk
 import os
 import glob
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 256
 # %%
 # Rebuild the label vocabulary from the training CSV; train.py derives its
 # class indices the same way, so the two scripts agree on the mapping.
 # NOTE(review): this script lives under bc5cdr-disease/ but reads the bc2gm
 # training file -- confirm that this is the intended dataset.
 data_path = '../../../biomedical_data_import/bc2gm_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 # unique entity ids in sorted order; the position in this list is the class index
 target_id_list = sorted(set(entity_ids))
 # %%
 # forward (class index -> entity id) and inverse (entity id -> class index) lookups
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: position for position, entity in id2label.items()}
 # introduce pre-processing functions
 def preprocess_text(text):
    """
    Normalize a mention before tokenization.
    The text is lower-cased, every run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is removed.
    Args:
        text: The raw mention. Non-string values (for example the float NaN
            pandas yields for an empty CSV cell) are converted with ``str``
            first, mirroring how train.py stringifies float mentions.
    Returns:
        str: The normalized text.
    """
    if not isinstance(text, str):
        # calling .lower() on a float NaN would raise AttributeError and abort
        # the whole prediction run
        text = str(text)
    text = text.lower()
    # standardize spacing
    return re.sub(r'\s+', ' ', text).strip()
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """
    Turn a mention dataframe into a list of model-ready records.
    Each row yields ``{'text': <normalized mention>, 'labels': <class index>}``,
    the class index being looked up in the module-level ``label2id``.
    Args:
        df: DataFrame with 'mention' and 'entity_id' columns.
    Returns:
        list: One dict per row, in row order.
    Raises:
        KeyError: If a row's entity_id is not a key of ``label2id``.
    """
    return [
        {
            'text': preprocess_text(record['mention']),
            'labels': label2id[record['entity_id']],  # consecutive index, starts at 0
        }
        for _, record in df.iterrows()
    ]
 def create_dataset():
    """
    Load the test CSV and wrap it in a DatasetDict with a single 'test' split.
    Returns:
        DatasetDict: ``{'test': Dataset}`` built from process_df_to_dict.
    """
    # only the test split is needed for prediction
    test_csv = '../../../biomedical_data_import/bc2gm_test.csv'
    test_df = pd.read_csv(test_csv, skipinitialspace=True)
    test_split = Dataset.from_list(process_df_to_dict(test_df))
    return DatasetDict({'test': test_split})
 # %%
 def test():
    """
    Run the saved classifier over the test split and record its metrics.
    Loads the checkpoint found under ``../checkpoint``, tokenizes the test
    mentions, predicts one class per mention in batches of BATCH_SIZE, appends
    accuracy and weighted F1 / precision / recall to ``output.txt`` and writes
    the predicted entity ids to ``exports/result.csv``.
    Raises:
        FileNotFoundError: If no ``checkpoint-*`` entry exists under
            ``../checkpoint``.
    """
    test_dataset = create_dataset()
    # prepare tokenizer
    checkpoint_directory = '../checkpoint'
    # path is usually <checkpoint_directory>/checkpoint-<step number>;
    # training is expected to leave exactly one of them behind
    pattern = 'checkpoint-*'
    candidates = glob.glob(os.path.join(checkpoint_directory, pattern))
    if not candidates:
        # fail with an explicit message instead of a bare IndexError
        raise FileNotFoundError(
            f"no '{pattern}' found under '{checkpoint_directory}'; run training first"
        )
    model_checkpoint = candidates[0]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # %%
    # given a batch of dataset entries, run the texts through the tokenizer
    def preprocess_function(example):
        # padding is left to the data collator; only truncate here
        return tokenizer(
            example['text'],
            truncation=True,
        )
    # batched map: preprocess_function receives many rows at a time
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    # print datasets['test'] columns
    for column, dtype in datasets['test'].features.items():
        print(f"Column: {column}, Type: {dtype}")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    pred_labels = []
    actual_labels = []
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    dataloader = DataLoader(
        datasets['test'],
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=data_collator)
    for batch in tqdm(dataloader):
        # keep the gold labels as plain ints so they match pred_labels
        actual_labels.extend(batch['labels'].tolist())
        # Move to GPU if available
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask).logits
        pred_labels.extend(logits.argmax(dim=1).tolist())
    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    y_true = actual_labels
    y_pred = pred_labels
    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    with open("output.txt", "a") as f:
        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)
    # export result: map predicted class indices back to entity ids
    label_list = [id2label[class_id] for class_id in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })
    # exports/ is git-ignored, so it does not exist on a fresh checkout
    os.makedirs("exports", exist_ok=True)
    df.to_csv("exports/result.csv", index=False)
 # %%
 # Reset the results file, and close it *before* running test(): test()
 # re-opens output.txt in append mode, and calling it while this handle is
 # still open lets the buffered newline below be flushed at offset 0
 # afterwards, overwriting the first character test() wrote.
 with open("output.txt", "w") as f:
    print('', file=f)
 test()
--- a/biomedical_train/bc5cdr-disease/train.py
+++ b/biomedical_train/bc5cdr-disease/train.py
@ -0,0 +1,368 @@
 # %%
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 torch.set_float32_matmul_precision('high')
 # %%
 def set_seed(seed):
    """
    Seed the Python, NumPy and PyTorch random number generators.
    Args:
        seed (int): Value handed to every seeding function below.
    """
    seeders = (
        random.seed,                 # Python random module
        np.random.seed,              # NumPy random
        torch.manual_seed,           # PyTorch CPU
        torch.cuda.manual_seed,      # PyTorch GPU
        torch.cuda.manual_seed_all,  # if using multiple GPUs
    )
    for seeder in seeders:
        seeder(seed)
    # request deterministic cuDNN behavior and disable its benchmark mode
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
 set_seed(42)
 # number of word-shuffled copies shuffle_text adds per mention; 0 adds none
 SHUFFLES = 0
 # %%
 # Map every entity_id of the training file to a consecutive class index.
 # NOTE(review): this script lives under bc5cdr-disease/ but reads the bc2gm
 # training file -- confirm that this is the intended dataset.
 data_path = '../../biomedical_data_import/bc2gm_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 # labels come from the actual entity ids found in the data
 entity_ids = train_df['entity_id'].to_list()
 target_id_list = sorted(set(entity_ids))
 # %%
 # forward (class index -> entity id) and inverse (entity id -> class index) lookups
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: position for position, entity in id2label.items()}
 # %%
 # introduce pre-processing functions
 def preprocess_text(text):
    """
    Lower-case ``text`` and collapse every whitespace run to a single space.
    Args:
        text (str): The raw mention.
    Returns:
        str: The normalized text without leading or trailing whitespace.
    """
    # digit masking (replacing digit runs with '#') is currently disabled
    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()
 def generate_random_shuffles(text, n):
    """
    Produce ``n`` copies of ``text`` with its words in random order.
    Args:
        text (str): The input text; words are separated by whitespace.
        n (int): How many shuffled variations to build.
    Returns:
        list: ``n`` strings, each holding the words of ``text`` re-ordered
            with ``random.shuffle`` and joined by single spaces.
    """
    tokens = text.split()
    def _one_variation():
        # shuffle a private copy so every variation starts from the original order
        candidate = list(tokens)
        random.shuffle(candidate)
        return " ".join(candidate)
    return [_one_variation() for _ in range(n)]
 # generate n more shuffled examples
 def shuffle_text(text, n_shuffles=SHUFFLES):
    """
    Return ``text`` followed by ``n_shuffles`` word-shuffled variations of it.
    Args:
        text (str): The input string.
        n_shuffles (int): Number of random shuffles to generate.
    Returns:
        list: The original string first, then the output of
            generate_random_shuffles(text, n_shuffles).
    """
    return [text, *generate_random_shuffles(text, n_shuffles)]
 ######################################
 # augmentation by text corruption
 def corrupt_word(word):
    """
    Apply one random character-level corruption to ``word``.
    Words of length 0 or 1 are returned unchanged. Otherwise either one
    randomly chosen character is deleted or two adjacent characters are
    swapped, each operation being picked with ``random.choice``.
    Args:
        word (str): The word to corrupt.
    Returns:
        str: The corrupted word.
    """
    size = len(word)
    if size <= 1:
        return word
    operation = random.choice(["delete", "swap"])
    if operation == "delete":
        # drop the character at a random position
        cut = random.randint(0, size - 1)
        return word[:cut] + word[cut + 1:]
    # the only other operation is "swap": exchange two neighbouring characters
    left = random.randint(0, size - 2)
    return word[:left] + word[left + 1] + word[left] + word[left + 2:]
 def corrupt_string(sentence, corruption_probability=0.01):
    """
    Corrupt the words of ``sentence`` independently with a given probability.
    Args:
        sentence (str): Whitespace-separated text.
        corruption_probability (float): Chance that a word is passed through
            corrupt_word.
    Returns:
        str: The words, corrupted or not, joined by single spaces.
    """
    pieces = []
    for token in sentence.split():
        # one random draw per word decides whether it gets corrupted
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        pieces.append(token)
    return " ".join(pieces)
 #############################################################
 # Data Run code here
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """
    Convert the training dataframe into a list of {'text', 'label'} records.
    Every row contributes its normalized mention once. Word-shuffled variants
    from shuffle_text (SHUFFLES per mention) are added as well whenever they
    differ from the normalized mention.
    Args:
        df: DataFrame with 'entity_id' and 'mention' columns.
    Returns:
        list: Dicts holding the text under 'text' and the class index, taken
            from the module-level ``label2id``, under 'label'.
    """
    records = []
    for _, row in df.iterrows():
        entity_id = row['entity_id']
        mention = row['mention']
        if isinstance(mention, float):
            # presumably NaN from an empty cell; log it and train on its string form
            print(mention)
            mention = f'{mention}'
        mention = preprocess_text(mention)
        class_index = label2id[entity_id]  # ensure labels starts from 0
        # unaugmented data
        records.append({'text': mention, 'label': class_index})
        # shuffled variants; shuffle_text lists the original first and the
        # filter below drops it together with any shuffle that changed nothing
        records.extend(
            {'text': variant, 'label': class_index}
            for variant in shuffle_text(mention, n_shuffles=SHUFFLES)
            if variant != mention
        )
        # other augmentations (oversampling short mentions, corrupt_string,
        # stripping non-alphanumerics) are not applied here
    return records
 def create_dataset():
    """
    Read the training CSV and package it as a DatasetDict with a 'train' split.
    Returns:
        DatasetDict: ``{'train': Dataset}`` built from process_df_to_dict.
    """
    csv_path = '../../biomedical_data_import/bc2gm_train.csv'
    frame = pd.read_csv(csv_path, skipinitialspace=True)
    train_split = Dataset.from_list(process_df_to_dict(frame))
    return DatasetDict({'train': train_split})
 # %%
 #########################################
 # training function
 def train():
    """
    Fine-tune a sequence classifier that maps a mention to its entity id.
    Builds the dataset with create_dataset, tokenizes it, and runs the
    Hugging Face Trainer for 40 epochs with evaluation switched off, using
    ``checkpoint`` as output directory and ``tensorboard-log`` for logs.
    """
    save_path = 'checkpoint'
    split_datasets = create_dataset()
    # prepare tokenizer
    model_checkpoint = "distilbert/distilbert-base-uncased"
    # alternatives kept for reference: 'google-bert/bert-base-cased', 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        return_tensors="pt",
        clean_up_tokenization_spaces=True,
    )
    def preprocess_function(example):
        """Tokenize a batch of texts; padding is deferred to the collator."""
        texts = example['text']
        return tokenizer(texts, truncation=True)  # enable truncation for efficiency
    # batched map: the function receives many rows of the dataset at a time
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",  # we only need the tokenization, not the original strings
    )
    # %%
    # the collator performs dynamic padding per batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # %%
    # accuracy metric; only consulted if evaluation is switched on
    metric = evaluate.load("accuracy")
    def compute_metrics(eval_preds):
        scores, references = eval_preds
        predicted = np.argmax(scores, axis=1)
        return metric.compute(predictions=predicted, references=references)
    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id,
    )
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))
    # %%
    # Trainer configuration
    trainer_settings = dict(
        output_dir=save_path,
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-3,
        per_device_train_batch_size=512,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    training_args = TrainingArguments(**trainer_settings)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # to resume instead, pass resume_from_checkpoint=<checkpoint path> to train()
    trainer.train()
 # execute training
 train()
 # %%
--- a/biomedical_train/ncbi/.gitignore
+++ b/biomedical_train/ncbi/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/biomedical_train/ncbi/prediction/.gitignore
+++ b/biomedical_train/ncbi/prediction/.gitignore
@ -0,0 +1 @@
 exports
--- a/biomedical_train/ncbi/prediction/output.txt
+++ b/biomedical_train/ncbi/prediction/output.txt
@ -0,0 +1 @@
--- a/biomedical_train/ncbi/prediction/predict.py
+++ b/biomedical_train/ncbi/prediction/predict.py
@ -0,0 +1,236 @@
 # %%
 # from datasets import load_from_disk
 import os
 import glob
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 256
 # %%
 # Rebuild the label vocabulary from the training CSV; train.py derives its
 # class indices the same way, so the two scripts agree on the mapping.
 # NOTE(review): this script lives under ncbi/ but reads the bc2gm training
 # file -- confirm that this is the intended dataset.
 data_path = '../../../biomedical_data_import/bc2gm_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 # unique entity ids in sorted order; the position in this list is the class index
 target_id_list = sorted(set(entity_ids))
 # %%
 # forward (class index -> entity id) and inverse (entity id -> class index) lookups
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: position for position, entity in id2label.items()}
 # introduce pre-processing functions
 def preprocess_text(text):
    """
    Normalize a mention before tokenization.
    The text is lower-cased, every run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is removed.
    Args:
        text: The raw mention. Non-string values (for example the float NaN
            pandas yields for an empty CSV cell) are converted with ``str``
            first, mirroring how train.py stringifies float mentions.
    Returns:
        str: The normalized text.
    """
    if not isinstance(text, str):
        # calling .lower() on a float NaN would raise AttributeError and abort
        # the whole prediction run
        text = str(text)
    text = text.lower()
    # standardize spacing
    return re.sub(r'\s+', ' ', text).strip()
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """
    Turn a mention dataframe into a list of model-ready records.
    Each row yields ``{'text': <normalized mention>, 'labels': <class index>}``,
    the class index being looked up in the module-level ``label2id``.
    Args:
        df: DataFrame with 'mention' and 'entity_id' columns.
    Returns:
        list: One dict per row, in row order.
    Raises:
        KeyError: If a row's entity_id is not a key of ``label2id``.
    """
    return [
        {
            'text': preprocess_text(record['mention']),
            'labels': label2id[record['entity_id']],  # consecutive index, starts at 0
        }
        for _, record in df.iterrows()
    ]
 def create_dataset():
    """
    Load the test CSV and wrap it in a DatasetDict with a single 'test' split.
    Returns:
        DatasetDict: ``{'test': Dataset}`` built from process_df_to_dict.
    """
    # only the test split is needed for prediction
    test_csv = '../../../biomedical_data_import/bc2gm_test.csv'
    test_df = pd.read_csv(test_csv, skipinitialspace=True)
    test_split = Dataset.from_list(process_df_to_dict(test_df))
    return DatasetDict({'test': test_split})
 # %%
 def test():
    """
    Run the saved classifier over the test split and record its metrics.
    Loads the checkpoint found under ``../checkpoint``, tokenizes the test
    mentions, predicts one class per mention in batches of BATCH_SIZE, appends
    accuracy and weighted F1 / precision / recall to ``output.txt`` and writes
    the predicted entity ids to ``exports/result.csv``.
    Raises:
        FileNotFoundError: If no ``checkpoint-*`` entry exists under
            ``../checkpoint``.
    """
    test_dataset = create_dataset()
    # prepare tokenizer
    checkpoint_directory = '../checkpoint'
    # path is usually <checkpoint_directory>/checkpoint-<step number>;
    # training is expected to leave exactly one of them behind
    pattern = 'checkpoint-*'
    candidates = glob.glob(os.path.join(checkpoint_directory, pattern))
    if not candidates:
        # fail with an explicit message instead of a bare IndexError
        raise FileNotFoundError(
            f"no '{pattern}' found under '{checkpoint_directory}'; run training first"
        )
    model_checkpoint = candidates[0]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # %%
    # given a batch of dataset entries, run the texts through the tokenizer
    def preprocess_function(example):
        # padding is left to the data collator; only truncate here
        return tokenizer(
            example['text'],
            truncation=True,
        )
    # batched map: preprocess_function receives many rows at a time
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    # print datasets['test'] columns
    for column, dtype in datasets['test'].features.items():
        print(f"Column: {column}, Type: {dtype}")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    pred_labels = []
    actual_labels = []
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    dataloader = DataLoader(
        datasets['test'],
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=data_collator)
    for batch in tqdm(dataloader):
        # keep the gold labels as plain ints so they match pred_labels
        actual_labels.extend(batch['labels'].tolist())
        # Move to GPU if available
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask).logits
        pred_labels.extend(logits.argmax(dim=1).tolist())
    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    y_true = actual_labels
    y_pred = pred_labels
    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    with open("output.txt", "a") as f:
        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)
    # export result: map predicted class indices back to entity ids
    label_list = [id2label[class_id] for class_id in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })
    # exports/ is git-ignored, so it does not exist on a fresh checkout
    os.makedirs("exports", exist_ok=True)
    df.to_csv("exports/result.csv", index=False)
 # %%
 # Reset the results file, and close it *before* running test(): test()
 # re-opens output.txt in append mode, and calling it while this handle is
 # still open lets the buffered newline below be flushed at offset 0
 # afterwards, overwriting the first character test() wrote.
 with open("output.txt", "w") as f:
    print('', file=f)
 test()
--- a/biomedical_train/ncbi/train.py
+++ b/biomedical_train/ncbi/train.py
@ -0,0 +1,368 @@
 # %%
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 torch.set_float32_matmul_precision('high')
 # %%
 def set_seed(seed):
    """
    Seed the Python, NumPy and PyTorch random number generators.
    Args:
        seed (int): Value handed to every seeding function below.
    """
    seeders = (
        random.seed,                 # Python random module
        np.random.seed,              # NumPy random
        torch.manual_seed,           # PyTorch CPU
        torch.cuda.manual_seed,      # PyTorch GPU
        torch.cuda.manual_seed_all,  # if using multiple GPUs
    )
    for seeder in seeders:
        seeder(seed)
    # request deterministic cuDNN behavior and disable its benchmark mode
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
 set_seed(42)
 # number of word-shuffled copies shuffle_text adds per mention; 0 adds none
 SHUFFLES = 0
 # %%
 # Map every entity_id of the training file to a consecutive class index.
 # NOTE(review): this script lives under ncbi/ but reads the bc2gm training
 # file -- confirm that this is the intended dataset.
 data_path = '../../biomedical_data_import/bc2gm_train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 # labels come from the actual entity ids found in the data
 entity_ids = train_df['entity_id'].to_list()
 target_id_list = sorted(set(entity_ids))
 # %%
 # forward (class index -> entity id) and inverse (entity id -> class index) lookups
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: position for position, entity in id2label.items()}
 # %%
 # introduce pre-processing functions
 def preprocess_text(text):
    """
    Lower-case ``text`` and collapse every whitespace run to a single space.
    Args:
        text (str): The raw mention.
    Returns:
        str: The normalized text without leading or trailing whitespace.
    """
    # digit masking (replacing digit runs with '#') is currently disabled
    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()
 def generate_random_shuffles(text, n):
    """
    Produce ``n`` copies of ``text`` with its words in random order.
    Args:
        text (str): The input text; words are separated by whitespace.
        n (int): How many shuffled variations to build.
    Returns:
        list: ``n`` strings, each holding the words of ``text`` re-ordered
            with ``random.shuffle`` and joined by single spaces.
    """
    tokens = text.split()
    def _one_variation():
        # shuffle a private copy so every variation starts from the original order
        candidate = list(tokens)
        random.shuffle(candidate)
        return " ".join(candidate)
    return [_one_variation() for _ in range(n)]
 # generate n more shuffled examples
 def shuffle_text(text, n_shuffles=SHUFFLES):
    """
    Return ``text`` followed by ``n_shuffles`` word-shuffled variations of it.
    Args:
        text (str): The input string.
        n_shuffles (int): Number of random shuffles to generate.
    Returns:
        list: The original string first, then the output of
            generate_random_shuffles(text, n_shuffles).
    """
    return [text, *generate_random_shuffles(text, n_shuffles)]
 ######################################
 # augmentation by text corruption
 def corrupt_word(word):
    """
    Apply one random character-level corruption to ``word``.
    Words of length 0 or 1 are returned unchanged. Otherwise either one
    randomly chosen character is deleted or two adjacent characters are
    swapped, each operation being picked with ``random.choice``.
    Args:
        word (str): The word to corrupt.
    Returns:
        str: The corrupted word.
    """
    size = len(word)
    if size <= 1:
        return word
    operation = random.choice(["delete", "swap"])
    if operation == "delete":
        # drop the character at a random position
        cut = random.randint(0, size - 1)
        return word[:cut] + word[cut + 1:]
    # the only other operation is "swap": exchange two neighbouring characters
    left = random.randint(0, size - 2)
    return word[:left] + word[left + 1] + word[left] + word[left + 2:]
 def corrupt_string(sentence, corruption_probability=0.01):
    """
    Corrupt the words of ``sentence`` independently with a given probability.
    Args:
        sentence (str): Whitespace-separated text.
        corruption_probability (float): Chance that a word is passed through
            corrupt_word.
    Returns:
        str: The words, corrupted or not, joined by single spaces.
    """
    pieces = []
    for token in sentence.split():
        # one random draw per word decides whether it gets corrupted
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        pieces.append(token)
    return " ".join(pieces)
 #############################################################
 # Data Run code here
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """
    Convert the training dataframe into a list of {'text', 'label'} records.
    Every row contributes its normalized mention once. Word-shuffled variants
    from shuffle_text (SHUFFLES per mention) are added as well whenever they
    differ from the normalized mention.
    Args:
        df: DataFrame with 'entity_id' and 'mention' columns.
    Returns:
        list: Dicts holding the text under 'text' and the class index, taken
            from the module-level ``label2id``, under 'label'.
    """
    records = []
    for _, row in df.iterrows():
        entity_id = row['entity_id']
        mention = row['mention']
        if isinstance(mention, float):
            # presumably NaN from an empty cell; log it and train on its string form
            print(mention)
            mention = f'{mention}'
        mention = preprocess_text(mention)
        class_index = label2id[entity_id]  # ensure labels starts from 0
        # unaugmented data
        records.append({'text': mention, 'label': class_index})
        # shuffled variants; shuffle_text lists the original first and the
        # filter below drops it together with any shuffle that changed nothing
        records.extend(
            {'text': variant, 'label': class_index}
            for variant in shuffle_text(mention, n_shuffles=SHUFFLES)
            if variant != mention
        )
        # other augmentations (oversampling short mentions, corrupt_string,
        # stripping non-alphanumerics) are not applied here
    return records
 def create_dataset():
    """
    Read the training CSV and package it as a DatasetDict with a 'train' split.
    Returns:
        DatasetDict: ``{'train': Dataset}`` built from process_df_to_dict.
    """
    csv_path = '../../biomedical_data_import/bc2gm_train.csv'
    frame = pd.read_csv(csv_path, skipinitialspace=True)
    train_split = Dataset.from_list(process_df_to_dict(frame))
    return DatasetDict({'train': train_split})
 # %%
 #########################################
 # training function
 def train():
    """
    Fine-tune a sequence classifier that maps a mention to its entity id.
    Builds the dataset with create_dataset, tokenizes it, and runs the
    Hugging Face Trainer for 40 epochs with evaluation switched off, using
    ``checkpoint`` as output directory and ``tensorboard-log`` for logs.
    """
    save_path = 'checkpoint'
    split_datasets = create_dataset()
    # prepare tokenizer
    model_checkpoint = "distilbert/distilbert-base-uncased"
    # alternatives kept for reference: 'google-bert/bert-base-cased', 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        return_tensors="pt",
        clean_up_tokenization_spaces=True,
    )
    def preprocess_function(example):
        """Tokenize a batch of texts; padding is deferred to the collator."""
        texts = example['text']
        return tokenizer(texts, truncation=True)  # enable truncation for efficiency
    # batched map: the function receives many rows of the dataset at a time
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",  # we only need the tokenization, not the original strings
    )
    # %%
    # the collator performs dynamic padding per batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # %%
    # accuracy metric; only consulted if evaluation is switched on
    metric = evaluate.load("accuracy")
    def compute_metrics(eval_preds):
        scores, references = eval_preds
        predicted = np.argmax(scores, axis=1)
        return metric.compute(predictions=predicted, references=references)
    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id,
    )
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))
    # %%
    # Trainer configuration
    trainer_settings = dict(
        output_dir=save_path,
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-3,
        per_device_train_batch_size=512,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    training_args = TrainingArguments(**trainer_settings)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # to resume instead, pass resume_from_checkpoint=<checkpoint path> to train()
    trainer.train()
 # execute training
 train()
 # %%
--- a/esAppMod_train/augmentation/.gitignore
+++ b/esAppMod_train/augmentation/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/esAppMod_train/augmentation/dynamic_train.py
+++ b/esAppMod_train/augmentation/dynamic_train.py
@ -0,0 +1,388 @@
 # %%
 from torch.utils.data import Dataset, DataLoader
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments,
    TrainerCallback
 )
 import evaluate
 import numpy as np
 import pandas as pd
 import math
 from functools import partial
 import warnings
 warnings.filterwarnings("ignore", message='Was asked to gather along dimension 0')
 warnings.filterwarnings("ignore", message='FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated.')
 # import matplotlib.pyplot as plt
 torch.set_float32_matmul_precision('high')
 def set_seed(seed):
    """Seed every random number generator this script relies on.

    The same value is handed, in order, to Python's ``random``, NumPy and
    PyTorch (CPU, current CUDA device, all CUDA devices). cuDNN is then put
    in deterministic mode with benchmarking switched off.
    """
    seeders = (
        random.seed,                  # Python random module
        np.random.seed,               # NumPy random
        torch.manual_seed,            # PyTorch CPU
        torch.cuda.manual_seed,       # PyTorch GPU
        torch.cuda.manual_seed_all,   # all GPUs
    )
    for apply_seed in seeders:
        apply_seed(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
 # Fix all RNG seeds before any sampling or augmentation runs.
 set_seed(42)
 # %%
 # PARAMETERS
 SAMPLES = 20         # rows drawn per class on every re-sampling pass
 SHUFFLES = 5         # word-order shuffles generated per mention
 AMPLIFY_FACTOR = 5   # extra copies appended for mentions of at most two words
 # %%
 ###################################################
 # Load the training CSV and derive the label vocabulary.
 data_path = '../../esAppMod_data_import/train.csv'
 df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = df['entity_id'].to_list()
 # Sorted distinct entity ids; the position in this list is the class index.
 target_id_list = sorted(set(entity_ids))
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: position for position, entity in id2label.items()}
 # Contiguous 0-based class index used as the training target.
 df["training_id"] = df["entity_id"].map(label2id)
 # %%
 ##############################################################
 # augmentation code
 # basic preprocessing
 def preprocess_text(text):
    """Normalise a mention: lowercase it and collapse whitespace.

    Every run of whitespace becomes a single space and both ends are
    trimmed.
    """
    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()
 def generate_random_shuffles(text, n):
    """Return ``n`` strings, each a random re-ordering of the words in ``text``.

    Words are the whitespace-separated tokens of ``text``; every variation
    is joined back with single spaces.
    """
    tokens = text.split()
    def one_shuffle():
        # shuffle a private copy so the token list itself is never reordered
        candidate = list(tokens)
        random.shuffle(candidate)
        return " ".join(candidate)
    return [one_shuffle() for _ in range(n)]
 def shuffle_text(text, n_shuffles=SHUFFLES):
    """Return ``text`` followed by ``n_shuffles`` random word-order variations.

    The untouched input is always the first element of the returned list.
    """
    return [text, *generate_random_shuffles(text, n_shuffles)]
 def corrupt_word(word):
    """Introduce one random typo into ``word``.

    Either a single character is dropped or two neighbouring characters
    trade places, chosen at random. Words of length 0 or 1 are returned
    unchanged.
    """
    size = len(word)
    if size <= 1:
        return word
    mode = random.choice(["delete", "swap"])
    if mode == "delete":
        cut = random.randint(0, size - 1)
        return word[:cut] + word[cut + 1:]
    # mode == "swap": exchange the characters at positions left and left + 1
    left = random.randint(0, size - 2)
    chars = list(word)
    chars[left], chars[left + 1] = chars[left + 1], chars[left]
    return "".join(chars)
 def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each word of ``sentence`` independently with the given probability.

    The sentence is split on whitespace; each word is passed through
    ``corrupt_word`` when a fresh random draw falls below
    ``corruption_probability``. The words are joined back with single spaces.
    """
    pieces = []
    for token in sentence.split():
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        pieces.append(token)
    return " ".join(pieces)
 # %%
 def create_example(index, mention):
    """Bundle a class index and a mention string into one training record."""
    return dict(training_id=index, mention=mention)
 # augment whole dataset
 def augment_data(df):
    """Expand every row of ``df`` into several augmented training examples.

    Reads the ``training_id`` and ``mention`` columns. For each row the
    following examples are emitted, all sharing the row's ``training_id``:

    * the preprocessed mention itself;
    * each of the ``SHUFFLES`` word-order shuffles that differs from it;
    * one character-corrupted variant (each word corrupted with
      probability 0.1), if it differs;
    * one variant with every non-word, non-space character replaced by a
      space, if it differs;
    * for mentions of at most two words, ``AMPLIFY_FACTOR`` extra copies of
      the preprocessed mention.

    Returns:
        pd.DataFrame: one row per example, as built by ``create_example``.
    """
    output_list = []
    for _, row in df.iterrows():
        index = row['training_id']
        parent_desc = preprocess_text(row['mention'])
        # add basic example
        output_list.append(create_example(index, parent_desc))
        # Candidate variants, generated in this order so the random stream is
        # consumed in a fixed sequence: shuffles, corruption, then stripping.
        candidates = list(shuffle_text(parent_desc, n_shuffles=SHUFFLES))
        candidates.append(corrupt_string(parent_desc, corruption_probability=0.1))
        candidates.append(re.sub(r'[^\w\s]', ' ', parent_desc))
        # keep only variants that actually differ from the clean mention
        output_list.extend(
            create_example(index, candidate)
            for candidate in candidates
            if candidate != parent_desc
        )
        # short sequence amplifier
        # short sequences are rare, and we must compensate by including more examples
        # also, short sequences don't usually get affected by shuffle
        # The clean mention is what gets repeated; previously the leftover
        # punctuation-stripped variant was amplified instead.
        if len(parent_desc.split()) <= 2:
            output_list.extend(
                create_example(index, parent_desc) for _ in range(AMPLIFY_FACTOR)
            )
    return pd.DataFrame(output_list)
 ###############################################################
 # regeneration code
 # %%
 # we want to sample n samples from each class
 # sample_size refers to the number of samples per class
 def sample_from_df(df, sample_size_per_class=5):
    """Draw up to ``sample_size_per_class`` random rows for each ``training_id``.

    Classes with fewer rows than requested contribute all of their rows.
    The ``training_id`` and ``mention`` columns are selected explicitly and
    the result is re-indexed from 0.
    """
    def draw(group):
        quota = min(sample_size_per_class, len(group))
        return group.sample(n=quota)
    per_class = df.groupby("training_id")[['training_id', 'mention']]  # explicit column names
    sampled_df = per_class.apply(draw)
    return sampled_df.reset_index(drop=True)
 # %%
 class DynamicDataset(Dataset):
    """Map-style dataset whose contents can be re-sampled and re-augmented on demand.

    ``regenerate_data`` replaces the held examples with a fresh sample of the
    source DataFrame, augmented and tokenized. ``__len__`` and
    ``__getitem__`` always serve the most recently generated examples.
    """
    def __init__(self, df, sample_size_per_class, tokenizer):
        """
        Args:
            df (pd.DataFrame): Original DataFrame with class (id) and data columns.
            sample_size_per_class (int): Number of samples to draw per class for each epoch.
            tokenizer: Callable applied to the list of mention strings with
                ``truncation=True``; its result must provide ``"input_ids"``
                and ``"attention_mask"`` sequences, one entry per mention.
        """
        self.df = df
        self.sample_size_per_class = sample_size_per_class
        self.tokenizer = tokenizer
        self.current_data = None
        self.regenerate_data()  # Generate the initial dataset
    def regenerate_data(self):
        """
        Generate a new sampled dataset for the current epoch.

        Each call re-samples ``sample_size_per_class`` rows per class, runs
        the augmentation over that sample, tokenizes the result and replaces
        ``current_data``. Calling it at the start of every epoch therefore
        yields both a new sample and new augmentations.
        """
        # Sample `sample_size_per_class` rows per class
        sampled_df = sample_from_df(self.df, self.sample_size_per_class)
        # perform future edits here
        sampled_df = augment_data(sampled_df)
        # Batch tokenize the entire column of data in one call.
        # return_tensors="pt" is not used because the sequences have unequal lengths.
        tokenized_batch = self.tokenizer(
            sampled_df["mention"].to_list(),
            truncation=True,
        )
        # Pull the label column out once rather than building a row Series
        # per example with .iloc[i].
        labels = sampled_df["training_id"].to_list()
        # Store the tokenized data with labels
        self.current_data = [
            {
                "input_ids": torch.tensor(input_ids),
                "attention_mask": torch.tensor(attention_mask),
                "labels": torch.tensor(label),
            }
            for input_ids, attention_mask, label in zip(
                tokenized_batch["input_ids"],
                tokenized_batch["attention_mask"],
                labels,
            )
        ]
    def __len__(self):
        """Number of examples in the current sample."""
        return len(self.current_data)
    def __getitem__(self, idx):
        """Return the example stored at position ``idx`` of the current sample."""
        return self.current_data[idx]
 # %%
 class RegenerateDatasetCallback(TrainerCallback):
    """Trainer callback that rebuilds a dataset's examples when an epoch begins."""
    def __init__(self, dataset):
        # the dataset must expose a regenerate_data() method
        self.dataset = dataset
    def on_epoch_begin(self, args, state, control, **kwargs):
        """Announce the upcoming epoch and ask the dataset to regenerate itself."""
        epoch_number = int(math.ceil(state.epoch + 1))
        print(f"Epoch {epoch_number}: Regenerating dataset")
        self.dataset.regenerate_data()
 # %%
 def custom_collate_fn(batch):
    """Collate tokenized examples into one batch, right-padding with zeros.

    Each item of ``batch`` provides ``input_ids`` and ``attention_mask``
    tensors of its own length plus a ``labels`` tensor. Sequences are padded
    to the longest one in the batch; labels are stacked along a new first
    dimension.
    """
    stacked_labels = torch.stack([example["labels"] for example in batch])
    pad = torch.nn.utils.rnn.pad_sequence
    collated = {
        field: pad([example[field] for example in batch], batch_first=True)
        for field in ("input_ids", "attention_mask")
    }
    collated["labels"] = stacked_labels
    return collated
 ##########################################################################
 # training code
 # %%
 def train():
    """Fine-tune a sequence classifier on a dataset that is regenerated every epoch.

    Builds the tokenizer, a ``DynamicDataset`` over the module-level ``df``
    (without its ``entity_name`` column), the regeneration callback, the
    model and the ``Trainer``, then runs training. Checkpoints are written
    under ``checkpoint`` and logs under ``tensorboard-log``.
    """
    save_path = f'checkpoint'
    # prepare tokenizer
    model_checkpoint = "distilbert/distilbert-base-uncased"
    # model_checkpoint = 'google-bert/bert-base-cased'
    # model_checkpoint = 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, clean_up_tokenization_spaces=True)
    # make the dataset
    # entity_name is dropped here; sampling only reads training_id and mention
    lean_df = df.drop(columns=['entity_name'])
    # the constructor already generates the first sample
    dynamic_dataset = DynamicDataset(df = lean_df, sample_size_per_class=SAMPLES, tokenizer=tokenizer)
    # create the regeneration callback (re-samples and re-augments on every epoch begin)
    regeneration_callback = RegenerateDatasetCallback(dynamic_dataset)
    # compute metrics
    # NOTE(review): with eval_strategy="no" below this is presumably never invoked — confirm
    metric = evaluate.load("accuracy")
    def compute_metrics(eval_preds):
        # eval_preds unpacks into (scores, labels); argmax over axis 1 picks the predicted class
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)
    # %%
    # one output class per distinct entity id in the training file
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    model.resize_token_embeddings(len(tokenizer))
    # model = torch.compile(model, backend="inductor", dynamic=True)
    # %%
    # Trainer
    # no evaluation during training; a checkpoint every 500 steps, only one kept
    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        save_strategy="steps",
        save_steps=500,
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        # per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=120,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    # custom_collate_fn pads each batch to its longest sequence
    trainer = Trainer(
        model,
        training_args,
        train_dataset=dynamic_dataset,
        tokenizer=tokenizer,
        data_collator=custom_collate_fn,
        compute_metrics=compute_metrics,
        callbacks=[regeneration_callback]
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)
    trainer.train()
 # execute training (runs as soon as this script/cell is executed)
 train()
 # %%
--- a/esAppMod_train/augmentation/prediction/.gitignore
+++ b/esAppMod_train/augmentation/prediction/.gitignore
@ -0,0 +1 @@
 exports
--- a/esAppMod_train/augmentation/prediction/output.txt
+++ b/esAppMod_train/augmentation/prediction/output.txt
@ -0,0 +1,6 @@
 *******************************************************************************
 Accuracy: 0.76958
 F1 Score: 0.79382
 Precision: 0.88705
 Recall: 0.76958
--- a/train/class_bert_augmentation/prediction/predict.py
+++ b/train/class_bert_augmentation/prediction/predict.py
--- a/esAppMod_train/class_bert_augmentation/.gitignore
+++ b/esAppMod_train/class_bert_augmentation/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/esAppMod_train/class_bert_augmentation/prediction/.gitignore
+++ b/esAppMod_train/class_bert_augmentation/prediction/.gitignore
@ -0,0 +1 @@
 exports
--- a/esAppMod_train/class_bert_augmentation/prediction/output.txt
+++ b/esAppMod_train/class_bert_augmentation/prediction/output.txt
@ -0,0 +1,6 @@
 *******************************************************************************
 Accuracy: 0.80689
 F1 Score: 0.82527
 Precision: 0.89684
 Recall: 0.80689
--- a/esAppMod_train/class_bert_augmentation/prediction/predict.py
+++ b/esAppMod_train/class_bert_augmentation/prediction/predict.py
@ -0,0 +1,264 @@
 # %%
 # from datasets import load_from_disk
 import os
 import glob
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 256
 # %%
 # Construct the target id list from the training split, so that class
 # indices follow the same sorted order of distinct entity ids.
 data_path = '../../../esAppMod_data_import/train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 # sorted distinct entity ids; list position == class index
 target_id_list = sorted(set(entity_ids))
 # %%
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: position for position, entity in id2label.items()}
 # introduce pre-processing functions
 def preprocess_text(text):
    """Normalise a mention: lowercase it and collapse whitespace.

    Every run of whitespace becomes a single space and both ends are
    trimmed. Digits are left untouched.
    """
    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()
 # Converts a dataframe into a list of dictionaries.
 # Each dictionary maps one input to its output:
 # input: the preprocessed `mention` text
 # output: the class label index
 def process_df_to_dict(df):
    """Turn each row of ``df`` into a ``{'text', 'label'}`` record.

    ``text`` is the preprocessed ``mention``; ``label`` is the class index
    obtained by looking up ``entity_id`` in ``label2id``.
    """
    return [
        {
            'text': preprocess_text(record['mention']),
            'label': label2id[record['entity_id']],  # ensure labels start from 0
        }
        for _, record in df.iterrows()
    ]
 def create_dataset():
    """Load the test CSV and wrap its processed rows in a ``Dataset``."""
    test_path = '../../../esAppMod_data_import/test.csv'
    test_frame = pd.read_csv(test_path, skipinitialspace=True)
    records = process_df_to_dict(test_frame)
    return Dataset.from_list(records)
 # %%
 def test():
    """Evaluate the saved classifier on the test split and export its predictions.

    Steps: build the test dataset, load tokenizer and model from the
    checkpoint found under ``../checkpoint``, tokenize every sample to a
    fixed length of 128, run batched inference, append accuracy and weighted
    F1 / precision / recall to ``output.txt``, and write the predicted
    entity ids to ``exports/result.csv``.

    Raises:
        FileNotFoundError: if no ``checkpoint-*`` entry exists under
            ``../checkpoint``.
    """
    test_dataset = create_dataset()
    # prepare tokenizer
    checkpoint_directory = '../checkpoint'
    # path is usually <checkpoint dir>/checkpoint-<step number>
    # training is expected to leave a single checkpoint, so the first match is used
    pattern = 'checkpoint-*'
    checkpoint_matches = glob.glob(os.path.join(checkpoint_directory, pattern))
    if not checkpoint_matches:
        # fail with a readable message instead of an IndexError on [0]
        raise FileNotFoundError(
            f"no '{pattern}' entry found under '{checkpoint_directory}'"
        )
    model_checkpoint = checkpoint_matches[0]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # %%
    # diagnostic only: report the longest tokenized test sample
    longest = 0
    for sample in test_dataset['text']:
        token_count = len(tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"])
        longest = max(longest, token_count)
    print(longest)
    # %%
    max_length = 128
    # given a batch of dataset entries, run the texts through the tokenizer
    def preprocess_function(example):
        # truncation keeps every row at exactly max_length tokens; without it
        # a longer sample would produce a ragged batch that cannot be stacked
        return tokenizer(
            example['text'],
            max_length=max_length,
            truncation=True,
            padding='max_length',
        )
    # map applies the function to batches of rows of the dataset
    tokenized_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    pred_labels = []
    actual_labels = []
    dataloader = DataLoader(tokenized_dataset, batch_size=BATCH_SIZE, shuffle=False)
    # Inference in batches; both label lists are kept as plain Python ints
    with torch.no_grad():
        for batch in tqdm(dataloader):
            actual_labels.extend(batch['label'].tolist())
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            pred_labels.extend(logits.argmax(dim=1).tolist())
    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    y_true = actual_labels
    y_pred = pred_labels
    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    with open("output.txt", "a") as f:
        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)
    # export result: map class indices back to the original entity ids
    label_list = [id2label[class_index] for class_index in pred_labels]
    df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })
    # the exports directory is git-ignored, so it may not exist on a fresh checkout
    os.makedirs("exports", exist_ok=True)
    df.to_csv("exports/result.csv", index=False)
 # %%
 # reset file before writing to it
 with open("output.txt", "w") as f:
    print('', file=f)
 # Run only after the handle above has been closed: test() re-opens
 # output.txt in append mode, and a newline still buffered in the "w" handle
 # would be flushed at offset 0 afterwards, overwriting the first byte of the
 # appended report.
 test()
--- a/esAppMod_train/class_bert_augmentation/train.py
+++ b/esAppMod_train/class_bert_augmentation/train.py
@ -0,0 +1,558 @@
 # %%
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 torch.set_float32_matmul_precision('high')
 # %%
 def set_seed(seed):
    """Seed every random number generator this script relies on.

    The same value is handed, in order, to Python's ``random``, NumPy and
    PyTorch (CPU, current CUDA device, all CUDA devices). cuDNN is then put
    in deterministic mode with benchmarking switched off.
    """
    seeders = (
        random.seed,                  # Python random module
        np.random.seed,               # NumPy random
        torch.manual_seed,            # PyTorch CPU
        torch.cuda.manual_seed,       # PyTorch GPU
        torch.cuda.manual_seed_all,   # all GPUs
    )
    for apply_seed in seeders:
        apply_seed(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
 # Fix all RNG seeds before any augmentation runs.
 set_seed(42)
 SHUFFLES = 5   # word-order shuffles generated per text
 # %%
 # Load the training CSV and derive the label vocabulary.
 data_path = '../../esAppMod_data_import/train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 # sorted distinct entity ids; list position == class index
 target_id_list = sorted(set(entity_ids))
 # %%
 id2label = dict(enumerate(target_id_list))
 label2id = {entity: position for position, entity in id2label.items()}
 # %%
 # introduce pre-processing functions
 def preprocess_text(text):
    """Normalise a mention: lowercase it and collapse whitespace.

    Every run of whitespace becomes a single space and both ends are
    trimmed. Digits are left untouched.
    """
    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()
 def generate_random_shuffles(text, n):
    """
    Produce n randomly word-shuffled versions of the input text.
    Args:
        text (str): Text whose whitespace-separated words are shuffled.
        n (int): How many shuffled versions to produce.
    Returns:
        list: n strings, each holding the words of ``text`` in a random
        order, joined by single spaces.
    """
    tokens = text.split()
    def one_shuffle():
        # shuffle a private copy so the token list itself is never reordered
        candidate = list(tokens)
        random.shuffle(candidate)
        return " ".join(candidate)
    return [one_shuffle() for _ in range(n)]
 # generate n more shuffled examples
 def shuffle_text(text, n_shuffles=SHUFFLES):
    """
    Return the input string together with random word-order shuffles of it.
    Args:
        text (str): A single input string.
        n_shuffles (int): Number of random shuffles to generate.
    Returns:
        list: ``text`` itself followed by ``n_shuffles`` shuffled strings.
    """
    return [text, *generate_random_shuffles(text, n_shuffles)]
 # Abbreviation -> expanded term. Keys and values are plain lowercase text;
 # they are regex-escaped when the lookup patterns are built further down.
 # NOTE(review): 'xlst' looks like a transposition of 'xslt' — confirm against
 # the data before changing it.
 acronym_mapping = {
    'hpsa': 'hp server automation',
    'tam': 'tivoli access manager',
    'adf': 'application development facility',
    'html': 'hypertext markup language',
    'wff': 'microsoft web farm framework',
    'jsp': 'javaserver pages',
    'bw': 'business works',
    'ssrs': 'sql server reporting services',
    'cl': 'control language',
    'vba': 'visual basic for applications',
    'esapi': 'enterprise security api',
    'gwt': 'google web toolkit',
    'pki': 'perkin elmer informatics',
    'rtd': 'oracle realtime decisions',
    'jms': 'java message service',
    'db': 'database',
    'soa': 'service oriented architecture',
    'xsl': 'extensible stylesheet language',
    'com': 'component object model',
    'ldap': 'lightweight directory access protocol',
    'odm': 'ibm operational decision manager',
    'soql': 'salesforce object query language',
    'oms': 'order management system',
    'cfml': 'coldfusion markup language',
    'nas': 'netscape application server',
    'sql': 'structured query language',
    'bde': 'borland database engine',
    'imap': 'internet message access protocol',
    'uws': 'ultidev web server',
    'birt': 'business intelligence and reporting tools',
    'mdw': 'model driven workflow',
    'tws': 'tivoli workload scheduler',
    'jre': 'java runtime environment',
    'wcs': 'websphere commerce suite',
    'was': 'websphere application server',
    'ssis': 'sql server integration services',
    'xhtml': 'extensible hypertext markup language',
    'soap': 'simple object access protocol',
    'san': 'storage area network',
    'elk': 'elastic stack',
    'arr': 'application request routing',
    'xlst': 'extensible stylesheet language transformations',
    'sccm': 'microsoft endpoint configuration manager',
    'ejb': 'enterprise java beans',
    'css': 'cascading style sheets',
    'hpoo': 'hp operations orchestration',
    'xml': 'extensible markup language',
    'esb': 'enterprise service bus',
    'edi': 'electronic data interchange',
    'imsva': 'interscan messaging security virtual appliance',
    'wtx': 'ibm websphere transformation extender',
    'cgi': 'common gateway interface',
    'bal': 'ibm basic assembly language',
    'issow': 'integrated safe system of work',
    'dcl': 'data control language',
    'jdom': 'java document object model',
    'fim': 'microsoft forefront identity manager',
    'npl': 'niakwa programming language',
    'wf': 'windows workflow foundation',
    'lm': 'etap license manager',
    'wts': 'windows terminal server',
    'asp': 'active server pages',
    'jil': 'job information language',
    'mvc': 'model view controller',
    'rmi': 'remote method invocation',
    'ad': 'active directory',
    'owb': 'oracle warehouse builder',
    'rest': 'representational state transfer',
    'jdk': 'java development kit',
    'ids': 'integrated data store',
    'bms': 'batch management software',
    'vsx': 'vmware solution exchange',
    'ssas': 'sql server analysis services',
    'atl': 'atlas transformation language',
    'ice': 'infobright community edition',
    'esql': 'extended structured query language',
    'corba': 'common object request broker architecture',
    'dpe': 'device provisioning engines',
    'rac': 'oracle real application clusters',
    'iemt': 'iis easy migration tool',
    'mes': 'manufacturing execution system',
    'odbc': 'open database connectivity',
    'lms': 'lan management solution',
    'wcf': 'windows communication foundation',
    'nes': 'netscape enterprise server',
    'jsf': 'javaserver faces',
    'alm': 'application lifecycle management',
    'hlasm': 'high level assembler',
    'cmod': 'content manager ondemand',
 }
 # Additional abbreviations; merged into acronym_mapping below.
 external_source = {
    'vb.net': 'visual basic dot net',
    'jes': 'job entry subsystem',
    'svn': 'subversion',
    'vcs': 'version control system',
    'lims': 'laboratory information management system',
    'ide': 'integrated development environment',
    'sdk': 'software development kit',
    'mq': 'message queue',
    'ims': 'information management system',
    'isa': 'internet security and acceleration',
    'vs': 'visual studio',
    'esr': 'extended support release',
    'ff': 'firefox',
    'vb': 'visual basic',
    'rhel': 'red hat enterprise linux',
    'iis': 'internet information server',
    'api': 'application programming interface',
    'se': 'standard edition',
    '.net': 'dot net',
    'c#': 'c sharp',
 }
 # synonyms = {
 #  'windows server': 'windows nt',
 #  'windows 7': 'windows desktop',
 #  'windows 8': 'windows desktop',
 #  'windows 10': 'windows desktop'
 # }
 # add more information
 acronym_mapping.update(external_source)
 def _whole_token_pattern(literal):
    """Return regex source matching ``literal`` only as a whole token.

    The literal is escaped, so characters such as '.' and '#' match
    themselves. Lookarounds replace ``\\b`` so that entries starting or ending
    with a non-word character ('.net', 'c#') can still match.
    """
    return rf'(?<!\w){re.escape(literal)}(?!\w)'
 # regex source -> replacement text, in the insertion order of acronym_mapping
 abbrev_to_term = {_whole_token_pattern(key): value for key, value in acronym_mapping.items()}
 term_to_abbrev = {_whole_token_pattern(value): key for key, value in acronym_mapping.items()}
 def _apply_replacements(text, table):
    """Apply every ``pattern -> replacement`` pair of ``table`` to ``text`` in order."""
    for pattern, replacement in table.items():
        # a callable replacement is inserted verbatim (no backslash-template processing)
        text = re.sub(pattern, lambda _match, _value=replacement: _value, text)
    return text
 def replace_terms_with_abbreviations(text):
    """Replace each expanded term found in ``text`` with its abbreviation."""
    return _apply_replacements(text, term_to_abbrev)
 def replace_abbreviations_with_terms(text):
    """Replace each abbreviation found in ``text`` with its expanded term."""
    return _apply_replacements(text, abbrev_to_term)
 ######################################
 # augmentation by text corruption
 def corrupt_word(word):
    """Introduce one random typo into ``word``.

    Either a single character is dropped or two neighbouring characters
    trade places, chosen at random. Words of length 0 or 1 are returned
    unchanged.
    """
    size = len(word)
    if size <= 1:
        return word
    mode = random.choice(["delete", "swap"])
    if mode == "delete":
        cut = random.randint(0, size - 1)
        return word[:cut] + word[cut + 1:]
    # mode == "swap": exchange the characters at positions left and left + 1
    left = random.randint(0, size - 2)
    chars = list(word)
    chars[left], chars[left + 1] = chars[left + 1], chars[left]
    return "".join(chars)
 def corrupt_string(sentence, corruption_probability=0.01):
    """Corrupt each word of ``sentence`` independently with the given probability.

    The sentence is split on whitespace; each word is passed through
    ``corrupt_word`` when a fresh random draw falls below
    ``corruption_probability``. The words are joined back with single spaces.
    """
    pieces = []
    for token in sentence.split():
        if random.random() < corruption_probability:
            token = corrupt_word(token)
        pieces.append(token)
    return " ".join(pieces)
 # Converts a dataframe into a list of dictionaries.
 # Each dictionary maps one input to its output:
 # input: a mention text (original or augmented)
 # output: the class label index
 # entity ids whose one-off, per-label examples have already been emitted
 label_flag_list = []
 def process_df_to_dict(df):
    """Build the augmented list of ``{'text', 'labels'}`` training examples.

    Reads the ``entity_id``, ``mention`` and ``entity_name`` columns. For
    every row, in this order:

    * the preprocessed mention;
    * 10 extra copies of it when it has fewer than three words;
    * the first time an entity id is seen (tracked in the module-level
      ``label_flag_list``): the raw ``entity_name``, the punctuation-stripped
      mention if it differs, and the entity name plus ``SHUFFLES`` shuffles
      of it (each only if different from the mention);
    * ``SHUFFLES`` shuffles of the mention, one corrupted variant and the
      punctuation-stripped variant, each only if different from the mention.

    Every example uses the key ``'labels'``. The short-mention copies used to
    be stored under ``'label'``, which left them without a value in the
    ``'labels'`` column once the list was turned into a dataset.

    Returns:
        list: dictionaries with keys ``'text'`` and ``'labels'``.
    """
    short_mention_copies = 10
    output_list = []
    def add(text, label):
        output_list.append({'text': text, 'labels': label})
    for _, row in df.iterrows():
        index = row['entity_id']
        label = label2id[index]  # ensure labels start from 0
        parent_desc = preprocess_text(row['mention'])
        # every non-word, non-space character replaced by a space
        stripped_desc = re.sub(r'[^\w\s]', ' ', parent_desc)
        # unaugmented data
        add(parent_desc, label)
        # short sequences are rare, and we must compensate by including more examples
        # mutation of other longer sequences might drown out rare short sequences
        if len(parent_desc.split()) < 3:
            for _ in range(short_mention_copies):
                add(parent_desc, label)
        # one-off examples, emitted the first time this label is seen
        if index not in label_flag_list:
            entity_name = row['entity_name']
            # add the "entity_name" label as a mention
            add(entity_name, label)
            if stripped_desc != parent_desc:
                add(stripped_desc, label)
            # add shuffles of the original entity name
            for desc in shuffle_text(entity_name, n_shuffles=SHUFFLES):
                if desc != parent_desc:
                    add(desc, label)
            label_flag_list.append(index)
        # add shuffled strings
        for desc in shuffle_text(parent_desc, n_shuffles=SHUFFLES):
            if desc != parent_desc:
                add(desc, label)
        # corrupt string
        desc = corrupt_string(parent_desc, corruption_probability=0.1)
        if desc != parent_desc:
            add(desc, label)
        # punctuation-stripped variant
        if stripped_desc != parent_desc:
            add(stripped_desc, label)
        # Abbreviation-based augmentation (replace_terms_with_abbreviations /
        # replace_abbreviations_with_terms) is currently disabled.
    return output_list
 def create_dataset():
    """Build the training ``DatasetDict`` from the esAppMod train CSV.

    The CSV is read with ``skipinitialspace=True``, expanded into a list of
    example dicts by ``process_df_to_dict`` and exposed as the ``'train'``
    split.
    """
    frame = pd.read_csv('../../esAppMod_data_import/train.csv', skipinitialspace=True)
    train_split = Dataset.from_list(process_df_to_dict(frame))
    return DatasetDict({'train': train_split})
 # %%
 def train():
    """Fine-tune a DistilBERT sequence classifier on the generated training set.

    Builds the dataset through ``create_dataset``, tokenizes its ``text``
    column, and runs a Hugging Face ``Trainer`` for 40 epochs with evaluation
    disabled. Checkpoints go to ``checkpoint`` and logs to ``tensorboard-log``.
    Reads the module-level ``target_id_list``, ``id2label`` and ``label2id``.
    """
    # directory handed to TrainingArguments as output_dir
    save_path = f'checkpoint'
    split_datasets = create_dataset()
    # prepare tokenizer
    model_checkpoint = "distilbert/distilbert-base-uncased"
    # alternative backbones that were tried:
    # model_checkpoint = 'google-bert/bert-base-cased'
    # model_checkpoint = 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # given a batch of dataset entries, run the text through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # only the text is tokenized here (truncated, not padded);
        # padding is left to the data collator created below
        model_inputs = tokenizer(
            input,
            truncation=True,
        )
        return model_inputs
    # map applies the function to the dataset in batches (batched=True)
    # using 8 worker processes, and drops the raw "text" column afterwards
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    # %% temp
    # tokenized_datasets['train'].rename_columns()
    # %%
    # create data collator (pads each batch using the tokenizer)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # %%
    # compute metrics: accuracy of the arg-max prediction
    # NOTE(review): eval_strategy is "no" and no eval_dataset is passed below,
    # so this is presumably never invoked during this run - confirm
    metric = evaluate.load("accuracy")
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)
    # %%
    # id2label and label2id are created at module level
    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # important after extending the tokenizer vocabulary; no tokens are added
    # in this function, so the size passed is the tokenizer's current length
    model.resize_token_embeddings(len(tokenizer))
    # model = torch.compile(model, backend="inductor", dynamic=True)
    # %%
    # Trainer
    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)
    trainer.train()
 # execute training
 # NOTE: there is no __main__ guard, so this runs whenever the module is
 # executed or imported
 train()
 # %%
--- a/esAppMod_train/class_bert_hierarchical/.gitignore
+++ b/esAppMod_train/class_bert_hierarchical/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/esAppMod_train/class_bert_hierarchical/prediction/.gitignore
+++ b/esAppMod_train/class_bert_hierarchical/prediction/.gitignore
@ -0,0 +1 @@
 exports
--- a/esAppMod_train/class_bert_hierarchical/prediction/output.txt
+++ b/esAppMod_train/class_bert_hierarchical/prediction/output.txt
--- a/esAppMod_train/class_bert_hierarchical/prediction/output_1.txt
+++ b/esAppMod_train/class_bert_hierarchical/prediction/output_1.txt
--- a/esAppMod_train/class_bert_hierarchical/prediction/output_2.txt
+++ b/esAppMod_train/class_bert_hierarchical/prediction/output_2.txt
--- a/esAppMod_train/class_bert_hierarchical/prediction/predict_1.py
+++ b/esAppMod_train/class_bert_hierarchical/prediction/predict_1.py
--- a/esAppMod_train/class_bert_hierarchical/prediction/predict_2.py
+++ b/esAppMod_train/class_bert_hierarchical/prediction/predict_2.py
--- a/esAppMod_train/class_bert_hierarchical/train_1.py
+++ b/esAppMod_train/class_bert_hierarchical/train_1.py
--- a/esAppMod_train/class_bert_hierarchical/train_2.py
+++ b/esAppMod_train/class_bert_hierarchical/train_2.py
--- a/esAppMod_train/class_bert_simple/.gitignore
+++ b/esAppMod_train/class_bert_simple/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/esAppMod_train/class_bert_simple/classification_prediction/.gitignore
+++ b/esAppMod_train/class_bert_simple/classification_prediction/.gitignore
@ -0,0 +1 @@
 exports
--- a/esAppMod_train/class_bert_simple/classification_prediction/output.txt
+++ b/esAppMod_train/class_bert_simple/classification_prediction/output.txt
--- a/esAppMod_train/class_bert_simple/classification_prediction/predict.py
+++ b/esAppMod_train/class_bert_simple/classification_prediction/predict.py
--- a/esAppMod_train/class_bert_simple/train.py
+++ b/esAppMod_train/class_bert_simple/train.py
--- a/esAppMod_train/golden_sample/.gitignore
+++ b/esAppMod_train/golden_sample/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/esAppMod_train/golden_sample/prediction/.gitignore
+++ b/esAppMod_train/golden_sample/prediction/.gitignore
@ -0,0 +1 @@
 exports
--- a/esAppMod_train/golden_sample/prediction/output.txt
+++ b/esAppMod_train/golden_sample/prediction/output.txt
@ -0,0 +1,6 @@
 *******************************************************************************
 Accuracy: 0.80689
 F1 Score: 0.82527
 Precision: 0.89684
 Recall: 0.80689
--- a/esAppMod_train/golden_sample/prediction/predict.py
+++ b/esAppMod_train/golden_sample/prediction/predict.py
@ -0,0 +1,264 @@
 # %%
 # from datasets import load_from_disk
 import os
 import glob
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 256
 # %%
 # construct the target id list from the training file, so the class indices
 # are derived from the entity ids that actually occur in train.csv
 data_path = '../../../esAppMod_data_import/train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 # unique entity ids in sorted order; position in this list is the class index
 target_id_list = sorted(set(entity_ids))
 # %%
 # class index -> entity id, and the inverse lookup
 id2label = dict(enumerate(target_id_list))
 label2id = {entity_id: class_idx for class_idx, entity_id in id2label.items()}
 # introduce pre-processing functions
 def preprocess_text(text):
    """Normalize a mention: lowercase it and collapse whitespace.

    Every run of whitespace becomes a single space and leading/trailing
    whitespace is removed.
    """
    lowered = text.lower()
    # standardize spacing
    single_spaced = re.sub(r'\s+', ' ', lowered)
    return single_spaced.strip()
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """Turn a dataframe of mentions into a list of classifier examples.

    Each row yields ``{'text': ..., 'label': ...}``: the text is the row's
    ``mention`` passed through ``preprocess_text`` and the label is the
    module-level ``label2id`` lookup of the row's ``entity_id``.
    """
    return [
        {
            'text': preprocess_text(record['mention']),
            'label': label2id[record['entity_id']],  # class indices start at 0
        }
        for _, record in df.iterrows()
    ]
 def create_dataset():
    """Load the esAppMod test CSV and return it as a ``Dataset`` of examples.

    The CSV is read with ``skipinitialspace=True`` and converted to example
    dicts by ``process_df_to_dict``.
    """
    frame = pd.read_csv('../../../esAppMod_data_import/test.csv', skipinitialspace=True)
    return Dataset.from_list(process_df_to_dict(frame))
 # %%
 def test():
    """Evaluate the saved checkpoint on the test set and export the predictions.

    Loads the first ``checkpoint-*`` directory found under ``../checkpoint``,
    tokenizes the test set to a fixed length of 128 tokens, runs batched
    inference, appends accuracy and weighted F1/precision/recall to
    ``output.txt`` and writes the predicted entity ids to
    ``exports/result.csv``.

    Raises:
        FileNotFoundError: if no ``checkpoint-*`` directory exists.
    """
    test_dataset = create_dataset()

    # locate the trained checkpoint
    # path is usually ../checkpoint/checkpoint-<step number>; training is
    # expected to leave a single checkpoint behind, so the first match is used
    checkpoint_directory = '../checkpoint'
    candidates = glob.glob(os.path.join(checkpoint_directory, 'checkpoint-*'))
    if not candidates:
        # fail with a clear message instead of an IndexError on an empty glob
        raise FileNotFoundError(
            f"no 'checkpoint-*' directory found under {checkpoint_directory}")
    model_checkpoint = candidates[0]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)

    # %%
    # report the longest untruncated test sample (informational only)
    longest = max(
        (
            len(tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"])
            for sample in test_dataset['text']
        ),
        default=0,
    )
    print(longest)

    # %%
    # fixed input length used for inference
    max_length = 128

    # given a batch of dataset entries, run the text through the tokenizer
    def preprocess_function(example):
        # truncation must accompany padding='max_length': without it a sample
        # longer than max_length keeps its full length and the fixed-size
        # batches built by the DataLoader can no longer be stacked
        return tokenizer(
            example['text'],
            max_length=max_length,
            truncation=True,
            padding='max_length',
        )

    # tokenize in batches with 8 worker processes and drop the raw text column
    tokenized_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # keep the embedding matrix in sync with the tokenizer vocabulary
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []
    dataloader = DataLoader(tokenized_dataset, batch_size=BATCH_SIZE, shuffle=False)
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # store plain ints so y_true and y_pred share one element type
            actual_labels.extend(batch['label'].tolist())
            # move the inputs to the model's device and run inference
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device)).logits
            pred_labels.extend(logits.argmax(dim=1).tolist())

    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    y_true = actual_labels
    y_pred = pred_labels

    # Compute metrics; per-class scores are weighted by class support and
    # classes without predictions score 0 instead of raising a warning
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output.txt", "a") as f:
        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # export result: map class indices back to the original entity ids
    label_list = [id2label[class_idx] for class_idx in pred_labels]
    result_df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })
    # "exports" is git-ignored, so it may not exist on a fresh checkout
    os.makedirs("exports", exist_ok=True)
    result_df.to_csv("exports/result.csv", index=False)
 # %%
 # reset file before writing to it
 # The truncating handle must be closed before test() runs: test() appends to
 # the same file, and a still-open "w" handle would flush its buffered newline
 # at offset 0 on close, overwriting the first byte test() wrote.
 with open("output.txt", "w") as f:
    print('', file=f)
 test()
--- a/train/class_bert_augmentation/train.py
+++ b/train/class_bert_augmentation/train.py
@ -45,7 +45,7 @@ def set_seed(seed):
 set_seed(42)
-SHUFFLES=10
+SHUFFLES=5
 # %%
@ -411,15 +411,15 @@ def process_df_to_dict(df):
        #     }
        #     output_list.append(element)
-        # augmentation
+        # # augmentation
-        # perform term to abbrev
+        # # perform term to abbrev
-        desc = replace_abbreviations_with_terms(parent_desc)
+        # desc = replace_abbreviations_with_terms(parent_desc)
-        if (desc != parent_desc):
+        # if (desc != parent_desc):
-            element = {
+        #     element = {
-                'text' : desc,
+        #         'text' : desc,
-                'label': label2id[index], # ensure labels starts from 0
+        #         'label': label2id[index], # ensure labels starts from 0
-            }
+        #     }
-            output_list.append(element)
+        #     output_list.append(element)
    return output_list
--- a/esAppMod_train/seq2seq_t5_simple/.gitignore
+++ b/esAppMod_train/seq2seq_t5_simple/.gitignore
--- a/esAppMod_train/seq2seq_t5_simple/prediction/.gitignore
+++ b/esAppMod_train/seq2seq_t5_simple/prediction/.gitignore
--- a/esAppMod_train/seq2seq_t5_simple/prediction/inference.py
+++ b/esAppMod_train/seq2seq_t5_simple/prediction/inference.py
--- a/esAppMod_train/seq2seq_t5_simple/prediction/output.txt
+++ b/esAppMod_train/seq2seq_t5_simple/prediction/output.txt
--- a/esAppMod_train/seq2seq_t5_simple/prediction/predict.py
+++ b/esAppMod_train/seq2seq_t5_simple/prediction/predict.py
--- a/esAppMod_train/seq2seq_t5_simple/train.py
+++ b/esAppMod_train/seq2seq_t5_simple/train.py
--- a/esAppMod_train/simple/.gitignore
+++ b/esAppMod_train/simple/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/esAppMod_train/simple/dynamic_train.py
+++ b/esAppMod_train/simple/dynamic_train.py
@ -0,0 +1,273 @@
 # %%
 from torch.utils.data import Dataset, DataLoader
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments,
    TrainerCallback
 )
 import evaluate
 import numpy as np
 import pandas as pd
 from functools import partial
 import warnings
 warnings.filterwarnings("ignore", message='Was asked to gather along dimension 0')
 warnings.filterwarnings("ignore", message='FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated.')
 # import matplotlib.pyplot as plt
 torch.set_float32_matmul_precision('high')
 def set_seed(seed):
    """
    Seed every random number generator used here so runs are repeatable.

    Covers Python's ``random``, NumPy and PyTorch (CPU and CUDA), and sets
    the cuDNN ``deterministic`` flag while turning ``benchmark`` off.
    """
    seeders = (
        random.seed,                 # Python random module
        np.random.seed,              # NumPy random
        torch.manual_seed,           # PyTorch CPU
        torch.cuda.manual_seed,      # PyTorch GPU
        torch.cuda.manual_seed_all,  # all GPUs when several are used
    )
    for apply_seed in seeders:
        apply_seed(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True  # ensure deterministic behavior
    cudnn.benchmark = False  # disable auto-tuning for reproducibility
 set_seed(42)
 # %%
 # PARAMETERS
 SAMPLES=20
 # %%
 ###################################################
 # import code
 # import training file
 data_path = '../../esAppMod_data_import/train.csv'
 df = pd.read_csv(data_path, skipinitialspace=True)
 # class indices are derived from the entity ids that occur in the training file
 entity_ids = df['entity_id'].to_list()
 # unique entity ids in sorted order; position in this list is the class index
 target_id_list = sorted(set(entity_ids))
 # class index -> entity id, and the inverse lookup
 id2label = dict(enumerate(target_id_list))
 label2id = {entity_id: class_idx for class_idx, entity_id in id2label.items()}
 # attach the zero-based class index to every training row
 df["training_id"] = df["entity_id"].map(label2id)
 ###############################################################
 # regeneration code
 # %%
 # we want to sample n samples from each class
 # sample_size refers to the number of samples per class
 def sample_from_df(df, sample_size_per_class=5):
    """Draw up to ``sample_size_per_class`` random rows from each class.

    Rows are grouped by ``training_id``; only the ``training_id`` and
    ``mention`` columns are kept. The result has a fresh 0..n-1 index.
    """
    def _draw(group):
        # never request more rows than the class actually has
        take = min(sample_size_per_class, len(group))
        return group.sample(n=take)

    # name the columns explicitly so the grouping key stays in the output
    per_class = df.groupby( "training_id")[['training_id', 'mention']]
    sampled_df = per_class.apply(_draw)
    return sampled_df.reset_index(drop=True)
 # %%
 # augment whole dataset
 # for now, we just return the same df
 def augment_data(df):
    """Augmentation hook for a sampled dataframe.

    Currently a pass-through: the very same object is handed back unchanged.
    """
    return df
 # %%
 class DynamicDataset(Dataset):
    """Dataset whose examples can be re-drawn on demand.

    ``regenerate_data`` re-samples the source dataframe, tokenizes the result
    and replaces ``current_data``; ``__len__`` and ``__getitem__`` always
    serve whatever ``current_data`` holds at that moment.
    """

    def __init__(self, df, sample_size_per_class, tokenizer):
        """
        Args:
            df (pd.DataFrame): Source frame; ``mention`` and ``training_id``
                are the columns read when regenerating.
            sample_size_per_class (int): Number of samples to draw per class
                on each regeneration.
            tokenizer: Callable applied to the list of mention strings; its
                result is indexed by "input_ids" and "attention_mask".
        """
        self.df = df
        self.sample_size_per_class = sample_size_per_class
        self.tokenizer = tokenizer
        self.current_data = None
        # build the first sample so the dataset is usable immediately
        self.regenerate_data()

    def regenerate_data(self):
        """
        Replace ``current_data`` with a freshly sampled and tokenized set.

        Each call re-samples the dataframe and passes the sample through
        ``augment_data``, so calling this at the start of every epoch gives
        each epoch its own draw.
        """
        # sample `sample_size_per_class` rows per class, then augment
        fresh_df = augment_data(sample_from_df(self.df, self.sample_size_per_class))
        # tokenize all mentions in one call; no tensors are requested here
        # because the sequences have different lengths
        encoded = self.tokenizer(
            fresh_df["mention"].to_list(),
            truncation=True,
        )
        class_ids = fresh_df["training_id"].to_list()
        rows = zip(encoded["input_ids"], encoded["attention_mask"], class_ids)
        # one dict of tensors per example, label included
        self.current_data = [
            {
                "input_ids": torch.tensor(token_ids),
                "attention_mask": torch.tensor(mask),
                "labels": torch.tensor(class_id),
            }
            for token_ids, mask, class_id in rows
        ]

    def __len__(self):
        return len(self.current_data)

    def __getitem__(self, idx):
        return self.current_data[idx]
 # %%
 class RegenerateDatasetCallback(TrainerCallback):
    """Trainer callback that re-draws the training dataset at each epoch start."""

    def __init__(self, dataset):
        # dataset: object exposing ``regenerate_data()``
        self.dataset = dataset

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Announce the upcoming epoch, then refresh the dataset in place."""
        print(f"Epoch {state.epoch + 1}: Regenerating dataset")
        target = self.dataset
        target.regenerate_data()
 # %%
 def custom_collate_fn(batch):
    """Collate a list of example dicts into one batch of tensors.

    ``input_ids`` and ``attention_mask`` are right-padded to the longest
    sequence in the batch; ``labels`` are stacked into a single tensor.
    """
    def _padded(field):
        # pad_sequence fills with its default padding value (0)
        sequences = [example[field] for example in batch]
        return torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)

    stacked_labels = torch.stack([example["labels"] for example in batch])
    return {
        "input_ids": _padded("input_ids"),
        "attention_mask": _padded("attention_mask"),
        "labels": stacked_labels,
    }
 ##########################################################################
 # training code
 # %%
 def train():
    """Fine-tune DistilBERT on a training set that is re-sampled every epoch.

    Wraps the module-level ``df`` (minus ``entity_name``) in a
    ``DynamicDataset`` and registers a ``RegenerateDatasetCallback`` so the
    dataset is regenerated at each epoch start. Runs a Hugging Face
    ``Trainer`` for 120 epochs with evaluation disabled; checkpoints go to
    ``checkpoint`` and logs to ``tensorboard-log``.
    """
    # directory handed to TrainingArguments as output_dir
    save_path = f'checkpoint'
    # prepare tokenizer
    model_checkpoint = "distilbert/distilbert-base-uncased"
    # alternative backbones that were tried:
    # model_checkpoint = 'google-bert/bert-base-cased'
    # model_checkpoint = 'prajjwal1/bert-small'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, clean_up_tokenization_spaces=True)
    # make the dataset; entity_name is dropped before sampling
    # NOTE(review): sample_size_per_class is hard-coded to 10 here while the
    # module-level SAMPLES constant (20) is not used - confirm which is intended
    lean_df = df.drop(columns=['entity_name'])
    dynamic_dataset = DynamicDataset(df = lean_df, sample_size_per_class=10, tokenizer=tokenizer)
    # create the regeneration callback bound to that same dataset object
    regeneration_callback = RegenerateDatasetCallback(dynamic_dataset)
    # compute metrics: accuracy of the arg-max prediction
    # NOTE(review): eval_strategy is "no" and no eval_dataset is passed below,
    # so this is presumably never invoked during this run - confirm
    metric = evaluate.load("accuracy")
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)
    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # size the embedding matrix to the tokenizer's current length
    model.resize_token_embeddings(len(tokenizer))
    # model = torch.compile(model, backend="inductor", dynamic=True)
    # %%
    # Trainer
    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=5e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=120,
        warmup_steps=400,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    trainer = Trainer(
        model,
        training_args,
        train_dataset=dynamic_dataset,
        tokenizer=tokenizer,
        # examples are already tensors, so batches are padded by the custom collator
        data_collator=custom_collate_fn,
        compute_metrics=compute_metrics,
        callbacks=[regeneration_callback]
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)
    trainer.train()
 # execute training
 # NOTE: there is no __main__ guard, so this runs whenever the module is
 # executed or imported
 train()
 # %%
--- a/esAppMod_train/simple/prediction/.gitignore
+++ b/esAppMod_train/simple/prediction/.gitignore
@ -0,0 +1 @@
 exports
--- a/esAppMod_train/simple/prediction/output.txt
+++ b/esAppMod_train/simple/prediction/output.txt
@ -0,0 +1 @@
--- a/esAppMod_train/simple/prediction/predict.py
+++ b/esAppMod_train/simple/prediction/predict.py
@ -0,0 +1,264 @@
 # %%
 # from datasets import load_from_disk
 import os
 import glob
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 256
 # %%
 # construct the target id list from the training file, so the class indices
 # are derived from the entity ids that actually occur in train.csv
 data_path = '../../../esAppMod_data_import/train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 # unique entity ids in sorted order; position in this list is the class index
 target_id_list = sorted(set(entity_ids))
 # %%
 # class index -> entity id, and the inverse lookup
 id2label = dict(enumerate(target_id_list))
 label2id = {entity_id: class_idx for class_idx, entity_id in id2label.items()}
 # introduce pre-processing functions
 def preprocess_text(text):
    """Normalize a mention: lowercase it and collapse whitespace.

    Every run of whitespace becomes a single space and leading/trailing
    whitespace is removed.
    """
    lowered = text.lower()
    # standardize spacing
    single_spaced = re.sub(r'\s+', ' ', lowered)
    return single_spaced.strip()
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df):
    """Turn a dataframe of mentions into a list of classifier examples.

    Each row yields ``{'text': ..., 'label': ...}``: the text is the row's
    ``mention`` passed through ``preprocess_text`` and the label is the
    module-level ``label2id`` lookup of the row's ``entity_id``.
    """
    return [
        {
            'text': preprocess_text(record['mention']),
            'label': label2id[record['entity_id']],  # class indices start at 0
        }
        for _, record in df.iterrows()
    ]
 def create_dataset():
    """Load the esAppMod test CSV and return it as a ``Dataset`` of examples.

    The CSV is read with ``skipinitialspace=True`` and converted to example
    dicts by ``process_df_to_dict``.
    """
    frame = pd.read_csv('../../../esAppMod_data_import/test.csv', skipinitialspace=True)
    return Dataset.from_list(process_df_to_dict(frame))
 # %%
 def test():
    """Evaluate the saved checkpoint on the test set and export the predictions.

    Loads the first ``checkpoint-*`` directory found under ``../checkpoint``,
    tokenizes the test set to a fixed length of 128 tokens, runs batched
    inference, appends accuracy and weighted F1/precision/recall to
    ``output.txt`` and writes the predicted entity ids to
    ``exports/result.csv``.

    Raises:
        FileNotFoundError: if no ``checkpoint-*`` directory exists.
    """
    test_dataset = create_dataset()

    # locate the trained checkpoint
    # path is usually ../checkpoint/checkpoint-<step number>; training is
    # expected to leave a single checkpoint behind, so the first match is used
    checkpoint_directory = '../checkpoint'
    candidates = glob.glob(os.path.join(checkpoint_directory, 'checkpoint-*'))
    if not candidates:
        # fail with a clear message instead of an IndexError on an empty glob
        raise FileNotFoundError(
            f"no 'checkpoint-*' directory found under {checkpoint_directory}")
    model_checkpoint = candidates[0]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)

    # %%
    # report the longest untruncated test sample (informational only)
    longest = max(
        (
            len(tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"])
            for sample in test_dataset['text']
        ),
        default=0,
    )
    print(longest)

    # %%
    # fixed input length used for inference
    max_length = 128

    # given a batch of dataset entries, run the text through the tokenizer
    def preprocess_function(example):
        # truncation must accompany padding='max_length': without it a sample
        # longer than max_length keeps its full length and the fixed-size
        # batches built by the DataLoader can no longer be stacked
        return tokenizer(
            example['text'],
            max_length=max_length,
            truncation=True,
            padding='max_length',
        )

    # tokenize in batches with 8 worker processes and drop the raw text column
    tokenized_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id)
    # keep the embedding matrix in sync with the tokenizer vocabulary
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []
    dataloader = DataLoader(tokenized_dataset, batch_size=BATCH_SIZE, shuffle=False)
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # store plain ints so y_true and y_pred share one element type
            actual_labels.extend(batch['label'].tolist())
            # move the inputs to the model's device and run inference
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device)).logits
            pred_labels.extend(logits.argmax(dim=1).tolist())

    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    y_true = actual_labels
    y_pred = pred_labels

    # Compute metrics; per-class scores are weighted by class support and
    # classes without predictions score 0 instead of raising a warning
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output.txt", "a") as f:
        print('*' * 80, file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)

    # export result: map class indices back to the original entity ids
    label_list = [id2label[class_idx] for class_idx in pred_labels]
    result_df = pd.DataFrame({
        'class_prediction': pd.Series(label_list)
    })
    # "exports" is git-ignored, so it may not exist on a fresh checkout
    os.makedirs("exports", exist_ok=True)
    result_df.to_csv("exports/result.csv", index=False)
 # %%
 # reset file before writing to it
 # The truncating handle must be closed before test() runs: test() appends to
 # the same file, and a still-open "w" handle would flush its buffered newline
 # at offset 0 on close, overwriting the first byte test() wrote.
 with open("output.txt", "w") as f:
    print('', file=f)
 test()
--- a/esAppMod_train/simple/train.py
+++ b/esAppMod_train/simple/train.py
@ -0,0 +1,232 @@
 # %%
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 torch.set_float32_matmul_precision('high')
 # %%
 def set_seed(seed):
    """
    Seed every random number generator used by this script.

    Applies `seed` to Python's `random`, NumPy and PyTorch (CPU and CUDA),
    then switches cuDNN to deterministic mode with benchmarking turned off.
    """
    seeders = (
        random.seed,                 # Python random module
        np.random.seed,              # NumPy global RNG
        torch.manual_seed,           # PyTorch CPU
        torch.cuda.manual_seed,      # PyTorch current GPU
        torch.cuda.manual_seed_all,  # all GPUs, for multi-GPU runs
    )
    for seed_rng in seeders:
        seed_rng(seed)
    # trade cuDNN autotuning for repeatable kernel selection
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
 # seed all RNGs up front so the run is repeatable
 set_seed(42)
 SHUFFLES = 5
 # %%
 # load the training file; its 'entity_id' column supplies the class labels
 data_path = '../../esAppMod_data_import/train.csv'
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = train_df['entity_id'].to_list()
 # unique entity ids in sorted order: the position in this list is the class index
 target_id_list = sorted(set(entity_ids))
 # %%
 # two-way lookup between class index (0..N-1) and the original entity id
 id2label = dict(enumerate(target_id_list))
 label2id = {entity_id: class_idx for class_idx, entity_id in id2label.items()}
 # %%
 # introduce pre-processing functions
 def preprocess_text(text):
    """
    Normalize a mention string: lowercase it and standardize spacing.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space
    and leading/trailing whitespace is removed.
    """
    lowered = text.lower()
    single_spaced = re.sub(r'\s+', ' ', lowered)
    return single_spaced.strip()
 # turns a dataframe into a list of training records (one dict per row)
 # input column: 'mention' (free text), cleaned with preprocess_text
 # output column: 'entity_id', translated to its class index via label2id
 def process_df_to_dict(df):
    """
    Build one {'text', 'labels'} record per dataframe row.

    Args:
        df: dataframe with 'entity_id' and 'mention' columns.

    Returns:
        list of dicts; 'text' is the preprocessed mention and 'labels' is
        label2id[entity_id], so labels start from 0.
    """
    return [
        {
            'text': preprocess_text(mention),
            'labels': label2id[entity_id],  # class index, not the raw entity id
        }
        for entity_id, mention in zip(df['entity_id'], df['mention'])
    ]
 def create_dataset():
    """
    Read the training CSV and wrap it in a DatasetDict.

    Returns:
        DatasetDict with a single 'train' split built from the records
        produced by process_df_to_dict.
    """
    csv_path = '../../esAppMod_data_import/train.csv'
    frame = pd.read_csv(csv_path, skipinitialspace=True)
    train_records = process_df_to_dict(frame)
    return DatasetDict({'train': Dataset.from_list(train_records)})
 # %%
 def train():
    """
    Fine-tune a DistilBERT sequence classifier on the esAppMod training split.

    Builds the dataset with create_dataset(), tokenizes its 'text' column and
    runs a Hugging Face Trainer for 40 epochs with evaluation switched off.
    Uses 'checkpoint' as the output directory and 'tensorboard-log' for logs.
    Reads the module-level target_id_list, id2label and label2id.
    Returns nothing.
    """
    output_directory = 'checkpoint'
    raw_datasets = create_dataset()
    # prepare tokenizer
    # other checkpoints tried: 'google-bert/bert-base-cased', 'prajjwal1/bert-small'
    model_checkpoint = "distilbert/distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        return_tensors="pt",
        clean_up_tokenization_spaces=True,
    )
    def preprocess_function(example):
        # called through map(batched=True) below; the labels column is left
        # untouched, only the text is tokenized (no padding at this stage)
        return tokenizer(example['text'], truncation=True)
    # tokenize the split and drop the raw text column afterwards
    tokenized_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    # pads every batch to its longest sequence at collation time
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # accuracy over argmax predictions
    # NOTE(review): eval_strategy is "no" and no eval_dataset is passed, so
    # this metric is presumably not computed during training - confirm
    accuracy_metric = evaluate.load("accuracy")
    def compute_metrics(eval_preds):
        logits, references = eval_preds
        return accuracy_metric.compute(
            predictions=np.argmax(logits, axis=1),
            references=references,
        )
    # classification head sized to the number of distinct entity ids
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(target_id_list),
        id2label=id2label,
        label2id=label2id,
    )
    # keep the embedding matrix in step with the tokenizer size
    # (only matters if tokens were added to the vocabulary)
    model.resize_token_embeddings(len(tokenizer))
    # Trainer configuration
    training_args = TrainingArguments(
        # output and logging
        output_dir=output_directory,
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        save_total_limit=1,
        push_to_hub=False,
        # evaluation is disabled; switch eval_strategy to "epoch" to enable it
        eval_strategy="no",
        load_best_model_at_end=False,
        # optimisation
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_steps=400,
        num_train_epochs=40,
        # batching and precision
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        bf16=True,
        # distributed / data handling
        ddp_find_unused_parameters=False,
        remove_unused_columns=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # to resume instead of starting fresh, pass resume_from_checkpoint=<path>
    # to trainer.train(), e.g. 'default_40_1/checkpoint-5600'
    trainer.train()
 # execute training
 # runs as soon as this cell/script is executed (there is no __main__ guard)
 train()
 # %%
--- a/reference_code/dynamic_dataset_generation.py
+++ b/reference_code/dynamic_dataset_generation.py
@ -0,0 +1,188 @@
 # Why this file exists:
 # the existing Hugging Face library does not offer the flexibility to change
 # the training data between epochs.
 # This code example illustrates dataset regeneration as a way to change
 # the training data between epochs.
 # %%
 from torch.utils.data import Dataset, DataLoader
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import re
 import random
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 from functools import partial
 # import matplotlib.pyplot as plt
 torch.set_float32_matmul_precision('high')
 def set_seed(seed):
    """
    Seed every random number generator used by this script.

    Applies `seed` to Python's `random`, NumPy and PyTorch (CPU and CUDA),
    then switches cuDNN to deterministic mode with benchmarking turned off.
    """
    seeders = (
        random.seed,                 # Python random module
        np.random.seed,              # NumPy global RNG
        torch.manual_seed,           # PyTorch CPU
        torch.cuda.manual_seed,      # PyTorch current GPU
        torch.cuda.manual_seed_all,  # all GPUs, for multi-GPU runs
    )
    for seed_rng in seeders:
        seed_rng(seed)
    # trade cuDNN autotuning for repeatable kernel selection
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
 # seed all RNGs up front so the sampling below is repeatable
 set_seed(42)
 # %%
 # PARAMETERS
 SAMPLES = 5
 # %%
 # load the training file; its 'entity_id' column supplies the class labels
 data_path = '../../esAppMod_data_import/train.csv'
 df = pd.read_csv(data_path, skipinitialspace=True)
 entity_ids = df['entity_id'].to_list()
 # unique entity ids in sorted order: the position in this list is the class index
 target_id_list = sorted(set(entity_ids))
 # two-way lookup between class index (0..N-1) and the original entity id
 id2label = dict(enumerate(target_id_list))
 label2id = {entity_id: class_idx for class_idx, entity_id in id2label.items()}
 # %%
 # draw up to n random rows from every class (one class per distinct entity_id)
 # sample_size_per_class is the per-class cap, not the total sample count
 def sample_from_df(df, sample_size_per_class=5):
    """
    Randomly sample rows per 'entity_id' group.

    Args:
        df: dataframe with 'entity_id' and 'mention' columns.
        sample_size_per_class: maximum rows to draw from each class; a class
            with fewer rows contributes all of them.

    Returns:
        dataframe with 'entity_id' and 'mention' columns and a fresh
        0..N-1 index.
    """
    def _draw(group):
        # never ask for more rows than the class actually has
        return group.sample(n=min(sample_size_per_class, len(group)))
    # select the columns explicitly so the grouping column stays in the result
    per_class = df.groupby("entity_id")[['entity_id', 'mention']]
    return per_class.apply(_draw).reset_index(drop=True)
 # %%
 # augmentation hook applied to the whole sampled dataset
 def augment_data(df):
    """
    Placeholder for data augmentation.

    Currently a pass-through: the very same dataframe object is returned
    unchanged.
    """
    return df
 # %%
 class DynamicDataset(Dataset):
    """
    Map-style dataset whose contents can be re-drawn on demand.

    Each call to regenerate_data() samples the source dataframe again,
    runs the augmentation hook and tokenizes the result, replacing the
    items served by __getitem__.
    """
    def __init__(self, df, sample_size_per_class, tokenizer):
        """
        Args:
            df (pd.DataFrame): Original DataFrame with class (id) and data columns;
                'entity_id' and 'mention' are the columns read here.
            sample_size_per_class (int): Number of samples to draw per class for each epoch.
            tokenizer: callable invoked as tokenizer(list_of_str, truncation=True);
                its result is indexed with "input_ids" and "attention_mask".
        """
        self.df = df
        self.sample_size_per_class = sample_size_per_class
        self.tokenizer = tokenizer
        self.current_data = None
        self.regenerate_data()  # Generate the initial dataset
    def regenerate_data(self):
        """
        Generate a new sampled dataset for the current epoch.

        Intended as a callback to run at the start of each epoch. Every call
        replaces self.current_data, which lets us:
        - re-sample the dataframe for a new set of n_samples
        - generate fresh augmentations
        """
        # Sample `sample_size_per_class` rows per class
        sampled_df = sample_from_df(self.df, self.sample_size_per_class)
        # perform future augmentations here
        sampled_df = augment_data(sampled_df)
        # Batch tokenize the entire column of data in one call
        tokenized_batch = self.tokenizer(
            sampled_df["mention"].to_list(),  # Pass all text data at once
            truncation=True,
            # return_tensors="pt" stays off: it requires equal-length sequences
        )
        # Pull the labels out once instead of building a row Series per item
        # NOTE(review): labels are the raw entity_id values, not mapped
        # through label2id - confirm they are already 0-based class indices
        entity_id_column = sampled_df["entity_id"].to_list()
        # Store per-example torch tensors so that the later 'pad_sequence'
        # and 'stack' operations in the collate function can work
        self.current_data = [
            {
                "input_ids": torch.tensor(token_ids),
                "attention_mask": torch.tensor(mask),
                "labels": torch.tensor(entity_id),  # Include the label
            }
            for token_ids, mask, entity_id in zip(
                tokenized_batch["input_ids"],
                tokenized_batch["attention_mask"],
                entity_id_column,
            )
        ]
    def __len__(self):
        """Number of examples in the currently generated sample."""
        return len(self.current_data)
    def __getitem__(self, idx):
        """Return the idx-th example dict of the currently generated sample."""
        return self.current_data[idx]
 # %%
 # Dynamic dataset
 # build the tokenizer and the re-sampleable dataset used by the cells below
 tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", clean_up_tokenization_spaces=False)
 # drop 'entity_name' before handing the dataframe to the dataset
 lean_df = df.drop(columns=['entity_name'])
 # NOTE(review): sample_size_per_class is hard-coded to 10 here while the
 # SAMPLES parameter is 5 - confirm which one is intended
 dynamic_dataset = DynamicDataset(df = lean_df, sample_size_per_class=10, tokenizer=tokenizer)
 # %%
 # custom tokenization
 # (placeholder cell: it contains no code)
 # %%
 # Example usage of dynamic dataset
 # indexing returns one dict with "input_ids", "attention_mask" and "labels"
 sample = dynamic_dataset[0]
 print(sample)
 # %%
 def custom_collate_fn(batch):
    """
    Collate a list of example dicts into one padded batch.

    Args:
        batch: list of dicts holding 1-D "input_ids" and "attention_mask"
            tensors plus a "labels" tensor, all labels of the same shape.

    Returns:
        dict with "input_ids" and "attention_mask" right-padded with zeros to
        the longest sequence in the batch, and "labels" stacked along a new
        leading dimension.
    """
    # gather each field across the batch first
    token_id_seqs = [example["input_ids"] for example in batch]
    mask_seqs = [example["attention_mask"] for example in batch]
    label_batch = torch.stack([example["labels"] for example in batch])
    def _pad(sequences):
        # dynamic padding: only as long as this batch's longest sequence
        return torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)
    return {
        "input_ids": _pad(token_id_seqs),
        "attention_mask": _pad(mask_seqs),
        "labels": label_batch,
    }
 # batch the dynamic dataset in groups of 32, padding each batch through
 # custom_collate_fn; shuffle is not passed, so DataLoader's default applies
 dataloader = DataLoader(
    dynamic_dataset,
    batch_size=32,
    collate_fn=custom_collate_fn
 )
 # %%