Implemented dynamic data re-sampling at each epoch
This commit is contained in:
parent
5312cfa06f
commit
b6cf2d4416
|
@@ -0,0 +1,58 @@
|
|||
# %%
|
||||
import pandas as pd
|
||||
|
||||
# %%
|
||||
#############################
|
||||
# How much data
|
||||
# data_path = '../biomedical_data_import/bc2gm_test.csv'
|
||||
# data_path = '../biomedical_data_import/bc2gm_test.csv'
|
||||
data_path = '../biomedical_data_import/bc5cdr-chemical_train.csv'
|
||||
df = pd.read_csv(data_path)
|
||||
len(df)
|
||||
|
||||
# %%
|
||||
|
||||
# %%
|
||||
# bc2gm:
|
||||
# train: 288939
|
||||
# test: 1034
|
||||
|
||||
# %%
|
||||
################################
|
||||
# check for NA values
|
||||
df[df['mention'].isna()]
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
##############################
|
||||
# how many labels?
|
||||
data_path = '../biomedical_data_import/bc2gm_test.csv'
|
||||
df = pd.read_csv(data_path)
|
||||
|
||||
id_list = sorted(list(set(df['entity_id'].to_list())))
|
||||
|
||||
# %%
|
||||
len(id_list)
|
||||
|
||||
# %%
|
||||
for id in id_list:
|
||||
if isinstance(id,int):
|
||||
continue
|
||||
else:
|
||||
print(id)
|
||||
# %%
|
||||
# bc2gm:
|
||||
# 61641 unique entity ids (surprisingly many)
|
||||
|
||||
# %%
|
||||
###############################
|
||||
# max length
|
||||
max_length = 0
|
||||
for mention in df['mention']:
|
||||
current_length = len(mention)
|
||||
if current_length > max_length:
|
||||
max_length = current_length
|
||||
print(max_length)
|
||||
|
||||
# %%
|
|
@@ -0,0 +1,17 @@
|
|||
# %%
|
||||
from transformers import AutoTokenizer
|
||||
import pandas as pd
|
||||
|
||||
|
||||
data_path = '../biomedical_data_import/bc2gm_train.csv'
|
||||
df = pd.read_csv(data_path)
|
||||
|
||||
# Load the tokenizer (e.g., BERT tokenizer)
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
# %%
|
||||
# Calculate token lengths
|
||||
df['token_length'] = df['mention'].apply(lambda x: len(tokenizer.tokenize(x)))
|
||||
|
||||
# Display the dataset with token lengths
|
||||
print(df)
|
|
@@ -0,0 +1 @@
|
|||
*.csv
|
|
@@ -0,0 +1,36 @@
|
|||
# %%
|
||||
from collections import defaultdict
|
||||
|
||||
# %%
|
||||
data_name = 'bc2gm' # other options: bc5cdr-chemical, bc5cdr-disease, ncbi
|
||||
train_path = 'test_dictionary.txt'
|
||||
test_path = 'processed_test_refined'
|
||||
|
||||
# %%
|
||||
vocab = defaultdict(set)
|
||||
with open(f'../biomedical/{data_name}/{train_path}') as f:
|
||||
for line in f:
|
||||
term_list = line.strip().split('||')
|
||||
vocab[term_list[0]].add(term_list[1].lower())
|
||||
|
||||
cui_to_id, id_to_cui = {}, {}
|
||||
vocab_entity_id_mentions = {}
|
||||
for id, cui in enumerate(vocab):
|
||||
cui_to_id[cui] = id
|
||||
id_to_cui[id] = cui
|
||||
for cui, mention in vocab.items():
|
||||
vocab_entity_id_mentions[cui_to_id[cui]] = mention
|
||||
|
||||
vocab_mentions, vocab_ids = [], []
|
||||
for id, mentions in vocab_entity_id_mentions.items():
|
||||
vocab_mentions.extend(mentions)
|
||||
vocab_ids.extend([id]*len(mentions))
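# %%
# Minimal illustration (toy data, not the real dictionary) of what the
# flattening above produces: every mention is paired with the integer id of
# its cui, so the mention list and the id list stay aligned element by element.
_toy_vocab = {'D001': {'aspirin'}, 'D002': {'egfr', 'egf receptor'}}
_toy_cui_to_id = {cui: i for i, cui in enumerate(_toy_vocab)}
_toy_mentions, _toy_ids = [], []
for _cui, _mentions in _toy_vocab.items():
    _toy_mentions.extend(_mentions)
    _toy_ids.extend([_toy_cui_to_id[_cui]] * len(_mentions))
# e.g. _toy_mentions -> ['aspirin', 'egfr', 'egf receptor'] (set order may vary)
#      _toy_ids      -> [0, 1, 1]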
|
||||
|
||||
# %%
|
||||
test_mentions, test_cuis = [], []
|
||||
with open(f'../biomedical/{data_name}/{test_path}/0.concept') as f:
|
||||
for line in f:
|
||||
term_list = line.strip().split('||')
|
||||
test_cuis.append(term_list[-1])
|
||||
test_mentions.append(term_list[-2].lower())
|
||||
|
|
@@ -0,0 +1,134 @@
|
|||
# %%
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
import multiprocessing
|
||||
|
||||
# %%
|
||||
#########################
|
||||
# we first process training data
|
||||
def process_train_to_csv(data_path, output):
|
||||
# data_path = '../esAppMod_data_import/parent_train.csv'
|
||||
input_df = pd.read_csv(data_path, sep=r'\|\|', engine='python', skipinitialspace=True, header=None)
|
||||
input_df = input_df.rename(columns={0: 'entity_id', 1: 'mention',})
|
||||
|
||||
# clean up rows: drop NaN mentions and split multi-id entity_id values
|
||||
df = input_df.copy()
|
||||
new_rows = []
|
||||
for idx,row in df.iterrows():
|
||||
index = row['entity_id']
|
||||
mention = row['mention']
|
||||
|
||||
# omit nan values
|
||||
if row['mention'] == 'NaN' or pd.isna(row['mention']):
|
||||
df = df.drop(index=[idx])
|
||||
continue
|
||||
|
||||
# handle possible multiple indices in index field
|
||||
if '|' in row['entity_id']:
|
||||
# print(row[0])
|
||||
df = df.drop(index=[idx])
|
||||
index_list = index.split('|')
|
||||
|
||||
for new_index in index_list:
|
||||
element = {
|
||||
'entity_id': new_index,
|
||||
'mention': mention,
|
||||
}
|
||||
new_rows.append(element)
|
||||
|
||||
df_new = pd.DataFrame(new_rows, columns=df.columns)
|
||||
df = pd.concat([df, df_new], ignore_index=True)
|
||||
df = df.reset_index(drop=True)
|
||||
|
||||
df.to_csv(output, index=False)
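# %%
# Illustration only (hypothetical file contents): the dictionary files are
# assumed to hold '||'-separated lines such as
#   D001||aspirin
#   D001|D002||acetylsalicylic acid
# process_train_to_csv keeps the single-id row as-is, drops the multi-id row,
# and appends one new row per id, so the output CSV would contain
#   entity_id,mention
#   D001,aspirin
#   D001,acetylsalicylic acid
#   D002,acetylsalicylic acid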
|
||||
|
||||
|
||||
# %%
|
||||
name_list =[
|
||||
('../biomedical/bc2gm/test_dictionary.txt', 'bc2gm_train.csv'),
|
||||
('../biomedical/bc5cdr-chemical/test_dictionary.txt', 'bc5cdr-chemical_train.csv'),
|
||||
('../biomedical/bc5cdr-disease/test_dictionary.txt', 'bc5cdr-disease_train.csv'),
|
||||
('../biomedical/ncbi/test_dictionary.txt', 'ncbi_train.csv'),
|
||||
]
|
||||
|
||||
# for data_path, output in name_list:
|
||||
# process_train_to_csv(data_path, output)
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Create a pool of workers
|
||||
num_workers = 4 # set number of cpus to use
|
||||
with multiprocessing.Pool(num_workers) as pool:
|
||||
# starmap
|
||||
# an iterable of [(1,2), (3, 4)] results in [func(1,2), func(3,4)].
|
||||
pool.starmap(process_train_to_csv, name_list)
|
||||
|
||||
|
||||
# %%
|
||||
#################################################
|
||||
# process test data
|
||||
|
||||
def is_int_string(s):
|
||||
try:
|
||||
int(s)
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
def process_test_to_csv(data_path, output):
|
||||
# data_path = '../esAppMod_data_import/parent_train.csv'
|
||||
input_df = pd.read_csv(data_path, sep=r'\|\|', engine='python', skipinitialspace=True, header=None)
|
||||
input_df = input_df.drop(columns=[0, 1, 2])
|
||||
input_df = input_df.rename(columns={3: 'mention', 4: 'entity_id'})
|
||||
|
||||
# split rows whose entity_id contains multiple '|'-separated ids
|
||||
df = input_df.copy()
|
||||
new_rows = []
|
||||
for idx,row in df.iterrows():
|
||||
|
||||
# handle possible multiple indices
|
||||
if '|' in row['entity_id']:
|
||||
index = row['entity_id']
|
||||
mention = row['mention']
|
||||
df = df.drop(index=[idx])
|
||||
index_list = index.split('|')
|
||||
|
||||
for new_index in index_list:
|
||||
element = {
|
||||
'entity_id': new_index,
|
||||
'mention': mention,
|
||||
}
|
||||
new_rows.append(element)
|
||||
|
||||
df_new = pd.DataFrame(new_rows, columns=df.columns)
|
||||
df = pd.concat([df, df_new], ignore_index=True)
|
||||
df = df.reset_index(drop=True)
|
||||
|
||||
# do some cleanup
|
||||
df['entity_id'].isna()
|
||||
|
||||
df.to_csv(output, index=False)
|
||||
|
||||
|
||||
# %%
|
||||
name_list =[
|
||||
('../biomedical/bc2gm/processed_test_refined/0.concept', 'bc2gm_test.csv'),
|
||||
('../biomedical/bc5cdr-chemical/processed_test_refined/0.concept', 'bc5cdr-chemical_test.csv'),
|
||||
('../biomedical/bc5cdr-disease/processed_test_refined/0.concept', 'bc5cdr-disease_test.csv'),
|
||||
('../biomedical/ncbi/processed_test_refined/0.concept', 'ncbi_test.csv'),
|
||||
]
|
||||
|
||||
# for data_path, output in name_list:
|
||||
# process_test_to_csv(data_path, output)
|
||||
if __name__ == "__main__":
|
||||
# Create a pool of workers
|
||||
num_workers = 4 # set number of cpus to use
|
||||
with multiprocessing.Pool(num_workers) as pool:
|
||||
# starmap
|
||||
# an iterable of [(1,2), (3, 4)] results in [func(1,2), func(3,4)].
|
||||
pool.starmap(process_test_to_csv, name_list)
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
# %%
|
|
@@ -0,0 +1,388 @@
|
|||
# %%
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments,
|
||||
TrainerCallback
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import math
|
||||
from functools import partial
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore", message='Was asked to gather along dimension 0')
|
||||
warnings.filterwarnings("ignore", message='FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated.')
|
||||
|
||||
# import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
# %%
|
||||
# PARAMETERS
|
||||
SAMPLES=20
|
||||
SHUFFLES=5
|
||||
AMPLIFY_FACTOR=5
|
||||
|
||||
# %%
|
||||
###################################################
|
||||
# import code
|
||||
# import training file
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# use the actual entity ids from the data rather than a predefined pattern
|
||||
entity_ids = df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
df["training_id"] = df["entity_id"].map(label2id)
|
||||
|
||||
# %%
|
||||
##############################################################
|
||||
# augmentation code
|
||||
|
||||
# basic preprocessing
|
||||
def preprocess_text(text):
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
|
||||
|
||||
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
all_processed = []
|
||||
# add the original text
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
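# %%
# Hypothetical illustration (not from the training data): shuffle_text keeps
# the original string and appends n_shuffles word-order permutations, e.g.
# shuffle_text("epidermal growth factor receptor", n_shuffles=2) might return
# ["epidermal growth factor receptor",
#  "factor receptor epidermal growth",
#  "growth epidermal receptor factor"]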
|
||||
|
||||
def corrupt_word(word):
|
||||
"""Corrupt a single word using random corruption techniques."""
|
||||
if len(word) <= 1: # Skip corruption for single-character words
|
||||
return word
|
||||
|
||||
corruption_type = random.choice(["delete", "swap"])
|
||||
|
||||
if corruption_type == "delete":
|
||||
# Randomly delete a character
|
||||
idx = random.randint(0, len(word) - 1)
|
||||
word = word[:idx] + word[idx + 1:]
|
||||
|
||||
elif corruption_type == "swap":
|
||||
# Swap two adjacent characters
|
||||
if len(word) > 1:
|
||||
idx = random.randint(0, len(word) - 2)
|
||||
word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
|
||||
|
||||
|
||||
return word
|
||||
|
||||
def corrupt_string(sentence, corruption_probability=0.01):
|
||||
"""Corrupt each word in the string with a given probability."""
|
||||
words = sentence.split()
|
||||
corrupted_words = [
|
||||
corrupt_word(word) if random.random() < corruption_probability else word
|
||||
for word in words
|
||||
]
|
||||
return " ".join(corrupted_words)
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
def create_example(index, mention):
|
||||
return {'training_id': index, 'mention': mention}
|
||||
|
||||
# augment whole dataset
|
||||
def augment_data(df):
|
||||
output_list = []
|
||||
|
||||
for idx,row in df.iterrows():
|
||||
index = row['training_id']
|
||||
parent_desc = row['mention']
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
|
||||
# add basic example
|
||||
output_list.append(create_example(index, parent_desc))
|
||||
|
||||
# add shuffled strings
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
output_list.append(create_example(index, desc))
|
||||
|
||||
# add corrupted strings
|
||||
desc = corrupt_string(parent_desc, corruption_probability=0.1)
|
||||
if (desc != parent_desc):
|
||||
output_list.append(create_example(index, desc))
|
||||
|
||||
# add example with stripped non-alphanumerics
|
||||
desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
if (desc != parent_desc):
|
||||
output_list.append(create_example(index, desc))
|
||||
|
||||
# short sequence amplifier
|
||||
# short sequences are rare, and we must compensate by including more examples
|
||||
# also, short sequences are barely affected by shuffling
|
||||
words = parent_desc.split()
|
||||
word_count = len(words)
|
||||
if word_count <= 2:
|
||||
for _ in range(AMPLIFY_FACTOR):
|
||||
output_list.append(create_example(index, parent_desc)) # amplify the original mention
|
||||
|
||||
new_df = pd.DataFrame(output_list)
|
||||
return new_df
|
||||
|
||||
|
||||
###############################################################
|
||||
# regeneration code
|
||||
# %%
|
||||
# we want to sample n samples from each class
|
||||
# sample_size refers to the number of samples per class
|
||||
def sample_from_df(df, sample_size_per_class=5):
|
||||
sampled_df = (df.groupby("training_id")[['training_id', 'mention']] # explicitly select the columns to keep
|
||||
.apply(lambda x: x.sample(n=min(sample_size_per_class, len(x))))
|
||||
.reset_index(drop=True))
|
||||
|
||||
return sampled_df
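# %%
# Toy demonstration (made-up rows) of the per-class sampling above: each
# training_id keeps at most sample_size_per_class rows on every call, so the
# model sees a different small subset of each class per epoch.
_toy_df = pd.DataFrame({
    'training_id': [0, 0, 0, 1, 1],
    'mention': ['egfr', 'egf receptor', 'erbb1', 'tnf', 'tumor necrosis factor'],
})
# sample_from_df(_toy_df, sample_size_per_class=2) -> 2 rows for class 0 and 2 for class 1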
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
class DynamicDataset(Dataset):
|
||||
def __init__(self, df, sample_size_per_class, tokenizer):
|
||||
"""
|
||||
Args:
|
||||
df (pd.DataFrame): Original DataFrame with class (id) and data columns.
|
||||
sample_size_per_class (int): Number of samples to draw per class for each epoch.
|
||||
"""
|
||||
self.df = df
|
||||
self.sample_size_per_class = sample_size_per_class
|
||||
self.tokenizer = tokenizer
|
||||
self.current_data = None
|
||||
self.regenerate_data() # Generate the initial dataset
|
||||
|
||||
def regenerate_data(self):
|
||||
"""
|
||||
Generate a new sampled dataset for the current epoch.
|
||||
|
||||
dynamic callback function to regenerate data each time we call this
|
||||
method; it updates current_data so that we can:
|
||||
|
||||
- re-sample the dataframe for a new set of n_samples
|
||||
- generate fresh augmentations
|
||||
|
||||
This allows us to re-sample and re-augment at the start of each epoch
|
||||
"""
|
||||
# Sample `sample_size_per_class` rows per class
|
||||
sampled_df = sample_from_df(self.df, self.sample_size_per_class)
|
||||
|
||||
# perform future edits here
|
||||
sampled_df = augment_data(sampled_df)
|
||||
|
||||
# perform tokenization here
|
||||
# Batch tokenize the entire column of data
|
||||
tokenized_batch = self.tokenizer(
|
||||
sampled_df["mention"].to_list(), # Pass all text data at once
|
||||
truncation=True,
|
||||
# return_tensors="pt" # disabled because pt requires equal length tensors
|
||||
)
|
||||
|
||||
# Store the tokenized data with labels
|
||||
self.current_data = [
|
||||
{
|
||||
"input_ids": torch.tensor(tokenized_batch["input_ids"][i]),
|
||||
"attention_mask": torch.tensor(tokenized_batch["attention_mask"][i]),
|
||||
"labels": torch.tensor(sampled_df.iloc[i]["training_id"]) # Include the label
|
||||
}
|
||||
for i in range(len(sampled_df))
|
||||
]
|
||||
|
||||
|
||||
def __len__(self):
|
||||
return len(self.current_data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
return self.current_data[idx]
|
||||
|
||||
# %%
|
||||
class RegenerateDatasetCallback(TrainerCallback):
|
||||
def __init__(self, dataset):
|
||||
self.dataset = dataset
|
||||
|
||||
def on_epoch_begin(self, args, state, control, **kwargs):
|
||||
print(f"Epoch {int(math.ceil(state.epoch + 1))}: Regenerating dataset")
|
||||
self.dataset.regenerate_data()
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
def custom_collate_fn(batch):
|
||||
# Dynamically pad tensors to the longest sequence in the batch
|
||||
input_ids = [item["input_ids"] for item in batch]
|
||||
attention_masks = [item["attention_mask"] for item in batch]
|
||||
labels = torch.stack([item["labels"] for item in batch])
|
||||
|
||||
# Pad inputs to the same length
|
||||
input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
|
||||
attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
|
||||
|
||||
return {
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_masks,
|
||||
"labels": labels
|
||||
}
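# %%
# Minimal sketch of how the pieces above fit together outside the Trainer:
# regenerate_data() re-samples and re-augments (this is what
# RegenerateDatasetCallback triggers at each epoch), and custom_collate_fn pads
# each batch dynamically. Guarded so it does not run during normal training;
# the guard and the underscore-prefixed names are illustrative only.
if False:  # flip to True to smoke-test the dynamic dataset on its own
    _tok = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    _ds = DynamicDataset(df=df[['training_id', 'mention']], sample_size_per_class=2, tokenizer=_tok)
    _loader = DataLoader(_ds, batch_size=8, shuffle=True, collate_fn=custom_collate_fn)
    for _ in range(2):
        _ds.regenerate_data()  # fresh sample + augmentation, as done per epoch
        _batch = next(iter(_loader))  # padded input_ids / attention_mask / labels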
|
||||
|
||||
|
||||
##########################################################################
|
||||
# training code
|
||||
# %%
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, clean_up_tokenization_spaces=True)
|
||||
|
||||
# make the dataset
|
||||
|
||||
|
||||
# Define the callback
|
||||
lean_df = df.drop(columns=['entity_name'])
|
||||
dynamic_dataset = DynamicDataset(df = lean_df, sample_size_per_class=10, tokenizer=tokenizer)
|
||||
|
||||
# create the regeneration callback
|
||||
regeneration_callback = RegenerateDatasetCallback(dynamic_dataset)
|
||||
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
save_strategy="steps",
|
||||
save_steps=500,
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=64,
|
||||
# per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=120,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=dynamic_dataset,
|
||||
tokenizer=tokenizer,
|
||||
data_collator=custom_collate_fn,
|
||||
compute_metrics=compute_metrics,
|
||||
callbacks=[regeneration_callback]
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@@ -1,6 +1,6 @@
|
|||
|
||||
*******************************************************************************
|
||||
Accuracy: 0.80197
|
||||
F1 Score: 0.81948
|
||||
Precision: 0.88067
|
||||
Recall: 0.80197
|
||||
Accuracy: 0.80655
|
||||
F1 Score: 0.82821
|
||||
Precision: 0.87847
|
||||
Recall: 0.80655
|
|
@@ -0,0 +1,236 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 32
|
||||
|
||||
# %%
|
||||
# construct the target id list
|
||||
data_path = '../../../biomedical_data_import/bc2gm_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
# target_id_list = [id for id in target_id_list]
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
row_id = row['entity_id']
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[row_id], # ensure labels start from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# test
|
||||
data_path = '../../../biomedical_data_import/bc2gm_test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'test': Dataset.from_list(process_df_to_dict(test_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# the 'labels' column already holds the class id,
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True,
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
|
||||
|
||||
# print datasets['test'] columns
|
||||
column_info = datasets['test'].features
|
||||
for column, dtype in column_info.items():
|
||||
print(f"Column: {column}, Type: {dtype}")
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
dataloader = DataLoader(
|
||||
datasets['test'],
|
||||
batch_size=BATCH_SIZE,
|
||||
shuffle=False,
|
||||
collate_fn=data_collator)
|
||||
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['labels'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# save the classification predictions here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
|
|
@@ -0,0 +1,367 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=0 # 0 means no shuffled variants are generated
|
||||
|
||||
# %%
|
||||
|
||||
# We want to map each entity_id to a consecutive set of ids
|
||||
# import training file
|
||||
data_path = '../../../biomedical_data_import/bc2gm_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# use the actual entity ids from the data rather than a predefined pattern
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#' (disabled)
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
"""
|
||||
Generate n strings with randomly shuffled words from the input text.
|
||||
|
||||
Args:
|
||||
text (str): The input text.
|
||||
n (int): The number of random variations to generate.
|
||||
|
||||
Returns:
|
||||
list: A list of strings with shuffled words.
|
||||
"""
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
|
||||
|
||||
|
||||
# generate n more shuffled examples
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
"""
|
||||
Preprocess a list of texts and add n random shuffles for each string.
|
||||
|
||||
Args:
|
||||
text (str): The input string.
|
||||
n_shuffles (int): Number of random shuffles to generate for each string.
|
||||
|
||||
Returns:
|
||||
list: A list of preprocessed and shuffled strings.
|
||||
"""
|
||||
all_processed = []
|
||||
# add the original text
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
|
||||
|
||||
|
||||
######################################
|
||||
|
||||
# augmentation by text corruption
|
||||
|
||||
def corrupt_word(word):
|
||||
"""Corrupt a single word using random corruption techniques."""
|
||||
if len(word) <= 1: # Skip corruption for single-character words
|
||||
return word
|
||||
|
||||
corruption_type = random.choice(["delete", "swap"])
|
||||
|
||||
if corruption_type == "delete":
|
||||
# Randomly delete a character
|
||||
idx = random.randint(0, len(word) - 1)
|
||||
word = word[:idx] + word[idx + 1:]
|
||||
|
||||
elif corruption_type == "swap":
|
||||
# Swap two adjacent characters
|
||||
if len(word) > 1:
|
||||
idx = random.randint(0, len(word) - 2)
|
||||
word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
|
||||
|
||||
|
||||
return word
|
||||
|
||||
def corrupt_string(sentence, corruption_probability=0.01):
|
||||
"""Corrupt each word in the string with a given probability."""
|
||||
words = sentence.split()
|
||||
corrupted_words = [
|
||||
corrupt_word(word) if random.random() < corruption_probability else word
|
||||
for word in words
|
||||
]
|
||||
return " ".join(corrupted_words)
|
||||
|
||||
|
||||
#############################################################
|
||||
# data processing code
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
parent_desc = row['mention']
|
||||
if isinstance(parent_desc, float):
|
||||
print(parent_desc)
|
||||
parent_desc = f'{parent_desc}'
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
|
||||
# unaugmented data
|
||||
element = {
|
||||
'text' : parent_desc,
|
||||
'label': label2id[index], # ensure labels start from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# # short sequences are rare, and we must compensate by including more examples
|
||||
# # mutation of other longer sequences might drown out rare short sequences
|
||||
# words = parent_desc.split()
|
||||
# word_count = len(words)
|
||||
# if word_count < 3:
|
||||
# for _ in range(10):
|
||||
# element = {
|
||||
# 'text': parent_desc,
|
||||
# 'label': label2id[index],
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
# add shuffled strings
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels start from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# # corrupt string
|
||||
# desc = corrupt_string(parent_desc, corruption_probability=0.1)
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'label': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
# # augmentation
|
||||
# # remove all non-alphanumerics
|
||||
# desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'label': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
|
||||
data_path = '../../../biomedical_data_import/bc2gm_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
#########################################
|
||||
# training function
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# the 'label' column already holds the class id,
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True, # enable truncation for efficiency
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text", # we only need the tokenization, not the original strings
|
||||
)
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-3,
|
||||
per_device_train_batch_size=512,
|
||||
# per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=40,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator, # data_collator performs dynamic padding
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@@ -0,0 +1,280 @@
|
|||
# %%
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments,
|
||||
TrainerCallback
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from functools import partial
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore", message='Was asked to gather along dimension 0')
|
||||
warnings.filterwarnings("ignore", message='FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated.')
|
||||
|
||||
# import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
# %%
|
||||
# PARAMETERS
|
||||
SAMPLES=20
|
||||
|
||||
# %%
|
||||
###################################################
|
||||
# import code
|
||||
# import training file
|
||||
data_path = '../../../biomedical_data_import/bc2gm_train.csv'
|
||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# use the actual entity ids from the data rather than a predefined pattern
|
||||
entity_ids = df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
df["training_id"] = df["entity_id"].map(label2id)
|
||||
|
||||
###############################################################
|
||||
# regeneration code
|
||||
# %%
|
||||
# we want to sample n samples from each class
|
||||
# sample_size refers to the number of samples per class
|
||||
def sample_from_df(df, sample_size_per_class=5):
|
||||
sampled_df = (df.groupby("training_id")[['training_id', 'mention']] # explicitly select the columns to keep
|
||||
.apply(lambda x: x.sample(n=min(sample_size_per_class, len(x))))
|
||||
.reset_index(drop=True))
|
||||
|
||||
return sampled_df
|
||||
|
||||
|
||||
# %%
|
||||
# augment whole dataset
|
||||
# for now, we just return the same df
|
||||
def augment_data(df):
|
||||
return df
|
||||
|
||||
# %%
|
||||
class DynamicDataset(Dataset):
|
||||
def __init__(self, df, sample_size_per_class, tokenizer):
|
||||
"""
|
||||
Args:
|
||||
df (pd.DataFrame): Original DataFrame with class (id) and data columns.
|
||||
sample_size_per_class (int): Number of samples to draw per class for each epoch.
|
||||
"""
|
||||
self.df = df
|
||||
self.sample_size_per_class = sample_size_per_class
|
||||
self.tokenizer = tokenizer
|
||||
self.current_data = None
|
||||
self.regenerate_data() # Generate the initial dataset
|
||||
|
||||
def regenerate_data(self):
|
||||
"""
|
||||
Generate a new sampled dataset for the current epoch.
|
||||
|
||||
dynamic callback function to regenerate data each time we call this
|
||||
method; it updates current_data so that we can:
|
||||
|
||||
- re-sample the dataframe for a new set of n_samples
|
||||
- generate fresh augmentations
|
||||
|
||||
This allows us to re-sample and re-augment at the start of each epoch
|
||||
"""
|
||||
# Sample `sample_size_per_class` rows per class
|
||||
sampled_df = sample_from_df(self.df, self.sample_size_per_class)
|
||||
|
||||
# perform future edits here
|
||||
sampled_df = augment_data(sampled_df)
|
||||
|
||||
# perform tokenization here
|
||||
# Batch tokenize the entire column of data
|
||||
tokenized_batch = self.tokenizer(
|
||||
sampled_df["mention"].to_list(), # Pass all text data at once
|
||||
truncation=True,
|
||||
# return_tensors="pt" # disabled because pt requires equal length tensors
|
||||
)
|
||||
|
||||
# Store the tokenized data with labels
|
||||
self.current_data = [
|
||||
{
|
||||
"input_ids": torch.tensor(tokenized_batch["input_ids"][i]),
|
||||
"attention_mask": torch.tensor(tokenized_batch["attention_mask"][i]),
|
||||
"labels": torch.tensor(sampled_df.iloc[i]["training_id"]) # Include the label
|
||||
}
|
||||
for i in range(len(sampled_df))
|
||||
]
|
||||
|
||||
|
||||
def __len__(self):
|
||||
return len(self.current_data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
return self.current_data[idx]
|
||||
|
||||
# %%
|
||||
class RegenerateDatasetCallback(TrainerCallback):
|
||||
def __init__(self, dataset, every_n_epochs=2):
|
||||
"""
|
||||
Args:
|
||||
dataset: The dataset instance that supports regeneration.
|
||||
every_n_epochs (int): Number of epochs to wait before regenerating the dataset.
|
||||
"""
|
||||
self.dataset = dataset
|
||||
self.every_n_epochs = every_n_epochs
|
||||
|
||||
def on_epoch_begin(self, args, state, control, **kwargs):
|
||||
# Check if the current epoch is a multiple of `every_n_epochs`
|
||||
if (state.epoch + 1) % self.every_n_epochs == 0:
|
||||
print(f"Epoch {int(state.epoch + 1)}: Regenerating dataset...")
|
||||
self.dataset.regenerate_data()
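# Behavioural note (assuming the Trainer reports a 0-based state.epoch at epoch
# start): with every_n_epochs=2 the check above is False before epoch 1
# (state.epoch == 0) and True before epoch 2 (state.epoch == 1), so the data is
# regenerated ahead of epochs 2, 4, 6, ...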
|
||||
|
||||
|
||||
# %%
|
||||
def custom_collate_fn(batch):
|
||||
# Dynamically pad tensors to the longest sequence in the batch
|
||||
input_ids = [item["input_ids"] for item in batch]
|
||||
attention_masks = [item["attention_mask"] for item in batch]
|
||||
labels = torch.stack([item["labels"] for item in batch])
|
||||
|
||||
# Pad inputs to the same length
|
||||
input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
|
||||
attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
|
||||
|
||||
return {
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_masks,
|
||||
"labels": labels
|
||||
}
|
||||
|
||||
|
||||
##########################################################################
|
||||
# training code
|
||||
# %%
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, clean_up_tokenization_spaces=True)
|
||||
|
||||
# make the dataset
|
||||
|
||||
|
||||
# Define the callback
|
||||
# lean_df = df.drop(columns=['entity_name'])
|
||||
dynamic_dataset = DynamicDataset(df = df, sample_size_per_class=SAMPLES, tokenizer=tokenizer)
|
||||
|
||||
# create the regeneration callback
|
||||
regeneration_callback = RegenerateDatasetCallback(dynamic_dataset, every_n_epochs=2)
|
||||
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-4,
|
||||
per_device_train_batch_size=256,
|
||||
# per_device_eval_batch_size=256,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=40,
|
||||
warmup_steps=200,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=dynamic_dataset,
|
||||
tokenizer=tokenizer,
|
||||
data_collator=custom_collate_fn,
|
||||
compute_metrics=compute_metrics,
|
||||
callbacks=[regeneration_callback]
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@@ -0,0 +1,6 @@
|
|||
|
||||
*******************************************************************************
|
||||
Accuracy: 0.15093
|
||||
F1 Score: 0.14063
|
||||
Precision: 0.15594
|
||||
Recall: 0.15093
|
|
@@ -0,0 +1,246 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 32
|
||||
|
||||
# %%
|
||||
# construct the target id list
|
||||
data_path = '../../../../biomedical_data_import/bc2gm_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
# target_id_list = [id for id in target_id_list]
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def is_int_string(s):
|
||||
try:
|
||||
int(s)
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
row_id = row['entity_id']
|
||||
if not is_int_string(row_id):
|
||||
continue
|
||||
row_id = int(row_id)
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[row_id], # ensure labels start from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# test
|
||||
data_path = '../../../../biomedical_data_import/bc2gm_test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'test': Dataset.from_list(process_df_to_dict(test_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# the 'labels' column already holds the class id,
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True,
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
|
||||
|
||||
# print datasets['test'] columns
|
||||
column_info = datasets['test'].features
|
||||
for column, dtype in column_info.items():
|
||||
print(f"Column: {column}, Type: {dtype}")
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
dataloader = DataLoader(
|
||||
datasets['test'],
|
||||
batch_size=BATCH_SIZE,
|
||||
shuffle=False,
|
||||
collate_fn=data_collator)
|
||||
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['labels'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# save the classification predictions here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
|
|
@@ -0,0 +1,368 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=0 # 0 shuffles means it does not re-sample
|
||||
|
||||
# %%
|
||||
|
||||
# We want to map the entity_id values to a consecutive set of ids
|
||||
# import training file
|
||||
data_path = '../../biomedical_data_import/bc2gm_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# collect the actual entity ids that occur in the training data
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
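# Added sanity check (a cheap one-off assertion, not part of the original
# pipeline): the two maps are exact inverses, so any model class index can be
# mapped back to its original entity_id.
assert all(label2id[id2label[i]] == i for i in range(len(target_id_list)))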
|
||||
|
||||
# %%
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
"""
|
||||
Generate n strings with randomly shuffled words from the input text.
|
||||
|
||||
Args:
|
||||
text (str): The input text.
|
||||
n (int): The number of random variations to generate.
|
||||
|
||||
Returns:
|
||||
list: A list of strings with shuffled words.
|
||||
"""
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
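# Usage sketch (illustrative only; the mention text and outputs below are
# hypothetical and vary with the random state):
# generate_random_shuffles("epidermal growth factor", 2)
# -> ['growth factor epidermal', 'factor epidermal growth']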
|
||||
|
||||
|
||||
# generate n more shuffled examples
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
"""
|
||||
Return the original string together with n randomly shuffled variants.
|
||||
|
||||
Args:
|
||||
text (str): The input string.
|
||||
n_shuffles (int): Number of random shuffles to generate for each string.
|
||||
|
||||
Returns:
|
||||
list: The original string followed by its shuffled variants.
|
||||
"""
|
||||
all_processed = []
|
||||
# add the original text
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
|
||||
|
||||
|
||||
######################################
|
||||
|
||||
# augmentation by text corruption
|
||||
|
||||
def corrupt_word(word):
|
||||
"""Corrupt a single word using random corruption techniques."""
|
||||
if len(word) <= 1: # Skip corruption for single-character words
|
||||
return word
|
||||
|
||||
corruption_type = random.choice(["delete", "swap"])
|
||||
|
||||
if corruption_type == "delete":
|
||||
# Randomly delete a character
|
||||
idx = random.randint(0, len(word) - 1)
|
||||
word = word[:idx] + word[idx + 1:]
|
||||
|
||||
elif corruption_type == "swap":
|
||||
# Swap two adjacent characters
|
||||
if len(word) > 1:
|
||||
idx = random.randint(0, len(word) - 2)
|
||||
word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
|
||||
|
||||
|
||||
return word
|
||||
|
||||
def corrupt_string(sentence, corruption_probability=0.01):
|
||||
"""Corrupt each word in the string with a given probability."""
|
||||
words = sentence.split()
|
||||
corrupted_words = [
|
||||
corrupt_word(word) if random.random() < corruption_probability else word
|
||||
for word in words
|
||||
]
|
||||
return " ".join(corrupted_words)
|
||||
|
||||
|
||||
#############################################################
|
||||
# Data Run code here
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
parent_desc = row['mention']
|
||||
if isinstance(parent_desc, float):
|
||||
print(parent_desc)
|
||||
parent_desc = f'{parent_desc}'
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
|
||||
# unaugmented data
|
||||
element = {
|
||||
'text' : parent_desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# # short sequences are rare, and we must compensate by including more examples
|
||||
# # mutation of other longer sequences might drown out rare short sequences
|
||||
# words = parent_desc.split()
|
||||
# word_count = len(words)
|
||||
# if word_count < 3:
|
||||
# for _ in range(10):
|
||||
# element = {
|
||||
# 'text': parent_desc,
|
||||
# 'label': label2id[index],
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
# add shuffled strings
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# # corrupt string
|
||||
# desc = corrupt_string(parent_desc, corruption_probability=0.1)
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'label': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
# # augmentation
|
||||
# # remove all non-alphanumerics
|
||||
# desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'label': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
return output_list
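# Resulting list size (per the logic above): with SHUFFLES=0 there is exactly
# one element per dataframe row; with SHUFFLES=n each row gains up to n extra
# shuffled copies, skipping any shuffle that reproduces the original ordering.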
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
|
||||
data_path = '../../biomedical_data_import/bc2gm_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
#########################################
|
||||
# training function
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
# max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# tokenize only the mention text here
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True, # enable truncation for efficiency
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text", # we only need the tokenization, not the original strings
|
||||
)
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
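# Minimal illustration (commented out, not executed during training): the
# collator pads each batch to its own longest sequence, which is why no fixed
# max_length is needed when tokenizing above.
# data_collator([tokenizer("abc"), tokenizer("abc def ghi")])['input_ids'].shape
# -> torch.Size([2, <length of the longer encoding>])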
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
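# Shape sketch: eval_preds is a (logits, labels) pair, so the argmax over
# axis 1 picks the highest-scoring class per example, e.g. (hypothetical values)
# np.argmax(np.array([[0.1, 2.0, -1.0], [1.5, 0.2, 0.3]]), axis=1)  # -> [1, 0]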
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! resize embeddings in case the tokenizer vocabulary was extended
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-3,
|
||||
per_device_train_batch_size=512,
|
||||
# per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=40,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator, # data_collator performs dynamic padding
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -0,0 +1,236 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 256
|
||||
|
||||
# %%
|
||||
# construct the target id list
|
||||
data_path = '../../../biomedical_data_import/bc5cdr-chemical_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
# target_id_list = [id for id in target_id_list]
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
row_id = row['entity_id']
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[row_id], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../../biomedical_data_import/bc5cdr-chemical_test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'test': Dataset.from_list(process_df_to_dict(test_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
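# Note (added): glob can return matches in arbitrary order, but training uses
# save_total_limit=1, so only one checkpoint-* directory should exist and
# taking index [0] is safe here.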
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# tokenize only the mention text here
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True,
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
|
||||
|
||||
# print datasets['test'] columns
|
||||
column_info = datasets['test'].features
|
||||
for column, dtype in column_info.items():
|
||||
print(f"Column: {column}, Type: {dtype}")
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! resize embeddings in case the tokenizer vocabulary was extended
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
dataloader = DataLoader(
|
||||
datasets['test'],
|
||||
batch_size=BATCH_SIZE,
|
||||
shuffle=False,
|
||||
collate_fn=data_collator)
|
||||
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['labels'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# save the classifier predictions for the test set here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
|
|
@ -0,0 +1,368 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=0 # 0 shuffles means it does not re-sample
|
||||
|
||||
# %%
|
||||
|
||||
# We want to map the entity_id values to a consecutive set of ids
|
||||
# import training file
|
||||
data_path = '../../biomedical_data_import/bc5cdr-chemical_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# collect the actual entity ids that occur in the training data
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
"""
|
||||
Generate n strings with randomly shuffled words from the input text.
|
||||
|
||||
Args:
|
||||
text (str): The input text.
|
||||
n (int): The number of random variations to generate.
|
||||
|
||||
Returns:
|
||||
list: A list of strings with shuffled words.
|
||||
"""
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
|
||||
|
||||
|
||||
# generate n more shuffled examples
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
"""
|
||||
Return the original string together with n randomly shuffled variants.
|
||||
|
||||
Args:
|
||||
text (str): The input string.
|
||||
n_shuffles (int): Number of random shuffles to generate for each string.
|
||||
|
||||
Returns:
|
||||
list: The original string followed by its shuffled variants.
|
||||
"""
|
||||
all_processed = []
|
||||
# add the original text
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
|
||||
|
||||
|
||||
######################################
|
||||
|
||||
# augmentation by text corruption
|
||||
|
||||
def corrupt_word(word):
|
||||
"""Corrupt a single word using random corruption techniques."""
|
||||
if len(word) <= 1: # Skip corruption for single-character words
|
||||
return word
|
||||
|
||||
corruption_type = random.choice(["delete", "swap"])
|
||||
|
||||
if corruption_type == "delete":
|
||||
# Randomly delete a character
|
||||
idx = random.randint(0, len(word) - 1)
|
||||
word = word[:idx] + word[idx + 1:]
|
||||
|
||||
elif corruption_type == "swap":
|
||||
# Swap two adjacent characters
|
||||
if len(word) > 1:
|
||||
idx = random.randint(0, len(word) - 2)
|
||||
word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
|
||||
|
||||
|
||||
return word
|
||||
|
||||
def corrupt_string(sentence, corruption_probability=0.01):
|
||||
"""Corrupt each word in the string with a given probability."""
|
||||
words = sentence.split()
|
||||
corrupted_words = [
|
||||
corrupt_word(word) if random.random() < corruption_probability else word
|
||||
for word in words
|
||||
]
|
||||
return " ".join(corrupted_words)
|
||||
|
||||
|
||||
#############################################################
|
||||
# Data Run code here
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
parent_desc = row['mention']
|
||||
if isinstance(parent_desc, float):
|
||||
print(parent_desc)
|
||||
parent_desc = f'{parent_desc}'
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
|
||||
# unaugmented data
|
||||
element = {
|
||||
'text' : parent_desc,
|
||||
'labels': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# # short sequences are rare, and we must compensate by including more examples
|
||||
# # mutation of other longer sequences might drown out rare short sequences
|
||||
# words = parent_desc.split()
|
||||
# word_count = len(words)
|
||||
# if word_count < 3:
|
||||
# for _ in range(10):
|
||||
# element = {
|
||||
# 'text': parent_desc,
|
||||
# 'labels': label2id[index],
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
# add shuffled strings
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# # corrupt string
|
||||
# desc = corrupt_string(parent_desc, corruption_probability=0.1)
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'labels': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
# # augmentation
|
||||
# # remove all non-alphanumerics
|
||||
# desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'labels': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
|
||||
data_path = '../../biomedical_data_import/bc5cdr-chemical_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
#########################################
|
||||
# training function
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
# max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# tokenize only the mention text here
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True, # enable truncation for efficiency
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text", # we only need the tokenization, not the original strings
|
||||
)
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! resize embeddings in case the tokenizer vocabulary was extended
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-3,
|
||||
per_device_train_batch_size=512,
|
||||
# per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=40,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator, # data_collator performs dynamic padding
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@ -0,0 +1,2 @@
|
|||
checkpoint*
|
||||
tensorboard-log
|
|
@ -0,0 +1 @@
|
|||
exports
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
*******************************************************************************
|
||||
Accuracy: 0.04872
|
||||
F1 Score: 0.04283
|
||||
Precision: 0.04903
|
||||
Recall: 0.04872
|
|
@ -0,0 +1,234 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 32
|
||||
|
||||
# %%
|
||||
# construct the target id list
|
||||
data_path = '../../../../biomedical_data_import/bc5cdr-chemical_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
# target_id_list = [id for id in target_id_list]
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
row_id = row['entity_id']
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[row_id], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../../../biomedical_data_import/bc5cdr-chemical_test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'test': Dataset.from_list(process_df_to_dict(test_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# tokenize only the mention text here
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True,
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
column_info = datasets['test'].features
|
||||
for column, dtype in column_info.items():
|
||||
print(f"Column: {column}, Type: {dtype}")
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! resize embeddings in case the tokenizer vocabulary was extended
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
dataloader = DataLoader(
|
||||
datasets['test'],
|
||||
batch_size=BATCH_SIZE,
|
||||
shuffle=False,
|
||||
collate_fn=data_collator)
|
||||
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['labels'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# save the classifier predictions for the test set here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
|
|
@ -0,0 +1,367 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=0 # 0 shuffles means it does not re-sample
|
||||
|
||||
# %%
|
||||
|
||||
# We want to map the entity_id values to a consecutive set of ids
|
||||
# import training file
|
||||
data_path = '../../../biomedical_data_import/bc5cdr-chemical_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# collect the actual entity ids that occur in the training data
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
"""
|
||||
Generate n strings with randomly shuffled words from the input text.
|
||||
|
||||
Args:
|
||||
text (str): The input text.
|
||||
n (int): The number of random variations to generate.
|
||||
|
||||
Returns:
|
||||
list: A list of strings with shuffled words.
|
||||
"""
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
|
||||
|
||||
|
||||
# generate n more shuffled examples
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
"""
|
||||
Return the original string together with n randomly shuffled variants.
|
||||
|
||||
Args:
|
||||
text (str): The input string.
|
||||
n_shuffles (int): Number of random shuffles to generate for each string.
|
||||
|
||||
Returns:
|
||||
list: The original string followed by its shuffled variants.
|
||||
"""
|
||||
all_processed = []
|
||||
# add the original text
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
|
||||
|
||||
|
||||
######################################
|
||||
|
||||
# augmentation by text corruption
|
||||
|
||||
def corrupt_word(word):
|
||||
"""Corrupt a single word using random corruption techniques."""
|
||||
if len(word) <= 1: # Skip corruption for single-character words
|
||||
return word
|
||||
|
||||
corruption_type = random.choice(["delete", "swap"])
|
||||
|
||||
if corruption_type == "delete":
|
||||
# Randomly delete a character
|
||||
idx = random.randint(0, len(word) - 1)
|
||||
word = word[:idx] + word[idx + 1:]
|
||||
|
||||
elif corruption_type == "swap":
|
||||
# Swap two adjacent characters
|
||||
if len(word) > 1:
|
||||
idx = random.randint(0, len(word) - 2)
|
||||
word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
|
||||
|
||||
|
||||
return word
|
||||
|
||||
def corrupt_string(sentence, corruption_probability=0.01):
|
||||
"""Corrupt each word in the string with a given probability."""
|
||||
words = sentence.split()
|
||||
corrupted_words = [
|
||||
corrupt_word(word) if random.random() < corruption_probability else word
|
||||
for word in words
|
||||
]
|
||||
return " ".join(corrupted_words)
|
||||
|
||||
|
||||
#############################################################
|
||||
# Data Run code here
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
parent_desc = row['mention']
|
||||
if isinstance(parent_desc, float):
|
||||
print(parent_desc)
|
||||
parent_desc = f'{parent_desc}'
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
|
||||
# unaugmented data
|
||||
element = {
|
||||
'text' : parent_desc,
|
||||
'labels': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# # short sequences are rare, and we must compensate by including more examples
|
||||
# # mutation of other longer sequences might drown out rare short sequences
|
||||
# words = parent_desc.split()
|
||||
# word_count = len(words)
|
||||
# if word_count < 3:
|
||||
# for _ in range(10):
|
||||
# element = {
|
||||
# 'text': parent_desc,
|
||||
# 'labels': label2id[index],
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
# add shuffled strings
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# # corrupt string
|
||||
# desc = corrupt_string(parent_desc, corruption_probability=0.1)
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'labels': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
# # augmentation
|
||||
# # remove all non-alphanumerics
|
||||
# desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'labels': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
|
||||
data_path = '../../../biomedical_data_import/bc5cdr-chemical_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
#########################################
|
||||
# training function
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
# max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# tokenize only the mention text here
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True, # enable truncation for efficiency
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text", # we only need the tokenization, not the original strings
|
||||
)
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! resize embeddings in case the tokenizer vocabulary was extended
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=64,
|
||||
# per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=40,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator, # data_collator performs dynamic padding
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
checkpoint*
|
||||
tensorboard-log
|
|
@ -0,0 +1 @@
|
|||
exports
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -0,0 +1,236 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 256
|
||||
|
||||
# %%
|
||||
# construct the target id list
|
||||
data_path = '../../../biomedical_data_import/bc2gm_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
# target_id_list = [id for id in target_id_list]
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
row_id = row['entity_id']
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[row_id], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../../biomedical_data_import/bc2gm_test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'test': Dataset.from_list(process_df_to_dict(test_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# tokenize only the mention text here
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True,
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
|
||||
|
||||
# print datasets['test'] columns
|
||||
column_info = datasets['test'].features
|
||||
for column, dtype in column_info.items():
|
||||
print(f"Column: {column}, Type: {dtype}")
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! resize embeddings in case the tokenizer vocabulary was extended
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
dataloader = DataLoader(
|
||||
datasets['test'],
|
||||
batch_size=BATCH_SIZE,
|
||||
shuffle=False,
|
||||
collate_fn=data_collator)
|
||||
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['labels'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# save the classifier predictions for the test set here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
|
|
@ -0,0 +1,368 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=0 # 0 shuffles means it does not re-sample
|
||||
|
||||
# %%
|
||||
|
||||
# We want to map the entity_id values to a consecutive set of ids
|
||||
# import training file
|
||||
data_path = '../../biomedical_data_import/bc2gm_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# collect the actual entity ids that occur in the training data
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
"""
|
||||
Generate n strings with randomly shuffled words from the input text.
|
||||
|
||||
Args:
|
||||
text (str): The input text.
|
||||
n (int): The number of random variations to generate.
|
||||
|
||||
Returns:
|
||||
list: A list of strings with shuffled words.
|
||||
"""
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
|
||||
|
||||
|
||||
# generate n more shuffled examples
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
"""
|
||||
Preprocess a text string and add n randomly shuffled variants of it.
|
||||
|
||||
Args:
|
||||
text (str): The input string.
|
||||
n_shuffles (int): Number of random shuffles to generate for each string.
|
||||
|
||||
Returns:
|
||||
list: A list of preprocessed and shuffled strings.
|
||||
"""
|
||||
all_processed = []
|
||||
# add the original text
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
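# Illustrative only (actual output depends on the RNG state, so this is not executed here):
# shuffle_text("red hat enterprise linux", n_shuffles=2) might return
#   ["red hat enterprise linux", "linux red enterprise hat", "hat linux red enterprise"]
# i.e. the original string followed by n_shuffles random word orderings.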
|
||||
|
||||
|
||||
######################################
|
||||
|
||||
# augmentation by text corruption
|
||||
|
||||
def corrupt_word(word):
|
||||
"""Corrupt a single word using random corruption techniques."""
|
||||
if len(word) <= 1: # Skip corruption for single-character words
|
||||
return word
|
||||
|
||||
corruption_type = random.choice(["delete", "swap"])
|
||||
|
||||
if corruption_type == "delete":
|
||||
# Randomly delete a character
|
||||
idx = random.randint(0, len(word) - 1)
|
||||
word = word[:idx] + word[idx + 1:]
|
||||
|
||||
elif corruption_type == "swap":
|
||||
# Swap two adjacent characters
|
||||
if len(word) > 1:
|
||||
idx = random.randint(0, len(word) - 2)
|
||||
word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
|
||||
|
||||
|
||||
return word
|
||||
|
||||
def corrupt_string(sentence, corruption_probability=0.01):
|
||||
"""Corrupt each word in the string with a given probability."""
|
||||
words = sentence.split()
|
||||
corrupted_words = [
|
||||
corrupt_word(word) if random.random() < corruption_probability else word
|
||||
for word in words
|
||||
]
|
||||
return " ".join(corrupted_words)
|
||||
|
||||
|
||||
#############################################################
|
||||
# Data processing code starts here
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
parent_desc = row['mention']
|
||||
if isinstance(parent_desc, float):
|
||||
print(parent_desc)
|
||||
parent_desc = f'{parent_desc}'
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
|
||||
# unaugmented data
|
||||
element = {
|
||||
'text' : parent_desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# # short sequences are rare, and we must compensate by including more examples
|
||||
# # mutation of other longer sequences might drown out rare short sequences
|
||||
# words = parent_desc.split()
|
||||
# word_count = len(words)
|
||||
# if word_count < 3:
|
||||
# for _ in range(10):
|
||||
# element = {
|
||||
# 'text': parent_desc,
|
||||
# 'label': label2id[index],
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
# add shuffled strings
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# # corrupt string
|
||||
# desc = corrupt_string(parent_desc, corruption_probability=0.1)
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'label': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
# # augmentation
|
||||
# # remove all non-alphanumerics
|
||||
# desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'label': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
|
||||
data_path = '../../biomedical_data_import/bc2gm_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
#########################################
|
||||
# training function
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
# max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# only the input text needs to be tokenized here;
|
||||
# the class label is kept in its own column
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True, # enable truncation for efficiency
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text", # we only need the tokenization, not the original strings
|
||||
)
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# resize embeddings in case the tokenizer vocabulary was extended
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-3,
|
||||
per_device_train_batch_size=512,
|
||||
# per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=40,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator, # data_collator performs dynamic padding
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@ -0,0 +1,2 @@
|
|||
checkpoint*
|
||||
tensorboard-log
|
|
@ -0,0 +1 @@
|
|||
exports
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -0,0 +1,236 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 256
|
||||
|
||||
# %%
|
||||
# construct the target id list
|
||||
data_path = '../../../biomedical_data_import/bc2gm_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
# target_id_list = [id for id in target_id_list]
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
row_id = row['entity_id']
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[row_id], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../../biomedical_data_import/bc2gm_test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'test': Dataset.from_list(process_df_to_dict(test_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# only the input text needs to be tokenized here;
|
||||
# the class label is kept in its own column
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True,
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
|
||||
|
||||
# print datasets['test'] columns
|
||||
column_info = datasets['test'].features
|
||||
for column, dtype in column_info.items():
|
||||
print(f"Column: {column}, Type: {dtype}")
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# resize embeddings in case the tokenizer vocabulary was extended
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
dataloader = DataLoader(
|
||||
datasets['test'],
|
||||
batch_size=BATCH_SIZE,
|
||||
shuffle=False,
|
||||
collate_fn=data_collator)
|
||||
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['labels'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
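# Note on the 'weighted' average used above (a sketch, not part of the original script):
# each class's score is weighted by its support. For example, with two classes of
# support 90 and 10 and per-class F1 of 0.9 and 0.5, the weighted F1 is
# 0.9 * 0.9 + 0.1 * 0.5 = 0.86, so frequent classes dominate the reported number.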
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# save the classification predictions here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
|
|
@ -0,0 +1,368 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=0 # 0 means no shuffled variants are added
|
||||
|
||||
# %%
|
||||
|
||||
# We want to map the entity_id values to a consecutive set of ids
|
||||
# import training file
|
||||
data_path = '../../biomedical_data_import/bc2gm_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# use the actual entity_id values from the data rather than a derived pattern
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
"""
|
||||
Generate n strings with randomly shuffled words from the input text.
|
||||
|
||||
Args:
|
||||
text (str): The input text.
|
||||
n (int): The number of random variations to generate.
|
||||
|
||||
Returns:
|
||||
list: A list of strings with shuffled words.
|
||||
"""
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
|
||||
|
||||
|
||||
# generate n more shuffled examples
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
"""
|
||||
Preprocess a text string and add n randomly shuffled variants of it.
|
||||
|
||||
Args:
|
||||
text (str): The input string.
|
||||
n_shuffles (int): Number of random shuffles to generate for each string.
|
||||
|
||||
Returns:
|
||||
list: A list of preprocessed and shuffled strings.
|
||||
"""
|
||||
all_processed = []
|
||||
# add the original text
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
|
||||
|
||||
|
||||
######################################
|
||||
|
||||
# augmentation by text corruption
|
||||
|
||||
def corrupt_word(word):
|
||||
"""Corrupt a single word using random corruption techniques."""
|
||||
if len(word) <= 1: # Skip corruption for single-character words
|
||||
return word
|
||||
|
||||
corruption_type = random.choice(["delete", "swap"])
|
||||
|
||||
if corruption_type == "delete":
|
||||
# Randomly delete a character
|
||||
idx = random.randint(0, len(word) - 1)
|
||||
word = word[:idx] + word[idx + 1:]
|
||||
|
||||
elif corruption_type == "swap":
|
||||
# Swap two adjacent characters
|
||||
if len(word) > 1:
|
||||
idx = random.randint(0, len(word) - 2)
|
||||
word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
|
||||
|
||||
|
||||
return word
|
||||
|
||||
def corrupt_string(sentence, corruption_probability=0.01):
|
||||
"""Corrupt each word in the string with a given probability."""
|
||||
words = sentence.split()
|
||||
corrupted_words = [
|
||||
corrupt_word(word) if random.random() < corruption_probability else word
|
||||
for word in words
|
||||
]
|
||||
return " ".join(corrupted_words)
|
||||
|
||||
|
||||
#############################################################
|
||||
# Data processing code starts here
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
parent_desc = row['mention']
|
||||
if isinstance(parent_desc, float):
|
||||
print(parent_desc)
|
||||
parent_desc = f'{parent_desc}'
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
|
||||
# unaugmented data
|
||||
element = {
|
||||
'text' : parent_desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# # short sequences are rare, and we must compensate by including more examples
|
||||
# # mutation of other longer sequences might drown out rare short sequences
|
||||
# words = parent_desc.split()
|
||||
# word_count = len(words)
|
||||
# if word_count < 3:
|
||||
# for _ in range(10):
|
||||
# element = {
|
||||
# 'text': parent_desc,
|
||||
# 'label': label2id[index],
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
# add shuffled strings
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# # corrupt string
|
||||
# desc = corrupt_string(parent_desc, corruption_probability=0.1)
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'label': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
# # augmentation
|
||||
# # remove all non-alphanumerics
|
||||
# desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'label': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
|
||||
data_path = '../../biomedical_data_import/bc2gm_train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
#########################################
|
||||
# training function
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
# max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# only the input text needs to be tokenized here;
|
||||
# the class label is kept in its own column
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True, # enable truncation for efficiency
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text", # we only need the tokenization, not the original strings
|
||||
)
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# resize embeddings in case the tokenizer vocabulary was extended
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-3,
|
||||
per_device_train_batch_size=512,
|
||||
# per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=40,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator, # data_collator performs dynamic padding
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@ -0,0 +1,2 @@
|
|||
checkpoint*
|
||||
tensorboard-log
|
|
@ -0,0 +1,388 @@
|
|||
# %%
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments,
|
||||
TrainerCallback
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import math
|
||||
from functools import partial
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore", message='Was asked to gather along dimension 0')
|
||||
warnings.filterwarnings("ignore", message='FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated.')
|
||||
|
||||
# import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
# %%
|
||||
# PARAMETERS
|
||||
SAMPLES=20
|
||||
SHUFFLES=5
|
||||
AMPLIFY_FACTOR=5
|
||||
|
||||
# %%
|
||||
###################################################
|
||||
# import code
|
||||
# import training file
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# use the actual entity_id values from the data rather than a derived pattern
|
||||
entity_ids = df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
df["training_id"] = df["entity_id"].map(label2id)
|
||||
|
||||
# %%
|
||||
##############################################################
|
||||
# augmentation code
|
||||
|
||||
# basic preprocessing
|
||||
def preprocess_text(text):
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
|
||||
|
||||
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
all_processed = []
|
||||
# add the original text
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
|
||||
|
||||
def corrupt_word(word):
|
||||
"""Corrupt a single word using random corruption techniques."""
|
||||
if len(word) <= 1: # Skip corruption for single-character words
|
||||
return word
|
||||
|
||||
corruption_type = random.choice(["delete", "swap"])
|
||||
|
||||
if corruption_type == "delete":
|
||||
# Randomly delete a character
|
||||
idx = random.randint(0, len(word) - 1)
|
||||
word = word[:idx] + word[idx + 1:]
|
||||
|
||||
elif corruption_type == "swap":
|
||||
# Swap two adjacent characters
|
||||
if len(word) > 1:
|
||||
idx = random.randint(0, len(word) - 2)
|
||||
word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
|
||||
|
||||
|
||||
return word
|
||||
|
||||
def corrupt_string(sentence, corruption_probability=0.01):
|
||||
"""Corrupt each word in the string with a given probability."""
|
||||
words = sentence.split()
|
||||
corrupted_words = [
|
||||
corrupt_word(word) if random.random() < corruption_probability else word
|
||||
for word in words
|
||||
]
|
||||
return " ".join(corrupted_words)
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
def create_example(index, mention):
|
||||
return {'training_id': index, 'mention': mention}
|
||||
|
||||
# augment whole dataset
|
||||
def augment_data(df):
|
||||
output_list = []
|
||||
|
||||
for idx,row in df.iterrows():
|
||||
index = row['training_id']
|
||||
parent_desc = row['mention']
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
|
||||
# add basic example
|
||||
output_list.append(create_example(index, parent_desc))
|
||||
|
||||
# add shuffled strings
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
output_list.append(create_example(index, desc))
|
||||
|
||||
# add corrupted strings
|
||||
desc = corrupt_string(parent_desc, corruption_probability=0.1)
|
||||
if (desc != parent_desc):
|
||||
output_list.append(create_example(index, desc))
|
||||
|
||||
# add example with stripped non-alphanumerics
|
||||
desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
if (desc != parent_desc):
|
||||
output_list.append(create_example(index, desc))
|
||||
|
||||
# short sequence amplifier
|
||||
# short sequences are rare, and we must compensate by including more examples
|
||||
# also, short sequences are barely affected by shuffling
|
||||
words = parent_desc.split()
|
||||
word_count = len(words)
|
||||
if word_count <= 2:
|
||||
for _ in range(AMPLIFY_FACTOR):
|
||||
output_list.append(create_example(index, parent_desc)) # amplify the original short mention
|
||||
|
||||
new_df = pd.DataFrame(output_list)
|
||||
return new_df
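# Rough sketch of what augment_data yields per input row (illustrative counts,
# assuming SHUFFLES=5 and AMPLIFY_FACTOR=5): the preprocessed original, up to 5
# distinct word-shuffled variants, up to 1 character-corrupted variant, up to 1
# punctuation-stripped variant, and, for mentions of 2 words or fewer, 5 extra copies.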
|
||||
|
||||
|
||||
###############################################################
|
||||
# regeneration code
|
||||
# %%
|
||||
# we want to sample n samples from each class
|
||||
# sample_size refers to the number of samples per class
|
||||
def sample_from_df(df, sample_size_per_class=5):
|
||||
sampled_df = (df.groupby("training_id")[['training_id', 'mention']] # explicitly select the columns to keep
|
||||
.apply(lambda x: x.sample(n=min(sample_size_per_class, len(x))))
|
||||
.reset_index(drop=True))
|
||||
|
||||
return sampled_df
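# Illustrative only (df.sample draws from the global RNG, so this is not executed here):
# with a frame holding 12 rows of training_id 0 and 3 rows of training_id 1,
# sample_from_df(df, sample_size_per_class=5) returns 5 rows for class 0 and all
# 3 rows for class 1, since min(sample_size_per_class, len(x)) caps each draw.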
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
class DynamicDataset(Dataset):
|
||||
def __init__(self, df, sample_size_per_class, tokenizer):
|
||||
"""
|
||||
Args:
|
||||
df (pd.DataFrame): Original DataFrame with class (id) and data columns.
|
||||
sample_size_per_class (int): Number of samples to draw per class for each epoch.
|
||||
"""
|
||||
self.df = df
|
||||
self.sample_size_per_class = sample_size_per_class
|
||||
self.tokenizer = tokenizer
|
||||
self.current_data = None
|
||||
self.regenerate_data() # Generate the initial dataset
|
||||
|
||||
def regenerate_data(self):
|
||||
"""
|
||||
Generate a new sampled dataset for the current epoch.
|
||||
|
||||
This method is invoked by the regeneration callback at the start of each epoch;
|
||||
each call rebuilds self.current_data, which lets us:
|
||||
|
||||
- re-sample the dataframe for a fresh set of sample_size_per_class rows per class
|
||||
- generate fresh augmentations for the sampled rows
|
||||
|
||||
The training data is therefore re-sampled and re-augmented at the start of every epoch.
|
||||
"""
|
||||
# Sample `sample_size_per_class` rows per class
|
||||
sampled_df = sample_from_df(self.df, self.sample_size_per_class)
|
||||
|
||||
# additional per-epoch transformations can be added here
|
||||
sampled_df = augment_data(sampled_df)
|
||||
|
||||
# perform tokenization here
|
||||
# Batch tokenize the entire column of data
|
||||
tokenized_batch = self.tokenizer(
|
||||
sampled_df["mention"].to_list(), # Pass all text data at once
|
||||
truncation=True,
|
||||
# return_tensors="pt" # disabled because pt requires equal length tensors
|
||||
)
|
||||
|
||||
# Store the tokenized data with labels
|
||||
self.current_data = [
|
||||
{
|
||||
"input_ids": torch.tensor(tokenized_batch["input_ids"][i]),
|
||||
"attention_mask": torch.tensor(tokenized_batch["attention_mask"][i]),
|
||||
"labels": torch.tensor(sampled_df.iloc[i]["training_id"]) # Include the label
|
||||
}
|
||||
for i in range(len(sampled_df))
|
||||
]
|
||||
|
||||
|
||||
def __len__(self):
|
||||
return len(self.current_data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
return self.current_data[idx]
|
||||
|
||||
# %%
|
||||
class RegenerateDatasetCallback(TrainerCallback):
|
||||
def __init__(self, dataset):
|
||||
self.dataset = dataset
|
||||
|
||||
def on_epoch_begin(self, args, state, control, **kwargs):
|
||||
print(f"Epoch {int(math.ceil(state.epoch + 1))}: Regenerating dataset")
|
||||
self.dataset.regenerate_data()
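# Hugging Face's Trainer fires on_epoch_begin before every training epoch, so pointing
# this callback at the same DynamicDataset instance that is passed as train_dataset
# (done in train() below) means each epoch trains on a freshly re-sampled,
# re-augmented, and re-tokenized view of the data.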
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
def custom_collate_fn(batch):
|
||||
# Dynamically pad tensors to the longest sequence in the batch
|
||||
input_ids = [item["input_ids"] for item in batch]
|
||||
attention_masks = [item["attention_mask"] for item in batch]
|
||||
labels = torch.stack([item["labels"] for item in batch])
|
||||
|
||||
# Pad inputs to the same length
|
||||
input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
|
||||
attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
|
||||
|
||||
return {
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_masks,
|
||||
"labels": labels
|
||||
}
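# A minimal sketch of the dynamic padding above (made-up token ids, not dataset values):
_example_batch = custom_collate_fn([
    {"input_ids": torch.tensor([101, 2054, 102]), "attention_mask": torch.tensor([1, 1, 1]), "labels": torch.tensor(0)},
    {"input_ids": torch.tensor([101, 102]), "attention_mask": torch.tensor([1, 1]), "labels": torch.tensor(1)},
])
# _example_batch["input_ids"] has shape (2, 3); the shorter sequence is right-padded with zeros.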
|
||||
|
||||
|
||||
##########################################################################
|
||||
# training code
|
||||
# %%
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, clean_up_tokenization_spaces=True)
|
||||
|
||||
# make the dataset
|
||||
|
||||
|
||||
# Define the callback
|
||||
lean_df = df.drop(columns=['entity_name'])
|
||||
dynamic_dataset = DynamicDataset(df = lean_df, sample_size_per_class=SAMPLES, tokenizer=tokenizer)
|
||||
|
||||
# create the regeneration callback
|
||||
regeneration_callback = RegenerateDatasetCallback(dynamic_dataset)
|
||||
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
save_strategy="steps",
|
||||
save_steps=500,
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=64,
|
||||
# per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=120,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=dynamic_dataset,
|
||||
tokenizer=tokenizer,
|
||||
data_collator=custom_collate_fn,
|
||||
compute_metrics=compute_metrics,
|
||||
callbacks=[regeneration_callback]
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@ -0,0 +1 @@
|
|||
exports
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
*******************************************************************************
|
||||
Accuracy: 0.76958
|
||||
F1 Score: 0.79382
|
||||
Precision: 0.88705
|
||||
Recall: 0.76958
|
|
@ -0,0 +1,2 @@
|
|||
checkpoint*
|
||||
tensorboard-log
|
|
@ -0,0 +1 @@
|
|||
exports
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
*******************************************************************************
|
||||
Accuracy: 0.80689
|
||||
F1 Score: 0.82527
|
||||
Precision: 0.89684
|
||||
Recall: 0.80689
|
|
@ -0,0 +1,264 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 256
|
||||
|
||||
# %%
|
||||
# construct the target id list
|
||||
# data_path = '../../../esAppMod_data_import/train.csv'
|
||||
data_path = '../../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# use the actual entity_id values from the data rather than a derived pattern
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
index = row['entity_id']
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../../esAppMod_data_import/test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
# combined_data = DatasetDict({
|
||||
# 'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
# })
|
||||
return Dataset.from_list(process_df_to_dict(test_df))
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
# %%
|
||||
# compute max token length
|
||||
max_length = 0
|
||||
for sample in test_dataset['text']:
|
||||
# Tokenize the sample and get the length
|
||||
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
|
||||
length = len(input_ids)
|
||||
|
||||
# Update max_length if this sample is longer
|
||||
if length > max_length:
|
||||
max_length = length
|
||||
|
||||
print(max_length)
|
||||
|
||||
# %%
|
||||
|
||||
max_length = 128
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# only the input text needs to be tokenized here;
|
||||
# the class label is kept in its own column
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
# truncation=True,
|
||||
padding='max_length'
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
# metric = evaluate.load("accuracy")
|
||||
#
|
||||
#
|
||||
# def compute_metrics(eval_preds):
|
||||
# preds, labels = eval_preds
|
||||
# preds = np.argmax(preds, axis=1)
|
||||
# return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# resize embeddings in case the tokenizer vocabulary was extended
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
|
||||
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['label'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# save the classification predictions here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
|
|
@ -0,0 +1,558 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=5
|
||||
|
||||
# %%
|
||||
|
||||
# import training file
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# use the actual entity_id values from the data rather than a derived pattern
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
"""
|
||||
Generate n strings with randomly shuffled words from the input text.
|
||||
|
||||
Args:
|
||||
text (str): The input text.
|
||||
n (int): The number of random variations to generate.
|
||||
|
||||
Returns:
|
||||
list: A list of strings with shuffled words.
|
||||
"""
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
|
||||
|
||||
|
||||
# generate n more shuffled examples
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
"""
|
||||
Preprocess a text string and add n randomly shuffled variants of it.
|
||||
|
||||
Args:
|
||||
text (str): The input string.
|
||||
n_shuffles (int): Number of random shuffles to generate for each string.
|
||||
|
||||
Returns:
|
||||
list: A list of preprocessed and shuffled strings.
|
||||
"""
|
||||
all_processed = []
|
||||
# add the original text
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
|
||||
|
||||
acronym_mapping = {
|
||||
'hpsa': 'hp server automation',
|
||||
'tam': 'tivoli access manager',
|
||||
'adf': 'application development facility',
|
||||
'html': 'hypertext markup language',
|
||||
'wff': 'microsoft web farm framework',
|
||||
'jsp': 'javaserver pages',
|
||||
'bw': 'business works',
|
||||
'ssrs': 'sql server reporting services',
|
||||
'cl': 'control language',
|
||||
'vba': 'visual basic for applications',
|
||||
'esapi': 'enterprise security api',
|
||||
'gwt': 'google web toolkit',
|
||||
'pki': 'perkin elmer informatics',
|
||||
'rtd': 'oracle realtime decisions',
|
||||
'jms': 'java message service',
|
||||
'db': 'database',
|
||||
'soa': 'service oriented architecture',
|
||||
'xsl': 'extensible stylesheet language',
|
||||
'com': 'component object model',
|
||||
'ldap': 'lightweight directory access protocol',
|
||||
'odm': 'ibm operational decision manager',
|
||||
'soql': 'salesforce object query language',
|
||||
'oms': 'order management system',
|
||||
'cfml': 'coldfusion markup language',
|
||||
'nas': 'netscape application server',
|
||||
'sql': 'structured query language',
|
||||
'bde': 'borland database engine',
|
||||
'imap': 'internet message access protocol',
|
||||
'uws': 'ultidev web server',
|
||||
'birt': 'business intelligence and reporting tools',
|
||||
'mdw': 'model driven workflow',
|
||||
'tws': 'tivoli workload scheduler',
|
||||
'jre': 'java runtime environment',
|
||||
'wcs': 'websphere commerce suite',
|
||||
'was': 'websphere application server',
|
||||
'ssis': 'sql server integration services',
|
||||
'xhtml': 'extensible hypertext markup language',
|
||||
'soap': 'simple object access protocol',
|
||||
'san': 'storage area network',
|
||||
'elk': 'elastic stack',
|
||||
'arr': 'application request routing',
|
||||
'xlst': 'extensible stylesheet language transformations',
|
||||
'sccm': 'microsoft endpoint configuration manager',
|
||||
'ejb': 'enterprise java beans',
|
||||
'css': 'cascading style sheets',
|
||||
'hpoo': 'hp operations orchestration',
|
||||
'xml': 'extensible markup language',
|
||||
'esb': 'enterprise service bus',
|
||||
'edi': 'electronic data interchange',
|
||||
'imsva': 'interscan messaging security virtual appliance',
|
||||
'wtx': 'ibm websphere transformation extender',
|
||||
'cgi': 'common gateway interface',
|
||||
'bal': 'ibm basic assembly language',
|
||||
'issow': 'integrated safe system of work',
|
||||
'dcl': 'data control language',
|
||||
'jdom': 'java document object model',
|
||||
'fim': 'microsoft forefront identity manager',
|
||||
'npl': 'niakwa programming language',
|
||||
'wf': 'windows workflow foundation',
|
||||
'lm': 'etap license manager',
|
||||
'wts': 'windows terminal server',
|
||||
'asp': 'active server pages',
|
||||
'jil': 'job information language',
|
||||
'mvc': 'model view controller',
|
||||
'rmi': 'remote method invocation',
|
||||
'ad': 'active directory',
|
||||
'owb': 'oracle warehouse builder',
|
||||
'rest': 'representational state transfer',
|
||||
'jdk': 'java development kit',
|
||||
'ids': 'integrated data store',
|
||||
'bms': 'batch management software',
|
||||
'vsx': 'vmware solution exchange',
|
||||
'ssas': 'sql server analysis services',
|
||||
'atl': 'atlas transformation language',
|
||||
'ice': 'infobright community edition',
|
||||
'esql': 'extended structured query language',
|
||||
'corba': 'common object request broker architecture',
|
||||
'dpe': 'device provisioning engines',
|
||||
'rac': 'oracle real application clusters',
|
||||
'iemt': 'iis easy migration tool',
|
||||
'mes': 'manufacturing execution system',
|
||||
'odbc': 'open database connectivity',
|
||||
'lms': 'lan management solution',
|
||||
'wcf': 'windows communication foundation',
|
||||
'nes': 'netscape enterprise server',
|
||||
'jsf': 'javaserver faces',
|
||||
'alm': 'application lifecycle management',
|
||||
'hlasm': 'high level assembler',
|
||||
'cmod': 'content manager ondemand'}
|
||||
|
||||
external_source = {
|
||||
'vb.net': 'visual basic dot net',
|
||||
'jes': 'job entry subsystem',
|
||||
'svn': 'subversion',
|
||||
'vcs': 'version control system',
|
||||
'lims': 'laboratory information management system',
|
||||
'ide': 'integrated development environment',
|
||||
'sdk': 'software development kit',
|
||||
'mq': 'message queue',
|
||||
'ims': 'information management system',
|
||||
'isa': 'internet security and acceleration',
|
||||
'vs': 'visual studio',
|
||||
'esr': 'extended support release',
|
||||
'ff': 'firefox',
|
||||
'vb': 'visual basic',
|
||||
'rhel': 'red hat enterprise linux',
|
||||
'iis': 'internet information server',
|
||||
'api': 'application programming interface',
|
||||
'se': 'standard edition',
|
||||
'\.net': 'dot net',
|
||||
'c#': 'c sharp'
|
||||
}
|
||||
|
||||
|
||||
# synonyms = {
|
||||
# 'windows server': 'windows nt',
|
||||
# 'windows 7': 'windows desktop',
|
||||
# 'windows 8': 'windows desktop',
|
||||
# 'windows 10': 'windows desktop'
|
||||
# }
|
||||
|
||||
|
||||
# add more information
|
||||
acronym_mapping.update(external_source)
|
||||
|
||||
|
||||
abbrev_to_term = {rf'\b{key}\b': value for key, value in acronym_mapping.items()}
term_to_abbrev = {rf'\b{value}\b': key for key, value in acronym_mapping.items()}
|
||||
|
||||
def replace_terms_with_abbreviations(text):
|
||||
for input, replacement in term_to_abbrev.items():
|
||||
text = re.sub(input, replacement, text)
|
||||
return text
|
||||
|
||||
def replace_abbreviations_with_terms(text):
|
||||
for input, replacement in abbrev_to_term.items():
|
||||
text = re.sub(input, replacement, text)
|
||||
return text
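# %%
# Hedged sanity check of the two replacement maps (example strings are
# illustrative only; assumes the raw-string word-boundary patterns above):
print(replace_terms_with_abbreviations('runs on websphere application server'))  # -> 'runs on was'
print(replace_abbreviations_with_terms('jre missing'))  # -> 'java runtime environment missing'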
|
||||
|
||||
######################################
|
||||
|
||||
# augmentation by text corruption
|
||||
|
||||
def corrupt_word(word):
|
||||
"""Corrupt a single word using random corruption techniques."""
|
||||
if len(word) <= 1: # Skip corruption for single-character words
|
||||
return word
|
||||
|
||||
corruption_type = random.choice(["delete", "swap"])
|
||||
|
||||
if corruption_type == "delete":
|
||||
# Randomly delete a character
|
||||
idx = random.randint(0, len(word) - 1)
|
||||
word = word[:idx] + word[idx + 1:]
|
||||
|
||||
elif corruption_type == "swap":
|
||||
# Swap two adjacent characters
|
||||
if len(word) > 1:
|
||||
idx = random.randint(0, len(word) - 2)
|
||||
word = (word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])
|
||||
|
||||
|
||||
return word
|
||||
|
||||
def corrupt_string(sentence, corruption_probability=0.01):
|
||||
"""Corrupt each word in the string with a given probability."""
|
||||
words = sentence.split()
|
||||
corrupted_words = [
|
||||
corrupt_word(word) if random.random() < corruption_probability else word
|
||||
for word in words
|
||||
]
|
||||
return " ".join(corrupted_words)
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
label_flag_list = []
|
||||
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
parent_desc = row['mention']
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
|
||||
# unaugmented data
|
||||
element = {
|
||||
'text' : parent_desc,
|
||||
'labels': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# short sequences are rare, and we must compensate by including more examples
|
||||
# mutation of other longer sequences might drown out rare short sequences
|
||||
words = parent_desc.split()
|
||||
word_count = len(words)
|
||||
if word_count < 3:
|
||||
for _ in range(10):
|
||||
element = {
|
||||
'text': parent_desc,
|
||||
'labels': label2id[index],
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# check if label is in label_flag_list
|
||||
if index not in label_flag_list:
|
||||
|
||||
entity_name = row['entity_name']
|
||||
# add the "entity_name" label as a mention
|
||||
element = {
|
||||
'text': entity_name,
|
||||
'labels': label2id[index],
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# remove all non-alphanumerics
|
||||
desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# add shuffles of the original entity name
|
||||
no_of_shuffles = SHUFFLES
|
||||
processed_descs = shuffle_text(entity_name, n_shuffles=no_of_shuffles)
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
label_flag_list.append(index)
|
||||
|
||||
|
||||
|
||||
# add shuffled strings
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=SHUFFLES)
|
||||
for desc in processed_descs:
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# corrupt string
|
||||
desc = corrupt_string(parent_desc, corruption_probability=0.1)
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# augmentation
|
||||
# remove all non-alphanumerics
|
||||
desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
if (desc != parent_desc):
|
||||
element = {
|
||||
'text' : desc,
|
||||
'labels': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# # augmentation
|
||||
# # perform abbrev_to_term
|
||||
# temp_desc = re.sub(r'[^\w\s]', ' ', parent_desc) # Retains only alphanumeric and spaces
|
||||
# desc = replace_terms_with_abbreviations(temp_desc)
|
||||
# if (desc != temp_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'label': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
# # augmentation
|
||||
# # perform term to abbrev
|
||||
# desc = replace_abbreviations_with_terms(parent_desc)
|
||||
# if (desc != parent_desc):
|
||||
# element = {
|
||||
# 'text' : desc,
|
||||
# 'label': label2id[index], # ensure labels starts from 0
|
||||
# }
|
||||
# output_list.append(element)
|
||||
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# text_target sets the corresponding label to inputs
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True,
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=64,
|
||||
per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=40,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@ -0,0 +1 @@
exports
@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@ -0,0 +1 @@
exports
@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@ -0,0 +1 @@
exports
@ -0,0 +1,6 @@

*******************************************************************************
Accuracy: 0.80689
F1 Score: 0.82527
Precision: 0.89684
Recall: 0.80689
@ -0,0 +1,264 @@
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 256
|
||||
|
||||
# %%
|
||||
# construct the target id list
|
||||
# data_path = '../../../esAppMod_data_import/train.csv'
|
||||
data_path = '../../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# rather than use pattern, we use the real thing and property
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
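# %%
# Quick example of the pre-processing (illustrative input string): lowercasing
# plus whitespace normalisation.
print(preprocess_text('  WebSphere   MQ  '))  # -> 'websphere mq'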
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
index = row['entity_id']
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../../esAppMod_data_import/test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
# combined_data = DatasetDict({
|
||||
# 'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
# })
|
||||
return Dataset.from_list(process_df_to_dict(test_df))
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
# %%
|
||||
# compute max token length
|
||||
max_length = 0
|
||||
for sample in test_dataset['text']:
|
||||
# Tokenize the sample and get the length
|
||||
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
|
||||
length = len(input_ids)
|
||||
|
||||
# Update max_length if this sample is longer
|
||||
if length > max_length:
|
||||
max_length = length
|
||||
|
||||
print(max_length)
|
||||
|
||||
# %%
|
||||
|
||||
max_length = 128
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# text_target sets the corresponding label to inputs
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
# truncation=True,
|
||||
padding='max_length'
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
# metric = evaluate.load("accuracy")
|
||||
#
|
||||
#
|
||||
# def compute_metrics(eval_preds):
|
||||
# preds, labels = eval_preds
|
||||
# preds = np.argmax(preds, axis=1)
|
||||
# return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
|
||||
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['label'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# we can save the t5 generation output here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
@ -45,7 +45,7 @@ def set_seed(seed):

set_seed(42)

SHUFFLES=10
SHUFFLES=5

# %%

@ -411,15 +411,15 @@ def process_df_to_dict(df):
# }
# output_list.append(element)

# augmentation
# perform term to abbrev
desc = replace_abbreviations_with_terms(parent_desc)
if (desc != parent_desc):
element = {
'text' : desc,
'label': label2id[index], # ensure labels starts from 0
}
output_list.append(element)
# # augmentation
# # perform term to abbrev
# desc = replace_abbreviations_with_terms(parent_desc)
# if (desc != parent_desc):
# element = {
# 'text' : desc,
# 'label': label2id[index], # ensure labels starts from 0
# }
# output_list.append(element)


return output_list
@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@ -0,0 +1,273 @@
# %%
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments,
|
||||
TrainerCallback
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from functools import partial
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore", message='Was asked to gather along dimension 0')
|
||||
warnings.filterwarnings("ignore", message='FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated.')
|
||||
|
||||
# import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
# %%
|
||||
# PARAMETERS
|
||||
SAMPLES=20
|
||||
|
||||
# %%
|
||||
###################################################
|
||||
# import code
|
||||
# import training file
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# rather than use pattern, we use the real thing and property
|
||||
entity_ids = df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
df["training_id"] = df["entity_id"].map(label2id)
|
||||
|
||||
###############################################################
|
||||
# regeneration code
|
||||
# %%
|
||||
# we want to sample n samples from each class
|
||||
# sample_size refers to the number of samples per class
|
||||
def sample_from_df(df, sample_size_per_class=5):
|
||||
sampled_df = (df.groupby( "training_id")[['training_id', 'mention']] # explicit give column names
|
||||
.apply(lambda x: x.sample(n=min(sample_size_per_class, len(x))))
|
||||
.reset_index(drop=True))
|
||||
|
||||
return sampled_df
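# %%
# Quick sanity check (a sketch; `_sampled` is an illustrative name and the exact
# rows depend on the sampling RNG): drawing 2 mentions per class gives at most
# 2 * n_classes rows, with classes holding fewer than 2 mentions contributing less.
_sampled = sample_from_df(df, sample_size_per_class=2)
print(len(_sampled), _sampled['training_id'].nunique())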
|
||||
|
||||
|
||||
# %%
|
||||
# augment whole dataset
|
||||
# for now, we just return the same df
|
||||
def augment_data(df):
|
||||
return df
|
||||
|
||||
# %%
|
||||
class DynamicDataset(Dataset):
|
||||
def __init__(self, df, sample_size_per_class, tokenizer):
|
||||
"""
|
||||
Args:
|
||||
df (pd.DataFrame): Original DataFrame with class (id) and data columns.
|
||||
sample_size_per_class (int): Number of samples to draw per class for each epoch.
|
||||
"""
|
||||
self.df = df
|
||||
self.sample_size_per_class = sample_size_per_class
|
||||
self.tokenizer = tokenizer
|
||||
self.current_data = None
|
||||
self.regenerate_data() # Generate the initial dataset
|
||||
|
||||
def regenerate_data(self):
|
||||
"""
|
||||
Generate a new sampled dataset for the current epoch.
|
||||
|
||||
dynamic callback function to regenerate data each time we call this
|
||||
method, it updates the current_data we can:
|
||||
|
||||
- re-sample the dataframe for a new set of n_samples
|
||||
- generate fresh augmentations this effectively
|
||||
|
||||
This allows us to re-sample and re-augment at the start of each epoch
|
||||
"""
|
||||
# Sample `sample_size_per_class` rows per class
|
||||
sampled_df = sample_from_df(self.df, self.sample_size_per_class)
|
||||
|
||||
# perform future edits here
|
||||
sampled_df = augment_data(sampled_df)
|
||||
|
||||
# perform tokenization here
|
||||
# Batch tokenize the entire column of data
|
||||
tokenized_batch = self.tokenizer(
|
||||
sampled_df["mention"].to_list(), # Pass all text data at once
|
||||
truncation=True,
|
||||
# return_tensors="pt" # disabled because pt requires equal length tensors
|
||||
)
|
||||
|
||||
# Store the tokenized data with labels
|
||||
self.current_data = [
|
||||
{
|
||||
"input_ids": torch.tensor(tokenized_batch["input_ids"][i]),
|
||||
"attention_mask": torch.tensor(tokenized_batch["attention_mask"][i]),
|
||||
"labels": torch.tensor(sampled_df.iloc[i]["training_id"]) # Include the label
|
||||
}
|
||||
for i in range(len(sampled_df))
|
||||
]
|
||||
|
||||
|
||||
def __len__(self):
|
||||
return len(self.current_data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
return self.current_data[idx]
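# %%
# Usage sketch (left commented out; `dynamic_dataset` is only constructed inside
# train() below): calling regenerate_data() between epochs draws a fresh
# per-class sample, so the item behind a given index changes from epoch to epoch.
# dynamic_dataset.regenerate_data()
# print(dynamic_dataset[0])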
|
||||
|
||||
# %%
|
||||
class RegenerateDatasetCallback(TrainerCallback):
|
||||
def __init__(self, dataset):
|
||||
self.dataset = dataset
|
||||
|
||||
def on_epoch_begin(self, args, state, control, **kwargs):
|
||||
print(f"Epoch {state.epoch + 1}: Regenerating dataset")
|
||||
self.dataset.regenerate_data()
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
def custom_collate_fn(batch):
|
||||
# Dynamically pad tensors to the longest sequence in the batch
|
||||
input_ids = [item["input_ids"] for item in batch]
|
||||
attention_masks = [item["attention_mask"] for item in batch]
|
||||
labels = torch.stack([item["labels"] for item in batch])
|
||||
|
||||
# Pad inputs to the same length
|
||||
input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
|
||||
attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
|
||||
|
||||
return {
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_masks,
|
||||
"labels": labels
|
||||
}
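# %%
# Minimal padding demo for custom_collate_fn (a sketch with made-up token ids,
# not real tokenizer output): items of different lengths are padded to the
# longest sequence in the batch.
_demo_batch = [
    {"input_ids": torch.tensor([101, 7592, 102]),
     "attention_mask": torch.tensor([1, 1, 1]),
     "labels": torch.tensor(0)},
    {"input_ids": torch.tensor([101, 7592, 2088, 999, 102]),
     "attention_mask": torch.tensor([1, 1, 1, 1, 1]),
     "labels": torch.tensor(1)},
]
print(custom_collate_fn(_demo_batch)["input_ids"].shape)  # torch.Size([2, 5])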
|
||||
|
||||
|
||||
##########################################################################
|
||||
# training code
|
||||
# %%
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, clean_up_tokenization_spaces=True)
|
||||
|
||||
# make the dataset
|
||||
|
||||
|
||||
# Define the callback
|
||||
lean_df = df.drop(columns=['entity_name'])
|
||||
dynamic_dataset = DynamicDataset(df = lean_df, sample_size_per_class=10, tokenizer=tokenizer)
|
||||
|
||||
# create the regeneration callback
|
||||
regeneration_callback = RegenerateDatasetCallback(dynamic_dataset)
|
||||
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=64,
|
||||
per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=120,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=dynamic_dataset,
|
||||
tokenizer=tokenizer,
|
||||
data_collator=custom_collate_fn,
|
||||
compute_metrics=compute_metrics,
|
||||
callbacks=[regeneration_callback]
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@ -0,0 +1 @@
|
|||
exports
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -0,0 +1,264 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 256
|
||||
|
||||
# %%
|
||||
# construct the target id list
|
||||
# data_path = '../../../esAppMod_data_import/train.csv'
|
||||
data_path = '../../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# rather than use pattern, we use the real thing and property
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Substitute digits with '#'
|
||||
# text = re.sub(r'\d+', '#', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
index = row['entity_id']
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../../esAppMod_data_import/test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
# combined_data = DatasetDict({
|
||||
# 'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
# })
|
||||
return Dataset.from_list(process_df_to_dict(test_df))
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
# %%
|
||||
# compute max token length
|
||||
max_length = 0
|
||||
for sample in test_dataset['text']:
|
||||
# Tokenize the sample and get the length
|
||||
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
|
||||
length = len(input_ids)
|
||||
|
||||
# Update max_length if this sample is longer
|
||||
if length > max_length:
|
||||
max_length = length
|
||||
|
||||
print(max_length)
|
||||
|
||||
# %%
|
||||
|
||||
max_length = 128
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# text_target sets the corresponding label to inputs
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
# truncation=True,
|
||||
padding='max_length'
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
# metric = evaluate.load("accuracy")
|
||||
#
|
||||
#
|
||||
# def compute_metrics(eval_preds):
|
||||
# preds, labels = eval_preds
|
||||
# preds = np.argmax(preds, axis=1)
|
||||
# return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
|
||||
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['label'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# we can save the t5 generation output here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
|
|
@ -0,0 +1,232 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=5
|
||||
|
||||
# %%
|
||||
|
||||
# import training file
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# rather than use pattern, we use the real thing and property
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all lowercase
|
||||
text = text.lower()
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: tag_description
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
parent_desc = row['mention']
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
|
||||
# unaugmented data
|
||||
element = {
|
||||
'text' : parent_desc,
|
||||
'labels': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# text_target sets the corresponding label to inputs
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
truncation=True,
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=64,
|
||||
per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=40,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@ -0,0 +1,188 @@
|
|||
# why?
# the stock huggingface Trainer takes a fixed train_dataset, so there is no
# built-in way to change the training data between epochs

# this code example illustrates the use of dataset regeneration (re-sampling
# and re-augmenting via a callback) to change the training data between epochs
|
||||
# %%
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from functools import partial
|
||||
# import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
# %%
|
||||
# PARAMETERS
|
||||
SAMPLES=5
|
||||
|
||||
# %%
|
||||
# import training file
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# rather than use pattern, we use the real thing and property
|
||||
entity_ids = df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
# we want to sample n samples from each class
|
||||
# sample_size refers to the number of samples per class
|
||||
def sample_from_df(df, sample_size_per_class=5):
|
||||
sampled_df = (df.groupby( "entity_id")[['entity_id', 'mention']] # explicit give column names
|
||||
.apply(lambda x: x.sample(n=min(sample_size_per_class, len(x))))
|
||||
.reset_index(drop=True))
|
||||
|
||||
return sampled_df
|
||||
|
||||
|
||||
# %%
|
||||
# augment whole dataset
|
||||
# for now, we just return the same df
|
||||
def augment_data(df):
|
||||
return df
|
||||
|
||||
# %%
|
||||
class DynamicDataset(Dataset):
|
||||
def __init__(self, df, sample_size_per_class, tokenizer):
|
||||
"""
|
||||
Args:
|
||||
df (pd.DataFrame): Original DataFrame with class (id) and data columns.
|
||||
sample_size_per_class (int): Number of samples to draw per class for each epoch.
|
||||
"""
|
||||
self.df = df
|
||||
self.sample_size_per_class = sample_size_per_class
|
||||
self.tokenizer = tokenizer
|
||||
self.current_data = None
|
||||
self.regenerate_data() # Generate the initial dataset
|
||||
|
||||
def regenerate_data(self):
|
||||
"""
|
||||
Generate a new sampled dataset for the current epoch.
|
||||
|
||||
dynamic callback function to regenerate data each time we call this
|
||||
method, it updates the current_data we can:
|
||||
|
||||
- re-sample the dataframe for a new set of n_samples
|
||||
- generate fresh augmentations this effectively
|
||||
|
||||
This allows us to re-sample and re-augment at the start of each epoch
|
||||
"""
|
||||
# Sample `sample_size_per_class` rows per class
|
||||
sampled_df = sample_from_df(self.df, self.sample_size_per_class)
|
||||
|
||||
# perform future augmentations here
|
||||
sampled_df = augment_data(sampled_df)
|
||||
|
||||
# perform tokenization here
|
||||
# Batch tokenize the entire column of data
|
||||
tokenized_batch = self.tokenizer(
|
||||
sampled_df["mention"].to_list(), # Pass all text data at once
|
||||
truncation=True,
|
||||
# return_tensors="pt" # disabled because pt requires equal length tensors
|
||||
)
|
||||
|
||||
# Store the tokenized data with labels
|
||||
# we need to convert to torch tensors so that subsequent 'pad_sequence'
|
||||
# and 'stack' operations can work
|
||||
self.current_data = [
|
||||
{
|
||||
"input_ids": torch.tensor(tokenized_batch["input_ids"][i]),
|
||||
"attention_mask": torch.tensor(tokenized_batch["attention_mask"][i]),
|
||||
"labels": torch.tensor(sampled_df.iloc[i]["entity_id"]) # Include the label
|
||||
}
|
||||
for i in range(len(sampled_df))
|
||||
]
|
||||
|
||||
|
||||
def __len__(self):
|
||||
return len(self.current_data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
return self.current_data[idx]
|
||||
|
||||
|
||||
# %%
|
||||
# Dynamic dataset
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", clean_up_tokenization_spaces=False)
|
||||
lean_df = df.drop(columns=['entity_name'])
|
||||
dynamic_dataset = DynamicDataset(df = lean_df, sample_size_per_class=10, tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# custom tokenization
|
||||
|
||||
# %%
|
||||
# Example usage of dynamic dataset
|
||||
sample = dynamic_dataset[0]
|
||||
print(sample)
|
||||
|
||||
|
||||
# %%
|
||||
def custom_collate_fn(batch):
|
||||
# Dynamically pad tensors to the longest sequence in the batch
|
||||
input_ids = [item["input_ids"] for item in batch]
|
||||
attention_masks = [item["attention_mask"] for item in batch]
|
||||
labels = torch.stack([item["labels"] for item in batch])
|
||||
|
||||
# Pad inputs to the same length
|
||||
input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
|
||||
attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
|
||||
|
||||
return {
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_masks,
|
||||
"labels": labels
|
||||
}
|
||||
|
||||
|
||||
dataloader = DataLoader(
|
||||
dynamic_dataset,
|
||||
batch_size=32,
|
||||
collate_fn=custom_collate_fn
|
||||
)
|