First commit
- added classification-based mapping for esAppMod data

This commit is contained in: commit a1d000d9c8

@@ -0,0 +1 @@
__pycache__
@@ -0,0 +1,80 @@
# %%
import json
import pandas as pd
from utils import Retriever, cosine_similarity_chunked
from sklearn.metrics.pairwise import cosine_similarity

##########################################
# %%

# Load the JSON file
data_path = '../esAppMod/tca_entities.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Initialize an empty list to store the rows
rows = []

# %%
# Loop through all entities in the JSON
for entity in data["data"].items():
    entity_data = entity[1]
    entity_id = entity_data['entity_id']
    entity_name = entity_data['entity_name']

    # Add each entity id and name to the rows list
    rows.append({"id": entity_id, "name": entity_name})

# Create a DataFrame from the rows
df = pd.DataFrame(rows)


# %%
# df.to_csv('entity.csv', index=False)


# %%
# we want to automatically identify clusters
class Embedder():
    input_df: pd.DataFrame
    fold: int

    def __init__(self, input_df):
        self.input_df = input_df


    def make_embedding(self, checkpoint_path):

        def generate_input_list(df):
            input_list = []
            for _, row in df.iterrows():
                desc = row['name']
                input_list.append(desc)
            return input_list

        # prepare reference embeddings
        train_data = list(generate_input_list(self.input_df))
        retriever_train = Retriever(train_data, checkpoint_path)
        retriever_train.make_embedding(batch_size=64)
        return retriever_train.embeddings.to('cpu')

# model_checkpoint = 'google-bert/bert-base-cased'
model_checkpoint = '../train/class_bert_simple/checkpoint/checkpoint-4500'
embedder = Embedder(input_df=df)
embeddings = embedder.make_embedding(model_checkpoint)

# %%
similarity_matrix = cosine_similarity(embeddings)

# %%
similarity_matrix.shape

# %%
from sklearn.cluster import AgglomerativeClustering

clustering = AgglomerativeClustering(metric='precomputed', linkage='average')
clustering.fit(1 - similarity_matrix)  # Use distance = 1 - similarity

print(clustering.labels_)  # Cluster assignments
# %%
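A note on the clustering cell above: AgglomerativeClustering defaults to n_clusters=2, so the fit always yields exactly two clusters. Since the stated goal is to identify clusters automatically, the following is a minimal sketch (not part of this commit; the threshold value is an assumption that would need tuning) that derives the number of clusters from a distance threshold instead:

# sketch: let a distance threshold determine the number of clusters
from sklearn.cluster import AgglomerativeClustering

distance_matrix = 1 - similarity_matrix
clustering = AgglomerativeClustering(
    metric='precomputed',
    linkage='average',
    n_clusters=None,
    distance_threshold=0.2,  # assumed value; tune against the embedding space
)
labels = clustering.fit_predict(distance_matrix)
print(f"{len(set(labels))} clusters found")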
@@ -0,0 +1,17 @@
# %%
import pandas as pd
import matplotlib.pyplot as plt

# %%
# import training file
data_path = '../data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)


# %%
id_counts = train_df['entity_id'].value_counts()

# %%

plt.hist(id_counts, bins=50)
# %%
@@ -0,0 +1,95 @@
|
|||
# %%
|
||||
import json
|
||||
import pandas as pd
|
||||
|
||||
##########################################
|
||||
# %%
|
||||
|
||||
# Load the JSON file
|
||||
data_path = '../esAppMod/tca_entities.json'
|
||||
with open(data_path, 'r') as file:
|
||||
data = json.load(file)
|
||||
|
||||
# Initialize an empty list to store the rows
|
||||
rows = []
|
||||
|
||||
# %%
|
||||
# Loop through all entities in the JSON
|
||||
for entity in data["data"].items():
|
||||
entity_data = entity[1]
|
||||
entity_id = entity_data['entity_id']
|
||||
entity_name = entity_data['entity_name']
|
||||
entity_type_id = entity_data['entity_type_id']
|
||||
entity_type_name = entity_data['entity_type_name']
|
||||
|
||||
# Add each mention and its entity_id to the rows list
|
||||
rows.append(
|
||||
{
|
||||
'id': entity_id,
|
||||
'name': entity_name,
|
||||
'type_id': entity_type_id,
|
||||
'type_name': entity_type_name
|
||||
})
|
||||
|
||||
# Create a DataFrame from the rows
|
||||
df = pd.DataFrame(rows)
|
||||
|
||||
# %%
|
||||
# df.to_csv('entity.csv', index=False)
|
||||
df
|
||||
|
||||
# %%
|
||||
df['type_name'].value_counts()
|
||||
# %%
|
||||
df['type_id'].value_counts()
|
||||
|
||||
# %%
|
||||
name_list = df['name'].to_list()
|
||||
# %%
|
||||
name_list
|
||||
|
||||
# %%
|
||||
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
|
||||
import numpy as np
|
||||
|
||||
# %%
|
||||
# Define labels
|
||||
labels = name_list
|
||||
|
||||
# Create a prefix-based distance matrix
def prefix_distance(label1, label2):
    prefix1 = label1.split()
    prefix2 = label2.split()
    # Find common prefix length
    common_prefix_length = len([w1 for w1, w2 in zip(prefix1, prefix2) if w1 == w2])
    # Distance is inversely proportional to common prefix length
    return 1.0 / (common_prefix_length + 1)

# Create a pairwise distance matrix
n = len(labels)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        distance_matrix[i, j] = prefix_distance(labels[i], labels[j])

# Perform hierarchical clustering
# linkage expects a condensed distance matrix, so convert the square matrix first
# (checks=False ignores the nonzero diagonal produced by prefix_distance(x, x))
from scipy.spatial.distance import squareform
linkage_matrix = linkage(squareform(distance_matrix, checks=False), method='average')

# Visualize as a dendrogram
import matplotlib.pyplot as plt
dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2)
plt.title("Prefix-Based Clustering")
plt.show()

# %%
linkage_matrix
# %%
# Extract flat clusters with a distance threshold
threshold = 0.5
clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')

# Display clusters
for i, cluster_id in enumerate(clusters):
    print(f"Label: {labels[i]}, Cluster ID: {cluster_id}")

# %%
|
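As a small follow-up sketch (not in this commit), the flat cluster assignments printed above can be grouped into a dictionary so that each prefix-based cluster is easy to inspect as a list of names:

# sketch: group entity names by their flat cluster id
from collections import defaultdict

groups = defaultdict(list)
for label, cluster_id in zip(labels, clusters):
    groups[cluster_id].append(label)
for cluster_id, members in sorted(groups.items()):
    print(cluster_id, members)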
|
@@ -0,0 +1,71 @@
|
|||
# %%
|
||||
import pandas as pd
|
||||
|
||||
# %%
|
||||
# import training file
|
||||
data_path = '../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
# import test file
|
||||
data_path = '../data_import/test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
# import entity file
|
||||
data_path = '../data_import/entity.csv'
|
||||
entity_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
id2label = {}
|
||||
for _, row in entity_df.iterrows():
|
||||
id2label[row['id']] = row['name']
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
data_path = '../train/class_bert_process/classification_prediction/exports/result.csv'
|
||||
prediction_df = pd.read_csv(data_path)
|
||||
|
||||
# %%
|
||||
predicted_entity_list = []
|
||||
for element in prediction_df['class_prediction']:
|
||||
predicted_entity_list.append(id2label[element])
|
||||
|
||||
prediction_df['predicted_name'] = predicted_entity_list
|
||||
# %%
|
||||
new_df = pd.concat((test_df, prediction_df ), axis=1)
|
||||
|
||||
# %%
|
||||
mismatch_mask = new_df['entity_id'] != new_df['class_prediction']
|
||||
mismatch_df = new_df[mismatch_mask]
|
||||
|
||||
|
||||
# %%
|
||||
# print the top 10 offending classes
|
||||
print(mismatch_df['entity_id'].value_counts()[:10])
|
||||
|
||||
# %%
|
||||
# Convert the whole dataframe to a markdown string and print it
print(mismatch_df.to_markdown())
|
||||
|
||||
|
||||
# %%
|
||||
# let us see the test mentions
|
||||
select_value = 434
|
||||
select_mask = mismatch_df['entity_id'] == select_value
|
||||
mismatch_df[select_mask]
|
||||
|
||||
# %%
|
||||
# let us see the train mentions
|
||||
select_value = 434
|
||||
select_mask = train_df['entity_id'] == select_value
|
||||
train_df[select_mask]
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
mismatch_df[select_mask]['class_prediction'].to_list()
|
||||
|
||||
# %%
|
||||
# %%
|
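Beyond the raw mismatch counts above, a small follow-up sketch (not part of this commit) normalises the per-class mismatches by the number of test mentions for each class, so frequently-mentioned classes do not dominate the ranking:

# sketch: per-class error rate on the test set
mismatch_counts = mismatch_df['entity_id'].value_counts()
total_counts = new_df['entity_id'].value_counts()
error_rate = (mismatch_counts / total_counts).fillna(0).sort_values(ascending=False)
print(error_rate.head(10))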
|
@@ -0,0 +1,81 @@
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import torch.nn.functional as F


class Retriever:
    def __init__(self, input_texts, model_checkpoint):
        # we need to generate the embedding from a list of input strings
        self.embeddings = []
        self.inputs = input_texts
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)

        model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # device = "cpu"
        model.to(self.device)
        self.model = model.eval()


    def make_embedding(self, batch_size=64):
        all_embeddings = self.embeddings
        input_texts = self.inputs

        for i in range(0, len(input_texts), batch_size):
            batch_texts = input_texts[i:i+batch_size]
            # Tokenize the input text
            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64)
            input_ids = inputs.input_ids.to(self.device)
            attention_mask = inputs.attention_mask.to(self.device)

            # Pass the input through the encoder and retrieve the embeddings
            with torch.no_grad():
                encoder_outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
                # get the last hidden layer
                embeddings = encoder_outputs.hidden_states[-1]
                # get the [CLS] token embedding
                cls_embeddings = embeddings[:, 0, :]  # Shape: (batch_size, hidden_size)
                all_embeddings.append(cls_embeddings)

        # concatenate the per-batch tensors into one large tensor; dim=0 stacks row-wise
        all_embeddings = torch.cat(all_embeddings, dim=0)

        self.embeddings = all_embeddings


def cosine_similarity_chunked(batch1, batch2, chunk_size=1024):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    batch1_size = batch1.size(0)
    batch2_size = batch2.size(0)
    # .to() is not in-place, so the result must be reassigned
    batch2 = batch2.to(device)

    # Prepare an empty tensor to store results
    cos_sim = torch.empty(batch1_size, batch2_size, device=device)

    # Process batch1 in chunks
    for i in range(0, batch1_size, chunk_size):
        batch1_chunk = batch1[i:i + chunk_size].to(device)  # Get chunk of batch1 and move it to the device

        # Expand batch1 chunk and entire batch2 for comparison
        # batch1_chunk_exp = batch1_chunk.unsqueeze(1)  # Shape: (chunk_size, 1, seq_len)
        # batch2_exp = batch2.unsqueeze(0)  # Shape: (1, batch2_size, seq_len)
        batch2_norms = batch2.norm(dim=1, keepdim=True)

        # Compute cosine similarity for the chunk and store it in the final tensor
        # cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1)

        # Compute cosine similarity by matrix multiplication and normalizing
        sim_chunk = torch.mm(batch1_chunk, batch2.T) / (batch1_chunk.norm(dim=1, keepdim=True) * batch2_norms.T + 1e-8)

        # Store the results in the appropriate part of the final tensor
        cos_sim[i:i + chunk_size] = sim_chunk

    return cos_sim
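For orientation, a minimal usage sketch of the helpers above (the checkpoint path and the example strings are placeholders, not values from this commit):

# sketch: embed two string lists and match each mention to its closest entity
from utils import Retriever, cosine_similarity_chunked

mentions = ["websphere application server", "was 8.5", "db2 v11"]   # placeholder strings
entities = ["WebSphere Application Server", "IBM DB2"]              # placeholder strings

mention_retriever = Retriever(mentions, "<checkpoint_path>")        # placeholder checkpoint
mention_retriever.make_embedding(batch_size=64)
entity_retriever = Retriever(entities, "<checkpoint_path>")
entity_retriever.make_embedding(batch_size=64)

# rows: mentions, columns: entities; pick the best-matching entity per mention
sims = cosine_similarity_chunked(mention_retriever.embeddings, entity_retriever.embeddings)
best_match = sims.argmax(dim=1)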
@@ -0,0 +1 @@
*.csv
@@ -0,0 +1,41 @@
|
|||
# %%
|
||||
import json
|
||||
import pandas as pd
|
||||
|
||||
##########################################
|
||||
# %%
|
||||
|
||||
# Load the JSON file
|
||||
data_path = '../esAppMod/tca_entities.json'
|
||||
with open(data_path, 'r') as file:
|
||||
data = json.load(file)
|
||||
|
||||
# Initialize an empty list to store the rows
|
||||
rows = []
|
||||
|
||||
# %%
|
||||
# Loop through all entities in the JSON
|
||||
for entity in data["data"].items():
|
||||
entity_data = entity[1]
|
||||
entity_id = entity_data['entity_id']
|
||||
entity_name = entity_data['entity_name']
|
||||
entity_type_id = entity_data['entity_type_id']
|
||||
entity_type_name = entity_data['entity_type_name']
|
||||
|
||||
# Add each mention and its entity_id to the rows list
|
||||
rows.append(
|
||||
{
|
||||
'id': entity_id,
|
||||
'name': entity_name,
|
||||
'type_id': entity_type_id,
|
||||
'type_name': entity_type_name
|
||||
})
|
||||
|
||||
# Create a DataFrame from the rows
|
||||
df = pd.DataFrame(rows)
|
||||
|
||||
# %%
|
||||
df.to_csv('entity.csv', index=False)
|
||||
|
||||
|
||||
# %%
|
|
@@ -0,0 +1,85 @@
|
|||
# %%
|
||||
import json
|
||||
import pandas as pd
|
||||
|
||||
##########################################
|
||||
# %%
|
||||
# import entity information
|
||||
|
||||
# %%
|
||||
data_path = 'entity.csv'
|
||||
entity_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
id2label = {}
|
||||
for _, row in entity_df.iterrows():
|
||||
id2label[row['id']] = row['name']
|
||||
|
||||
|
||||
# Load the JSON file
|
||||
data_path = '../esAppMod/train.json'
|
||||
with open(data_path, 'r') as file:
|
||||
data = json.load(file)
|
||||
|
||||
# Initialize an empty list to store the rows
|
||||
rows = []
|
||||
|
||||
# Loop through all entities in the JSON
|
||||
for entity_key, entity_data in data["data"].items():
|
||||
mentions = entity_data["mentions"]
|
||||
entity_id = entity_data["entity_id"]
|
||||
entity_name = id2label[entity_id]
|
||||
|
||||
# Add each mention and its entity_id to the rows list
|
||||
for mention in mentions:
|
||||
rows.append(
|
||||
{
|
||||
"mention": mention,
|
||||
"entity_id": entity_id,
|
||||
"entity_name": entity_name
|
||||
})
|
||||
|
||||
# Create a DataFrame from the rows
|
||||
train_df = pd.DataFrame(rows)
|
||||
|
||||
train_class_set = set(train_df['entity_id'].to_list())
|
||||
|
||||
# %%
|
||||
train_df.to_csv('train.csv', index=False)
|
||||
##########################################
|
||||
# %%
|
||||
# Load the JSON file
|
||||
data_path = '../esAppMod/infer.json'
|
||||
with open(data_path, 'r') as file:
|
||||
data = json.load(file)
|
||||
|
||||
# Initialize an empty list to store the rows
|
||||
rows = []
|
||||
|
||||
# Loop through all entities in the JSON
|
||||
for entity_key, entity_data in data["data"].items():
|
||||
mention = entity_data["mention"]
|
||||
entity_id = entity_data["entity_id"]
|
||||
entity_name = id2label[entity_id]
|
||||
|
||||
# Add each mention and its entity_id to the rows list
|
||||
rows.append(
|
||||
{
|
||||
"mention": mention,
|
||||
"entity_id": entity_id,
|
||||
"entity_name": entity_name
|
||||
})
|
||||
|
||||
|
||||
|
||||
# Create a DataFrame from the rows
|
||||
test_df = pd.DataFrame(rows)
|
||||
|
||||
test_class_set = (set(test_df['entity_id'].to_list()))
|
||||
|
||||
# %%
|
||||
test_df.to_csv('test.csv', index=False)
|
||||
|
||||
# %%
|
||||
# an empty set here shows that every test class is also present in the train set
test_class_set - train_class_set
|
||||
|
||||
# %%
|
(4 file diffs suppressed because they are too large)
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@@ -0,0 +1 @@
exports
@@ -0,0 +1,6 @@

********************************************************************************
Accuracy: 0.79090
F1 Score: 0.80996
Precision: 0.88827
Recall: 0.79090
@@ -0,0 +1,262 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 256
|
||||
|
||||
# %%
|
||||
data_path = '../../../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# collect the entity ids that define the label space
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all uppercase
|
||||
text = text.upper()
|
||||
|
||||
# 2. Remove punctuations
|
||||
# text = re.sub(r'[^\w\s]', '', text) # Retains only alphanumeric and spaces
|
||||
|
||||
# 3. Substitute digits with '#'
|
||||
text = re.sub(r'\d', '#', text)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
index = row['entity_id']
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels start from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# test (evaluation split)
|
||||
data_path = '../../../data_import/test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
# combined_data = DatasetDict({
|
||||
# 'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
# })
|
||||
return Dataset.from_list(process_df_to_dict(test_df))
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
# %%
|
||||
# compute max token length
|
||||
max_length = 0
|
||||
for sample in test_dataset['text']:
|
||||
# Tokenize the sample and get the length
|
||||
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
|
||||
length = len(input_ids)
|
||||
|
||||
# Update max_length if this sample is longer
|
||||
if length > max_length:
|
||||
max_length = length
|
||||
|
||||
print(max_length)
|
||||
|
||||
# %%
|
||||
|
||||
max_length = 128
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# the tokenizer only needs the raw text here; the label is already attached to each example
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
# truncation=True,
|
||||
padding='max_length'
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
# metric = evaluate.load("accuracy")
|
||||
#
|
||||
#
|
||||
# def compute_metrics(eval_preds):
|
||||
# preds, labels = eval_preds
|
||||
# preds = np.argmax(preds, axis=1)
|
||||
# return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
|
||||
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['label'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# we can save the classification prediction output here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
|
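confusion_matrix is imported above but never used; the following sketch (not part of this commit, and assuming the y_true, y_pred, id2label and target_id_list variables from the evaluation above are in scope) shows one way it could surface the most frequently confused class pairs:

# sketch: list the largest off-diagonal confusion-matrix entries
import numpy as np
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true, y_pred, labels=list(range(len(target_id_list))))
off_diag = cm.copy()
np.fill_diagonal(off_diag, 0)  # ignore correct predictions
for flat_idx in np.argsort(off_diag, axis=None)[::-1][:10]:
    true_idx, pred_idx = np.unravel_index(flat_idx, cm.shape)
    print(id2label[true_idx], '->', id2label[pred_idx], off_diag[true_idx, pred_idx])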
|
@@ -0,0 +1,283 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import re
|
||||
import random
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
def set_seed(seed):
|
||||
"""
|
||||
Set the random seed for reproducibility.
|
||||
"""
|
||||
random.seed(seed) # Python random module
|
||||
np.random.seed(seed) # NumPy random
|
||||
torch.manual_seed(seed) # PyTorch CPU
|
||||
torch.cuda.manual_seed(seed) # PyTorch GPU
|
||||
torch.cuda.manual_seed_all(seed) # If using multiple GPUs
|
||||
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
|
||||
torch.backends.cudnn.benchmark = False # Disable optimization for reproducibility
|
||||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=5
|
||||
|
||||
# %%
|
||||
|
||||
# import training file
|
||||
data_path = '../../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# collect the entity ids that define the label space
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Make all uppercase
|
||||
text = text.upper()
|
||||
|
||||
# 2. Remove punctuations
|
||||
# text = re.sub(r'[^\w\s]', '', text) # Retains only alphanumeric and spaces
|
||||
|
||||
# 3. Substitute digits with '#'
|
||||
text = re.sub(r'\d', '#', text)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def generate_random_shuffles(text, n):
|
||||
"""
|
||||
Generate n strings with randomly shuffled words from the input text.
|
||||
|
||||
Args:
|
||||
text (str): The input text.
|
||||
n (int): The number of random variations to generate.
|
||||
|
||||
Returns:
|
||||
list: A list of strings with shuffled words.
|
||||
"""
|
||||
words = text.split() # Split the input into words
|
||||
shuffled_variations = []
|
||||
|
||||
for _ in range(n):
|
||||
shuffled = words[:] # Copy the word list to avoid in-place modification
|
||||
random.shuffle(shuffled) # Randomly shuffle the words
|
||||
shuffled_variations.append(" ".join(shuffled)) # Join the words back into a string
|
||||
|
||||
return shuffled_variations
|
||||
|
||||
|
||||
# generate n more shuffled examples
|
||||
def shuffle_text(text, n_shuffles=SHUFFLES):
|
||||
"""
|
||||
Return the input text together with n randomly shuffled word-order variants.
|
||||
|
||||
Args:
|
||||
text (str): The input string.
|
||||
n_shuffles (int): Number of random shuffles to generate for each string.
|
||||
|
||||
Returns:
|
||||
list: A list of preprocessed and shuffled strings.
|
||||
"""
|
||||
all_processed = []
|
||||
all_processed.append(text)
|
||||
|
||||
# Generate random shuffles
|
||||
shuffled_variations = generate_random_shuffles(text, n_shuffles)
|
||||
all_processed.extend(shuffled_variations)
|
||||
|
||||
return all_processed
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
processed_descs = shuffle_text(desc, n_shuffles=SHUFFLES)
|
||||
|
||||
for desc in processed_descs:
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels start from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
# model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
model_checkpoint = 'google-bert/bert-base-cased'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<DESC>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# the tokenizer only needs the raw text here; the label is already attached to each example
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
padding=True
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-4,
|
||||
per_device_train_batch_size=128,
|
||||
per_device_eval_batch_size=128,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=120,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
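To illustrate the augmentation above: every training mention is preprocessed and expanded into 1 + SHUFFLES word-order variants that all keep the same label. A quick sketch (not in this commit; the mention string is a made-up example):

# sketch: inspect the shuffled variants generated for one hypothetical mention
example = preprocess_text("Websphere Application Server 8.5")
for variant in shuffle_text(example, n_shuffles=SHUFFLES):
    print(variant)
# prints 'WEBSPHERE APPLICATION SERVER #.#' followed by 5 random word orderings of it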
|
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@@ -0,0 +1 @@
exports
@@ -0,0 +1,6 @@

********************************************************************************
Accuracy: 0.70070
F1 Score: 0.73260
Precision: 0.84815
Recall: 0.70070
@@ -0,0 +1,246 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
import glob
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
|
||||
BATCH_SIZE = 256
|
||||
|
||||
# %%
|
||||
data_path = '../../../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# collect the entity ids that define the label space
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
index = row['entity_id']
|
||||
element = {
|
||||
'text' : f"{desc}",
|
||||
'label': label2id[index], # ensure labels start from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# test (evaluation split)
|
||||
data_path = '../../../data_import/test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
# combined_data = DatasetDict({
|
||||
# 'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
# })
|
||||
return Dataset.from_list(process_df_to_dict(test_df))
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def test():
|
||||
|
||||
test_dataset = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
checkpoint_directory = f'../checkpoint'
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
# %%
|
||||
# compute max token length
|
||||
max_length = 0
|
||||
for sample in test_dataset['text']:
|
||||
# Tokenize the sample and get the length
|
||||
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
|
||||
length = len(input_ids)
|
||||
|
||||
# Update max_length if this sample is longer
|
||||
if length > max_length:
|
||||
max_length = length
|
||||
|
||||
print(max_length)
|
||||
|
||||
# %%
|
||||
|
||||
max_length = 128
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# the tokenizer only needs the raw text here; the label is already attached to each example
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
# truncation=True,
|
||||
padding='max_length'
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
# metric = evaluate.load("accuracy")
|
||||
#
|
||||
#
|
||||
# def compute_metrics(eval_preds):
|
||||
# preds, labels = eval_preds
|
||||
# preds = np.argmax(preds, axis=1)
|
||||
# return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model = model.eval()
|
||||
|
||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
model.to(device)
|
||||
|
||||
pred_labels = []
|
||||
actual_labels = []
|
||||
|
||||
|
||||
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
|
||||
for batch in tqdm(dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
actual_labels.extend(batch['label'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
logits = model(
|
||||
input_ids,
|
||||
attention_mask).logits
|
||||
predicted_class_ids = logits.argmax(dim=1).to("cpu")
|
||||
pred_labels.extend(predicted_class_ids)
|
||||
|
||||
pred_labels = [tensor.item() for tensor in pred_labels]
|
||||
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||
y_true = actual_labels
|
||||
y_pred = pred_labels
|
||||
|
||||
# Compute metrics
|
||||
accuracy = accuracy_score(y_true, y_pred)
|
||||
average_parameter = 'weighted'
|
||||
zero_division_parameter = 0
|
||||
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
|
||||
|
||||
with open("output.txt", "a") as f:
|
||||
|
||||
print('*' * 80, file=f)
|
||||
# Print the results
|
||||
print(f'Accuracy: {accuracy:.5f}', file=f)
|
||||
print(f'F1 Score: {f1:.5f}', file=f)
|
||||
print(f'Precision: {precision:.5f}', file=f)
|
||||
print(f'Recall: {recall:.5f}', file=f)
|
||||
|
||||
# export result
|
||||
label_list = [id2label[id] for id in pred_labels]
|
||||
df = pd.DataFrame({
|
||||
'class_prediction': pd.Series(label_list)
|
||||
})
|
||||
|
||||
# we can save the classification prediction output here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
test()
|
|
@@ -0,0 +1,200 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
DataCollatorWithPadding,
|
||||
Trainer,
|
||||
EarlyStoppingCallback,
|
||||
TrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
|
||||
# import training file
|
||||
data_path = '../../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# collect the entity ids that define the label space
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
label2id = {}
|
||||
for idx, val in enumerate(target_id_list):
|
||||
id2label[idx] = val
|
||||
label2id[val] = idx
|
||||
|
||||
# %%
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
# input: mention
|
||||
# output: class label
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
index = row['entity_id']
|
||||
element = {
|
||||
'text' : f"{desc}",
|
||||
'label': label2id[index], # ensure labels start from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
def train():
|
||||
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
# model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
model_checkpoint = 'google-bert/bert-base-cased'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<DESC>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['text']
|
||||
# the tokenizer only needs the raw text here; the label is already attached to each example
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
padding=True
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns="text",
|
||||
)
|
||||
|
||||
# %% temp
|
||||
# tokenized_datasets['train'].rename_columns()
|
||||
|
||||
# %%
|
||||
# create data collator
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# %%
|
||||
# compute metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
preds = np.argmax(preds, axis=1)
|
||||
return metric.compute(predictions=preds, references=labels)
|
||||
|
||||
# %%
|
||||
# create id2label and label2id
|
||||
|
||||
|
||||
# %%
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
model_checkpoint,
|
||||
num_labels=len(target_id_list),
|
||||
id2label=id2label,
|
||||
label2id=label2id)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# %%
|
||||
# Trainer
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-4,
|
||||
per_device_train_batch_size=64,
|
||||
per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=250,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Trainer(
|
||||
model,
|
||||
training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|
||||
|
||||
# %%
|
|
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log
@@ -0,0 +1,2 @@
__pycache__
exports/
@@ -0,0 +1,150 @@
|
|||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import (
|
||||
T5TokenizerFast,
|
||||
AutoModelForSeq2SeqLM,
|
||||
)
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
from datasets import Dataset
|
||||
import numpy as np
|
||||
|
||||
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
||||
|
||||
|
||||
class Inference():
|
||||
tokenizer: T5TokenizerFast
|
||||
model: torch.nn.Module
|
||||
dataloader: DataLoader
|
||||
|
||||
def __init__(self, checkpoint_path):
|
||||
self._create_tokenizer()
|
||||
self._load_model(checkpoint_path)
|
||||
|
||||
|
||||
def _create_tokenizer(self):
|
||||
# %%
|
||||
# load tokenizer
|
||||
self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
def _load_model(self, checkpoint_path: str):
|
||||
# load model
|
||||
# Define the directory and the pattern
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
|
||||
model = torch.compile(model)
|
||||
# set model to eval
|
||||
self.model = model.eval()
|
||||
|
||||
|
||||
|
||||
|
||||
def prepare_dataloader(self, input_df, batch_size, max_length):
|
||||
"""
|
||||
*arguments*
|
||||
- input_df: input dataframe containing fields 'mention' and 'entity_name'
|
||||
- batch_size: the batch size of dataloader output
|
||||
- max_length: length of tokenizer output
|
||||
"""
|
||||
print("preparing dataloader")
|
||||
# convert each dataframe row into a dictionary
|
||||
# outputs a list of dictionaries
|
||||
|
||||
def _process_df(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
label = row['entity_name']
|
||||
element = {
|
||||
'input' : desc,
|
||||
'output': label
|
||||
}
|
||||
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
def _preprocess_function(example):
|
||||
input = example['input']
|
||||
target = example['output']
|
||||
# text_target sets the corresponding label to inputs
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = self.tokenizer(
|
||||
input,
|
||||
text_target=target,
|
||||
max_length=max_length,
|
||||
return_tensors="pt",
|
||||
padding='max_length',
|
||||
truncation=True,
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
test_dataset = Dataset.from_list(_process_df(input_df))
|
||||
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
datasets = test_dataset.map(
|
||||
_preprocess_function,
|
||||
batched=True,
|
||||
num_proc=1,
|
||||
remove_columns=test_dataset.column_names,
|
||||
)
|
||||
# datasets = _preprocess_function(test_dataset)
|
||||
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
|
||||
|
||||
# create dataloader
|
||||
self.dataloader = DataLoader(datasets, batch_size=batch_size)
|
||||
|
||||
|
||||
def generate(self):
|
||||
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
|
||||
MAX_GENERATE_LENGTH = 128
|
||||
|
||||
pred_generations = []
|
||||
pred_labels = []
|
||||
|
||||
print("start generation")
|
||||
for batch in tqdm(self.dataloader):
|
||||
# Inference in batches
|
||||
input_ids = batch['input_ids']
|
||||
attention_mask = batch['attention_mask']
|
||||
# save labels too
|
||||
pred_labels.extend(batch['labels'])
|
||||
|
||||
|
||||
# Move to GPU if available
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
self.model.to(device)
|
||||
|
||||
# Perform inference
|
||||
with torch.no_grad():
|
||||
outputs = self.model.generate(input_ids,
|
||||
attention_mask=attention_mask,
|
||||
max_length=MAX_GENERATE_LENGTH)
|
||||
|
||||
# Decode the output and print the results
|
||||
pred_generations.extend(outputs.to("cpu"))
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
def process_tensor_output(tokens):
|
||||
predictions = self.tokenizer.decode(tokens, skip_special_tokens=True)
|
||||
return predictions
|
||||
|
||||
# decode prediction labels
|
||||
def decode_preds(tokens_list):
|
||||
prediction_list = []
|
||||
for tokens in tokens_list:
|
||||
predicted_seq = process_tensor_output(tokens)
|
||||
prediction_list.append(predicted_seq)
|
||||
return prediction_list
|
||||
|
||||
prediction_list = decode_preds(pred_generations)
|
||||
return prediction_list
|
||||
|
|
@@ -0,0 +1,2 @@

Accuracy for fold: 0.5846658466584665
@@ -0,0 +1,62 @@
|
|||
|
||||
import pandas as pd
|
||||
import os
|
||||
import glob
|
||||
from inference import Inference
|
||||
|
||||
checkpoint_directory = '../'
|
||||
|
||||
BATCH_SIZE = 512
|
||||
|
||||
def infer():
|
||||
print(f"Inference for data")
|
||||
# import test data
|
||||
data_path = '../../../data_import/test.csv'
|
||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
##########################################
|
||||
# run inference
|
||||
# checkpoint
|
||||
# Use glob to find matching paths
|
||||
directory = os.path.join(checkpoint_directory, f'checkpoint')
|
||||
# Use glob to find matching paths
|
||||
# path is usually checkpoint-<step number>
|
||||
# we are guaranteed to save only 1 checkpoint from training
|
||||
pattern = 'checkpoint-*'
|
||||
checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
|
||||
|
||||
|
||||
infer = Inference(checkpoint_path)
|
||||
infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
|
||||
prediction_list = infer.generate()
|
||||
|
||||
# add labels too
|
||||
# thing_actual_list, property_actual_list = decode_preds(pred_labels)
|
||||
# Convert the list to a Pandas DataFrame
|
||||
df_out = pd.DataFrame({
|
||||
'predictions': prediction_list
|
||||
})
|
||||
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
|
||||
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
|
||||
df = pd.concat([df, df_out], axis=1)
|
||||
|
||||
# we can save the t5 generation output here
|
||||
df.to_csv(f"exports/result.csv", index=False)
|
||||
|
||||
# evaluate exact-match accuracy of the generated entity names
|
||||
condition_correct = df['predictions'] == df['entity_name']
|
||||
pred_correct_proportion = sum(condition_correct)/len(df)
|
||||
|
||||
# write output to file output.txt
|
||||
with open("output.txt", "a") as f:
|
||||
print(f'Accuracy for fold: {pred_correct_proportion}', file=f)
|
||||
|
||||
###########################################
|
||||
# execute
|
||||
|
||||
# reset file before writing to it
|
||||
with open("output.txt", "w") as f:
|
||||
print('', file=f)
|
||||
|
||||
infer()
|
|
@@ -0,0 +1,190 @@
|
|||
# %%
|
||||
|
||||
# from datasets import load_from_disk
|
||||
import os
|
||||
|
||||
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
T5TokenizerFast,
|
||||
AutoModelForSeq2SeqLM,
|
||||
DataCollatorForSeq2Seq,
|
||||
Seq2SeqTrainer,
|
||||
EarlyStoppingCallback,
|
||||
Seq2SeqTrainingArguments
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# import matplotlib.pyplot as plt
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision('high')
|
||||
|
||||
# %%
|
||||
# outputs a list of dictionaries
|
||||
def process_df_to_dict(df):
|
||||
output_list = []
|
||||
for _, row in df.iterrows():
|
||||
desc = row['mention']
|
||||
label = row['entity_name']
|
||||
element = {
|
||||
'input' : desc,
|
||||
'output': label
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = f"../../data_import/train.csv"
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
combined_data = DatasetDict({
|
||||
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||
})
|
||||
return combined_data
|
||||
|
||||
|
||||
# function to perform training for a given fold
|
||||
def train():
|
||||
save_path = f'checkpoint'
|
||||
split_datasets = create_dataset()
|
||||
|
||||
# prepare tokenizer
|
||||
|
||||
model_checkpoint = "t5-small"
|
||||
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
|
||||
# Add the additional special tokens to the tokenizer
|
||||
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
max_length = 120
|
||||
|
||||
# given a dataset entry, run it through the tokenizer
|
||||
def preprocess_function(example):
|
||||
input = example['input']
|
||||
target = example['output']
|
||||
# text_target sets the corresponding label to inputs
|
||||
# there is no need to create a separate 'labels'
|
||||
model_inputs = tokenizer(
|
||||
input,
|
||||
text_target=target,
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
padding=True
|
||||
)
|
||||
return model_inputs
|
||||
|
||||
# map maps function to each "row" in the dataset
|
||||
# aka the data in the immediate nesting
|
||||
tokenized_datasets = split_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
num_proc=8,
|
||||
remove_columns=split_datasets["train"].column_names,
|
||||
)
|
||||
|
||||
# https://github.com/huggingface/transformers/pull/28414
|
||||
# model_checkpoint = "google/t5-efficient-tiny"
|
||||
# device_map set to auto to force it to load contiguous weights
|
||||
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
|
||||
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
|
||||
# important! after extending tokens vocab
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
|
||||
metric = evaluate.load("sacrebleu")
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels = eval_preds
|
||||
# In case the model returns more than the prediction logits
|
||||
if isinstance(preds, tuple):
|
||||
preds = preds[0]
|
||||
|
||||
decoded_preds = tokenizer.batch_decode(preds,
|
||||
skip_special_tokens=False)
|
||||
|
||||
# Replace -100s in the labels as we can't decode them
|
||||
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
||||
decoded_labels = tokenizer.batch_decode(labels,
|
||||
skip_special_tokens=False)
|
||||
|
||||
# Remove <PAD> tokens from decoded predictions and labels
|
||||
decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
|
||||
decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
|
||||
|
||||
# Some simple post-processing
|
||||
# decoded_preds = [pred.strip() for pred in decoded_preds]
|
||||
# decoded_labels = [[label.strip()] for label in decoded_labels]
|
||||
# print(decoded_preds, decoded_labels)
|
||||
|
||||
result = metric.compute(predictions=decoded_preds, references=decoded_labels)
|
||||
return {"bleu": result["score"]}
|
||||
|
||||
|
||||
# Generation Config
|
||||
# from transformers import GenerationConfig
|
||||
gen_config = model.generation_config
|
||||
gen_config.max_length = 64
|
||||
|
||||
# compile
|
||||
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||
|
||||
|
||||
# Trainer
|
||||
|
||||
args = Seq2SeqTrainingArguments(
|
||||
f"{save_path}",
|
||||
# eval_strategy="epoch",
|
||||
eval_strategy="no",
|
||||
logging_dir="tensorboard-log",
|
||||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-3,
|
||||
per_device_train_batch_size=64,
|
||||
per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=80,
|
||||
predict_with_generate=True,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
generation_config=gen_config,
|
||||
remove_unused_columns=False,
|
||||
)
|
||||
|
||||
|
||||
trainer = Seq2SeqTrainer(
|
||||
model,
|
||||
args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
data_collator=data_collator,
|
||||
tokenizer=tokenizer,
|
||||
compute_metrics=compute_metrics,
|
||||
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||
)
|
||||
|
||||
# uncomment to load training from checkpoint
|
||||
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# execute training
|
||||
train()
|
||||
|