From 16374b9ab8d549ae54986ca9592677846df59be5 Mon Sep 17 00:00:00 2001
From: Richard Wong
Date: Thu, 31 Oct 2024 15:58:20 +0900
Subject: [PATCH] Feat: added train and test directories

---
 data_preprocess/split_data.py |  13 ++-
 test/mapping/.gitignore       |   1 +
 test/mapping/inference.py     | 162 ++++++++++++++++++++++
 test/mapping/output.txt       |   6 ++
 test/mapping/predict.py       |  71 ++++++++++
 test/selection/.gitignore     |   1 +
 test/selection/inference.py   | 164 +++++++++++++++++++++++
 test/selection/output.txt     |  56 ++++++++
 test/selection/predict.py     | 129 +++++++++++++++++
 test/selection/selection.py   | 187 ++++++++++++++++++++++++
 test/selection/utils.py       |  76 ++++++++++
 train/baseline/.gitignore     |   2 +
 train/baseline/train.py       | 195 +++++++++++++++++++++++++
 13 files changed, 1059 insertions(+), 4 deletions(-)
 create mode 100644 test/mapping/.gitignore
 create mode 100644 test/mapping/inference.py
 create mode 100644 test/mapping/output.txt
 create mode 100644 test/mapping/predict.py
 create mode 100644 test/selection/.gitignore
 create mode 100644 test/selection/inference.py
 create mode 100644 test/selection/output.txt
 create mode 100644 test/selection/predict.py
 create mode 100644 test/selection/selection.py
 create mode 100644 test/selection/utils.py
 create mode 100644 train/baseline/.gitignore
 create mode 100644 train/baseline/train.py

diff --git a/data_preprocess/split_data.py b/data_preprocess/split_data.py
index b9aa141..50576e7 100644
--- a/data_preprocess/split_data.py
+++ b/data_preprocess/split_data.py
@@ -158,7 +158,7 @@ data_file_path = 'exports/preprocessed_data.csv'
 data = pd.read_csv(data_file_path)
 
 # Filter the data where MDM is True
-mdm_true = data[data['MDM'] == True].copy()  # use .copy() to explicitly create a copy
+mdm_true = data[data['MDM']].copy()  # use .copy() to explicitly create a copy
 mdm_all = data.copy()
 
 # Create a new column combining 'thing' and 'property'
@@ -340,7 +340,7 @@ def save_datasets_for_group(groups, mdm, data, output_dir='exports/dataset', n_s
 
         # Create the test dataset by including only group i
         test_group_ships = groups[i]
-        test_data = mdm[mdm['ships_idx'].isin(test_group_ships)]
+        # test_data = mdm[mdm['ships_idx'].isin(test_group_ships)]
 
         # Extract corresponding entries from the external test dataset
         test_all_data = data[data['ships_idx'].isin(test_group_ships)]
@@ -363,16 +363,21 @@ def save_datasets_for_group(groups, mdm, data, output_dir='exports/dataset', n_s
         train_all_data = pd.concat([final_train_data, valid_data])
 
         # Save datasets to CSV files
+        # train.csv: mdm training set
+        # valid.csv: mdm validation set
+        # test.csv: mdm test set
+        # test_all.csv: all test set with non-mdm
+        # train_all.csv: all train set with non-mdm
         train_file_path = os.path.join(group_folder, 'train.csv')
         valid_file_path = os.path.join(group_folder, 'valid.csv')
-        test_file_path = os.path.join(group_folder, 'test.csv')
+        # test_file_path = os.path.join(group_folder, 'test.csv')
         test_all_file_path = os.path.join(group_folder, 'test_all.csv')
         train_all_file_path = os.path.join(group_folder, 'train_all.csv')
 
         final_train_data.to_csv(train_file_path, index=False, encoding='utf-8-sig')
         valid_data.to_csv(valid_file_path, index=False, encoding='utf-8-sig')
         # test_data.to_csv(test_file_path, index=False, encoding='utf-8-sig')
-        test_all_data.to_csv(test_file_path, index=False, encoding='utf-8-sig')
+        test_all_data.to_csv(test_all_file_path, index=False, encoding='utf-8-sig')
         train_all_data.to_csv(train_all_file_path, index=False, encoding='utf-8-sig')
 
         print(f"Group {i + 1} datasets saved in {group_folder}")
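
A minimal standalone sketch (not part of the patch) of how one fold's exports written by split_data.py could be read back; the directory layout and the role of each CSV follow the comments added in the hunk above, and the relative path assumes the repository root as working directory:

    import pandas as pd

    fold = 1
    base = f"data_preprocess/exports/dataset/group_{fold}"
    train_df = pd.read_csv(f"{base}/train.csv")          # MDM-only training rows
    valid_df = pd.read_csv(f"{base}/valid.csv")          # MDM-only validation rows
    test_all_df = pd.read_csv(f"{base}/test_all.csv")    # test rows, including non-MDM
    train_all_df = pd.read_csv(f"{base}/train_all.csv")  # train rows, including non-MDM
    print(len(train_df), len(valid_df), len(test_all_df), len(train_all_df))
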
diff --git a/test/mapping/.gitignore b/test/mapping/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/test/mapping/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/test/mapping/inference.py b/test/mapping/inference.py
new file mode 100644
index 0000000..2232be0
--- /dev/null
+++ b/test/mapping/inference.py
@@ -0,0 +1,162 @@
+import torch
+from torch.utils.data import DataLoader
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+)
+import os
+from tqdm import tqdm
+from datasets import Dataset
+import numpy as np
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+
+class Inference():
+    tokenizer: T5TokenizerFast
+    model: torch.nn.Module
+    dataloader: DataLoader
+
+    def __init__(self, checkpoint_path):
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+
+
+    def _create_tokenizer(self):
+        # %%
+        # load tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # Define additional special tokens
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
+        # Add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        # load model
+        # Define the directory and the pattern
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # set model to eval
+        self.model = model.eval()
+
+
+
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output
+        """
+        print("preparing dataloader")
+        # convert each dataframe row into a dictionary
+        # outputs a list of dictionaries
+        def _process_df(df):
+            output_list = [{
+                'input': f"{row['tag_description']}",
+                'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+            } for _, row in df.iterrows()]
+
+            return output_list
+
+        def _preprocess_function(example):
+            input = example['input']
+            target = example['output']
+            # text_target sets the corresponding label to inputs
+            # there is no need to create a separate 'labels'
+            model_inputs = self.tokenizer(
+                input,
+                text_target=target,
+                max_length=max_length,
+                return_tensors="pt",
+                padding='max_length',
+                truncation=True,
+            )
+            return model_inputs
+
+        test_dataset = Dataset.from_list(_process_df(input_df))
+
+
+        # map maps function to each "row" in the dataset
+        # aka the data in the immediate nesting
+        datasets = test_dataset.map(
+            _preprocess_function,
+            batched=True,
+            num_proc=1,
+            remove_columns=test_dataset.column_names,
+        )
+        # datasets = _preprocess_function(test_dataset)
+        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+        # create dataloader
+        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+
+
+    def generate(self):
+        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
+        MAX_GENERATE_LENGTH = 128
+
+        pred_generations = []
+        pred_labels = []
+
+        print("start generation")
+        for batch in tqdm(self.dataloader):
+            # Inference in batches
+            input_ids = batch['input_ids']
+            attention_mask = batch['attention_mask']
+            # save labels too
+            pred_labels.extend(batch['labels'])
+
+
+            # Move to GPU if available
+            input_ids = input_ids.to(device)
+            attention_mask = attention_mask.to(device)
+            self.model.to(device)
+
+            # Perform inference
+            with torch.no_grad():
+                outputs = self.model.generate(input_ids,
+                                              attention_mask=attention_mask,
+                                              max_length=MAX_GENERATE_LENGTH)
+
+            # Decode the output and print the results
+            pred_generations.extend(outputs.to("cpu"))
+
+
+
+        # %%
+        # extract sequence and decode
+        def extract_seq(tokens, start_value, end_value):
+            if start_value not in tokens or end_value not in tokens:
+                return None  # Or handle this case according to your requirements
+            start_id = np.where(tokens == start_value)[0][0]
+            end_id = np.where(tokens == end_value)[0][0]
+
+            return tokens[start_id+1:end_id]
+
+
+        def process_tensor_output(tokens):
+            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
+            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
+            p_thing = None
+            p_property = None
+            if (thing_seq is not None):
+                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
+            if (property_seq is not None):
+                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
+            return p_thing, p_property
+
+        # decode prediction labels
+        def decode_preds(tokens_list):
+            thing_prediction_list = []
+            property_prediction_list = []
+            for tokens in tokens_list:
+                p_thing, p_property = process_tensor_output(tokens)
+                thing_prediction_list.append(p_thing)
+                property_prediction_list.append(p_property)
+            return thing_prediction_list, property_prediction_list
+
+        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
+        return thing_prediction_list, property_prediction_list
+
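
Illustrative sketch of the span-extraction step in inference.py above: given a generated id sequence, extract_seq slices out the ids between the thing/property marker ids 32100-32103 and process_tensor_output decodes them. The non-marker ids here are made up for the example:

    import numpy as np

    # fake generated sequence: <pad> <THING_START> ... <THING_END> <PROPERTY_START> ... <PROPERTY_END> </s>
    tokens = np.array([0, 32100, 511, 212, 32101, 32102, 784, 32103, 1])
    start_id = np.where(tokens == 32100)[0][0]
    end_id = np.where(tokens == 32101)[0][0]
    thing_ids = tokens[start_id + 1:end_id]  # -> array([511, 212])
    print(thing_ids)
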
diff --git a/test/mapping/output.txt b/test/mapping/output.txt
new file mode 100644
index 0000000..da49e44
--- /dev/null
+++ b/test/mapping/output.txt
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9171793658305727
+Accuracy for fold 2: 0.9051401869158878
+Accuracy for fold 3: 0.9688755020080321
+Accuracy for fold 4: 0.9624167459562322
+Accuracy for fold 5: 0.8896014658726523
diff --git a/test/mapping/predict.py b/test/mapping/predict.py
new file mode 100644
index 0000000..bf87cb0
--- /dev/null
+++ b/test/mapping/predict.py
@@ -0,0 +1,71 @@
+
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+checkpoint_directory = '../../train/baseline'
+
+def infer_and_select(fold):
+    print(f"Inference for fold {fold}")
+    # import test data
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # get target data
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+    # processing to help with selection later
+    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
+
+
+    ##########################################
+    # run inference
+    # checkpoint
+    # Use glob to find matching paths
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+    # Use glob to find matching paths
+    # path is usually checkpoint_fold_1/checkpoint-
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=256, max_length=128)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # add labels too
+    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
+    # Convert the list to a Pandas DataFrame
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
+    df = pd.concat([df, df_out], axis=1)
+
+    # we can save the t5 generation output here
+    # df.to_parquet(f"exports/fold_{fold}/t5_output.parquet")
+
+    # here we want to evaluate mapping accuracy within the valid in mdm data only
+    in_mdm = df['MDM']
+    condition_correct_thing = df['p_thing'] == df['thing']
+    condition_correct_property = df['p_property'] == df['property']
+    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm)
+    pred_correct_proportion = prediction_mdm_correct/sum(in_mdm)
+
+    # write output to file output.txt
+    with open("output.txt", "a") as f:
+        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
+
+###########################################
+# Execute for all folds
+
+# reset file before writing to it
+with open("output.txt", "w") as f:
+    print('', file=f)
+
+for fold in [1,2,3,4,5]:
+    infer_and_select(fold)
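
Sketch of the fold-level accuracy written to test/mapping/output.txt above: a prediction only counts when the row is an in-MDM row and both generated fields match the ground truth. The toy frame below is made up and only mirrors the column names used in predict.py:

    import pandas as pd

    df = pd.DataFrame({
        'thing': ['A', 'B', 'C'], 'property': ['x', 'y', 'z'],
        'p_thing': ['A', 'B', 'C'], 'p_property': ['x', 'q', 'z'],
        'MDM': [True, True, False],
    })
    correct = (df['p_thing'] == df['thing']) & (df['p_property'] == df['property']) & df['MDM']
    print(sum(correct) / sum(df['MDM']))  # 0.5
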
diff --git a/test/selection/.gitignore b/test/selection/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/test/selection/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/test/selection/inference.py b/test/selection/inference.py
new file mode 100644
index 0000000..896cb72
--- /dev/null
+++ b/test/selection/inference.py
@@ -0,0 +1,164 @@
+import torch
+from torch.utils.data import DataLoader
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+)
+import glob
+import os
+import pandas as pd
+from tqdm import tqdm
+from datasets import Dataset
+import numpy as np
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+
+class Inference():
+    tokenizer: T5TokenizerFast
+    model: torch.nn.Module
+    dataloader: DataLoader
+
+    def __init__(self, checkpoint_path):
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+
+
+    def _create_tokenizer(self):
+        # %%
+        # load tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # Define additional special tokens
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
+        # Add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        # load model
+        # Define the directory and the pattern
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # set model to eval
+        self.model = model.eval()
+
+
+
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output
+        """
+        print("preparing dataloader")
+        # convert each dataframe row into a dictionary
+        # outputs a list of dictionaries
+        def _process_df(df):
+            output_list = [{
+                'input': f"{row['tag_description']}",
+                'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+            } for _, row in df.iterrows()]
+
+            return output_list
+
+        def _preprocess_function(example):
+            input = example['input']
+            target = example['output']
+            # text_target sets the corresponding label to inputs
+            # there is no need to create a separate 'labels'
+            model_inputs = self.tokenizer(
+                input,
+                text_target=target,
+                max_length=max_length,
+                return_tensors="pt",
+                padding='max_length',
+                truncation=True,
+            )
+            return model_inputs
+
+        test_dataset = Dataset.from_list(_process_df(input_df))
+
+
+        # map maps function to each "row" in the dataset
+        # aka the data in the immediate nesting
+        datasets = test_dataset.map(
+            _preprocess_function,
+            batched=True,
+            num_proc=1,
+            remove_columns=test_dataset.column_names,
+        )
+        # datasets = _preprocess_function(test_dataset)
+        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+        # create dataloader
+        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+
+
+    def generate(self):
+        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
+        MAX_GENERATE_LENGTH = 128
+
+        pred_generations = []
+        pred_labels = []
+
+        print("start generation")
+        for batch in tqdm(self.dataloader):
+            # Inference in batches
+            input_ids = batch['input_ids']
+            attention_mask = batch['attention_mask']
+            # save labels too
+            pred_labels.extend(batch['labels'])
+
+
+            # Move to GPU if available
+            input_ids = input_ids.to(device)
+            attention_mask = attention_mask.to(device)
+            self.model.to(device)
+
+            # Perform inference
+            with torch.no_grad():
+                outputs = self.model.generate(input_ids,
+                                              attention_mask=attention_mask,
+                                              max_length=MAX_GENERATE_LENGTH)
+
+            # Decode the output and print the results
+            pred_generations.extend(outputs.to("cpu"))
+
+
+
+        # %%
+        # extract sequence and decode
+        def extract_seq(tokens, start_value, end_value):
+            if start_value not in tokens or end_value not in tokens:
+                return None  # Or handle this case according to your requirements
+            start_id = np.where(tokens == start_value)[0][0]
+            end_id = np.where(tokens == end_value)[0][0]
+
+            return tokens[start_id+1:end_id]
+
+
+        def process_tensor_output(tokens):
+            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
+            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
+            p_thing = None
+            p_property = None
+            if (thing_seq is not None):
+                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
+            if (property_seq is not None):
+                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
+            return p_thing, p_property
+
+        # decode prediction labels
+        def decode_preds(tokens_list):
+            thing_prediction_list = []
+            property_prediction_list = []
+            for tokens in tokens_list:
+                p_thing, p_property = process_tensor_output(tokens)
+                thing_prediction_list.append(p_thing)
+                property_prediction_list.append(p_property)
+            return thing_prediction_list, property_prediction_list
+
+        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
+        return thing_prediction_list, property_prediction_list
+
diff --git a/test/selection/output.txt b/test/selection/output.txt
new file mode 100644
index 0000000..245955f
--- /dev/null
+++ b/test/selection/output.txt
@@ -0,0 +1,56 @@
+
+********************************************************************************
+Statistics for fold 1
+tp: 1792
+tn: 10533
+fp: 428
+fn: 321
+fold: 1
+accuracy: 0.9427107235735047
+f1_score: 0.827140549273021
+precision: 0.8072072072072072
+recall: 0.8480832938949361
+********************************************************************************
+Statistics for fold 2
+tp: 1875
+tn: 8189
+fp: 393
+fn: 265
+fold: 2
+accuracy: 0.9386308524529006
+f1_score: 0.8507259528130672
+precision: 0.8267195767195767
+recall: 0.8761682242990654
+********************************************************************************
+Statistics for fold 3
+tp: 1831
+tn: 7455
+fp: 408
+fn: 161
+fold: 3
+accuracy: 0.9422628107559614
+f1_score: 0.8655164263767431
+precision: 0.8177757927646271
+recall: 0.9191767068273092
+********************************************************************************
+Statistics for fold 4
+tp: 1909
+tn: 12866
+fp: 483
+fn: 193
+fold: 4
+accuracy: 0.9562487864863116
+f1_score: 0.8495772140631954
+precision: 0.7980769230769231
+recall: 0.9081826831588963
+********************************************************************************
+Statistics for fold 5
+tp: 1928
+tn: 10359
+fp: 427
+fn: 255
+fold: 5
+accuracy: 0.9474130619168787
+f1_score: 0.8497135301895108
+precision: 0.818683651804671
+recall: 0.8831882730187814
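
The metrics in the output.txt above follow the usual confusion-matrix definitions used in test/selection/predict.py; a quick check against the fold 1 counts (tp=1792, tn=10533, fp=428, fn=321):

    tp, tn, fp, fn = 1792, 10533, 428, 321
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * tp / (2 * tp + fp + fn)
    print(accuracy, precision, recall, f1)  # ~0.9427, ~0.8072, ~0.8481, ~0.8271
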
diff --git a/test/selection/predict.py b/test/selection/predict.py
new file mode 100644
index 0000000..2338cf9
--- /dev/null
+++ b/test/selection/predict.py
@@ -0,0 +1,129 @@
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+# directory for checkpoints
+checkpoint_directory = '../../train/baseline'
+
+def infer_and_select(fold):
+    # import test data
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # get target data
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+    # processing to help with selection later
+    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
+
+
+    ##########################################
+    # run inference
+    # checkpoint
+    # Use glob to find matching paths
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+    # Use glob to find matching paths
+    # path is usually checkpoint_fold_1/checkpoint-
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=256, max_length=64)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # add labels too
+    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
+    # Convert the list to a Pandas DataFrame
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
+    df = pd.concat([df, df_out], axis=1)
+
+    ##########################################
+    # Process the dataframe for selection
+
+    # we start to cull predictions from here
+    data_master_path = f"../../data_import/exports/data_model_master_export.csv"
+    df_master = pd.read_csv(data_master_path, skipinitialspace=True)
+    data_mapping = df
+    # Generate patterns
+    data_mapping['thing_pattern'] = data_mapping['thing'].str.replace(r'\d', '#', regex=True)
+    data_mapping['property_pattern'] = data_mapping['property'].str.replace(r'\d', '#', regex=True)
+    data_mapping['pattern'] = data_mapping['thing_pattern'] + " " + data_mapping['property_pattern']
+    df_master['master_pattern'] = df_master['thing'] + " " + df_master['property']
+    # Create a set of unique patterns from master for fast lookup
+    master_patterns = set(df_master['master_pattern'])
+    # thing_patterns = set(df_master['thing'])
+    # Check each pattern in data_mapping if it exists in df_master and assign the "MDM" field
+    data_mapping['MDM'] = data_mapping['pattern'].apply(lambda x: x in master_patterns)
+
+    # check if prediction is in MDM
+    data_mapping['p_thing_pattern'] = data_mapping['p_thing'].str.replace(r'\d', '#', regex=True)
+    data_mapping['p_property_pattern'] = data_mapping['p_property'].str.replace(r'\d', '#', regex=True)
+    data_mapping['p_pattern'] = data_mapping['p_thing_pattern'] + " " + data_mapping['p_property_pattern']
+    data_mapping['p_MDM'] = data_mapping['p_pattern'].apply(lambda x: x in master_patterns)
+
+    df = data_mapping
+
+    # we can save the t5 generation output here
+    # df.to_parquet(f"exports/fold_{fold}/t5_output.parquet")
+
+
+
+    condition1 = df['MDM']
+    condition2 = df['p_MDM']
+
+    condition_correct_thing = df['p_thing'] == df['thing']
+    condition_correct_property = df['p_property'] == df['property']
+    match = sum(condition1 & condition2)
+    fn = sum(condition1 & ~condition2)
+    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)
+
+    # print("mdm match predicted mdm: ", match)
+    # print("mdm but not predicted mdm: ", fn)  # 56 - false negative
+    # print("total mdm: ", sum(condition1))  # 2113
+    # print("total predicted mdm: ", sum(condition2))  # 6896 - a lot of false positives
+    # print("correct mdm predicted", prediction_mdm_correct)
+
+
+    # selection
+    ###########################################
+    # we now have to perform selection
+    # we restrict to predictions of a class of a ship
+    # then perform similarity selection with in-distribution data
+    # the magic is in performing per-class selection, not global
+    # import importlib
+    import selection
+    # importlib.reload(selection)
+    selector = selection.Selector(input_df=df, reference_df=train_df, fold=fold)
+    tp, tn, fp, fn = selector.run_selection(checkpoint_path=checkpoint_path)
+
+
+    # write output to file output.txt
+    with open("output.txt", "a") as f:
+        print(80 * '*', file=f)
+        print(f'Statistics for fold {fold}', file=f)
+        print(f"tp: {tp}", file=f)
+        print(f"tn: {tn}", file=f)
+        print(f"fp: {fp}", file=f)
+        print(f"fn: {fn}", file=f)
+        print(f"fold: {fold}", file=f)
+        print("accuracy: ", (tp+tn)/(tp+tn+fp+fn), file=f)
+        print("f1_score: ", (2*tp)/((2*tp) + fp + fn), file=f)
+        print("precision: ", (tp)/(tp+fp), file=f)
+        print("recall: ", (tp)/(tp+fn), file=f)
+
+###########################################
+# Execute for all folds
+
+# reset file before writing to it
+with open("output.txt", "w") as f:
+    print('', file=f)
+
+for fold in [1,2,3,4,5]:
+    infer_and_select(fold)
diff --git a/test/selection/selection.py b/test/selection/selection.py
new file mode 100644
index 0000000..fdbf292
--- /dev/null
+++ b/test/selection/selection.py
@@ -0,0 +1,187 @@
+import pandas as pd
+import numpy as np
+from typing import List
+from tqdm import tqdm
+from utils import Retriever, cosine_similarity_chunked
+
+class Selector():
+    input_df: pd.DataFrame
+    reference_df: pd.DataFrame
+    ships_list: List[int]
+    fold: int
+
+    def __init__(self, input_df, reference_df, fold):
+        self.ships_list = sorted(list(set(input_df['ships_idx'])))
+        self.input_df = input_df
+        self.reference_df = reference_df
+        self.fold = fold
+
+
+    def run_selection(self, checkpoint_path):
+
+        def generate_input_list(df):
+            input_list = []
+            for _, row in df.iterrows():
+                # name = f"{row['tag_name']}"
+                desc = f"{row['tag_description']}"
+                # element = f"{name}{desc}"
+                element = f"{desc}"
+                input_list.append(element)
+            return input_list
+
+        # given a dataframe, return the single idx of the entry that has the highest
+        # match with the embedding
+        def selection(cos_sim_matrix, condition_source, condition_target):
+            # subset_matrix = cos_sim_matrix[condition_source]
+            # except we are subsetting 2D matrix (row, column)
+            subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
+            # we select top k here
+            # Get the indices of the top 5 maximum values along axis 1
+            top_k = 1
+            top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:]  # Get indices of top k values
+
+            # Get the values of the top 5 maximum scores
+            top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
+
+            # Calculate the average of the top 5 scores along axis 1
+            y_scores = np.mean(top_k_values, axis=1)
+            max_idx = np.argmax(y_scores)
+            max_score = y_scores[max_idx]
+            # convert boolean to indices (1,2,3)
+            condition_indices = np.where(condition_source)[0]
+            max_idx = condition_indices[max_idx]
+
+            return max_idx, max_score
+
+
+        # prepare reference embed
+        train_data = list(generate_input_list(self.reference_df))
+        # Define the directory and the pattern
+        retriever_train = Retriever(train_data, checkpoint_path)
+        retriever_train.make_mean_embedding(batch_size=64)
+        train_embed = retriever_train.embeddings
+
+        # take the inputs for df_sub
+        test_data = list(generate_input_list(self.input_df))
+        retriever_test = Retriever(test_data, checkpoint_path)
+        retriever_test.make_mean_embedding(batch_size=64)
+        test_embed = retriever_test.embeddings
+
+
+
+        # precision_list = []
+        # recall_list = []
+        tp_accumulate = 0
+        tn_accumulate = 0
+        fp_accumulate = 0
+        fn_accumulate = 0
+        THRESHOLD = 0.9
+        for ship_idx in self.ships_list:
+            print(ship_idx)
+            # we select a ship and select only data exhibiting MDM pattern in the predictions
+            ship_mask = (self.input_df['ships_idx'] == ship_idx) & (self.input_df['p_MDM'])
+            df_ship = self.input_df[ship_mask].reset_index(drop=True)
+            # we then try to make a dataframe for each thing_property attribute
+            df_ship['thing_property'] = df_ship['p_thing'] + " " + df_ship['p_property']
+            unique_patterns = list(set(df_ship['thing_property']))
+            condition_list = []
+            for pattern in unique_patterns:
+                # we obtain the boolean mask to subset the source and target entries
+                condition_source = (df_ship['thing_property'] == pattern)
+                condition_target = (self.reference_df['thing_property'] == pattern)
+                item = {'condition_source': condition_source,
+                        'condition_target': condition_target}
+                condition_list.append(item)
+
+            # subset part of self.input_df that belongs to the ship
+            test_embed_subset = test_embed[ship_mask]
+            cos_sim_matrix = cosine_similarity_chunked(test_embed_subset, train_embed, chunk_size=8).cpu().numpy()
+
+
+            # for each sub_df, we have to select the best candidate
+            # we will do this by finding which desc input has the highest similarity with train data
+            all_idx_list = []
+            selected_idx_list = []
+            similarity_score = []
+            for item in tqdm(condition_list):
+                condition_source = item['condition_source']
+                condition_target = item['condition_target']
+                # if there is no equivalent data in target, we skip
+                if sum(condition_target) == 0:
+                    pass
+                # if there is equivalent data in target, we perform selection among source
+                # by top-k highest similarity with targets
+                else:
+                    # idx is with respect
+                    max_idx, max_score = selection(
+                        cos_sim_matrix, condition_source, condition_target
+                    )
+                    all_idx_list.append(max_idx)
+                    similarity_score.append(max_score)
+                    # implement thresholding
+                    if max_score > THRESHOLD:
+                        selected_idx_list.append(max_idx)
+
+            # let us tag the df_ship with the respective 'selected' and 'ood' tags
+            df_ship['selected'] = False
+            df_ship.loc[all_idx_list, 'selected'] = True
+            df_ship['ood'] = 0.0
+            df_ship.loc[all_idx_list, 'ood'] = similarity_score
+
+            # we now split the dataframe by p_mdm
+            # explanation:
+            # we first separated our ship into p_mdm and non p_mdm
+            # we only select final in-mdm prediction from p_mdm subset
+            # anything that is not selected and from non-p_mdm is predicted not in mdm
+
+            # get our final prediction
+            df_subset_predicted_true = df_ship.loc[selected_idx_list]
+            # take the set difference between df_ship's index and the given list
+            inverse_list = df_ship.index.difference(selected_idx_list).to_list()
+            df_subset_predicted_false = df_ship.loc[inverse_list]
+
+            not_p_mdm_mask = (self.input_df['ships_idx'] == ship_idx) & (~self.input_df['p_MDM'])
+            # this is the part we don't care about
+            df_not_p_mdm = self.input_df[not_p_mdm_mask].reset_index(drop=True)
+
+            # concat
+            df_false = pd.concat([df_subset_predicted_false, df_not_p_mdm], axis=0)
+            assert(len(df_false) + len(df_subset_predicted_true) == sum(self.input_df['ships_idx'] == ship_idx))
+
+            # we want to return a df with the final prediction
+            # a bit dirty, but we re-use the fields
+            df_false['p_MDM'] = False
+            df_subset_predicted_true['p_MDM'] = True
+
+
+            # save ship for analysis later
+            # df_return = pd.concat([df_false, df_subset_predicted_true], axis=0)
+            # df_return.to_parquet(f'exports/fold_{self.fold}/ship_{ship_idx}.parquet')
+
+
+
+
+            # true positive -> predicted in mdm, actual in mdm
+            # we get all the final predictions that are also found in MDM
+            true_positive = sum(df_subset_predicted_true['MDM'])
+            # true negative -> predicted not in mdm, and not found in MDM
+            # we negate the condition to get those that are not found in MDM
+            true_negative = sum(~df_false['MDM'])
+            # false positive -> predicted in mdm, not found in mdm
+            false_positive = sum(~df_subset_predicted_true['MDM'])
+            # false negative -> predicted not in mdm, found in mdm
+            false_negative = sum(df_false['MDM'])
+
+
+            tp_accumulate = tp_accumulate + true_positive
+            tn_accumulate = tn_accumulate + true_negative
+            fp_accumulate = fp_accumulate + false_positive
+            fn_accumulate = fn_accumulate + false_negative
+
+
+
+        total_sum = (tp_accumulate + tn_accumulate + fp_accumulate + fn_accumulate)
+        # ensure that all entries are accounted for
+        assert(total_sum == len(self.input_df))
+        return tp_accumulate, tn_accumulate, fp_accumulate, fn_accumulate
+
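
Standalone sketch of the per-pattern selection step implemented in Selector.run_selection above: within one predicted thing/property pattern, pick the source row whose top-k similarity to the reference rows is highest, and keep it only if it clears the 0.9 threshold. The similarity matrix below is a toy example, not real data:

    import numpy as np

    cos_sim = np.array([[0.95, 0.20], [0.40, 0.30], [0.10, 0.92]])  # rows: source, cols: target
    condition_source = np.array([True, True, False])  # rows sharing the current pattern
    condition_target = np.array([True, True])         # reference rows sharing the pattern
    top_k = 1
    sub = cos_sim[np.ix_(condition_source, condition_target)]
    scores = np.mean(np.take_along_axis(sub, np.argsort(sub, axis=1)[:, -top_k:], axis=1), axis=1)
    best = np.where(condition_source)[0][np.argmax(scores)]
    print(best, scores.max(), scores.max() > 0.9)  # row 0 is selected and passes the threshold
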
diff --git a/test/selection/utils.py b/test/selection/utils.py
new file mode 100644
index 0000000..b2f2116
--- /dev/null
+++ b/test/selection/utils.py
@@ -0,0 +1,76 @@
+import torch
+from tqdm import tqdm
+from transformers import AutoTokenizer
+from transformers import AutoModelForSeq2SeqLM
+import torch.nn.functional as F
+
+
+
+class Retriever:
+    def __init__(self, input_texts, model_checkpoint):
+        # we need to generate the embedding from list of input strings
+        self.embeddings = []
+        self.inputs = input_texts
+        model_checkpoint = model_checkpoint
+        self.tokenizer = AutoTokenizer.from_pretrained("t5-base", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # define additional special tokens
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
+        # add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+        self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
+        # device = "cpu"
+        model.to(self.device)
+        self.model = model.eval()
+
+
+
+
+    def make_mean_embedding(self, batch_size=32):
+        all_embeddings = self.embeddings
+        input_texts = self.inputs
+
+        for i in range(0, len(input_texts), batch_size):
+            batch_texts = input_texts[i:i+batch_size]
+            # Tokenize the input text
+            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
+            input_ids = inputs.input_ids.to(self.device)
+            attention_mask = inputs.attention_mask.to(self.device)
+
+
+            # Pass the input through the encoder and retrieve the embeddings
+            with torch.no_grad():
+                encoder_outputs = self.model.encoder(input_ids, attention_mask=attention_mask)
+                embeddings = encoder_outputs.last_hidden_state
+
+            # Compute the mean pooling of the token embeddings
+            # mean_embedding = embeddings.mean(dim=1)
+            mean_embedding = (embeddings * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)
+            all_embeddings.append(mean_embedding)
+
+        # remove the batch list and makes a single large tensor, dim=0 increases row-wise
+        all_embeddings = torch.cat(all_embeddings, dim=0)
+
+        self.embeddings = all_embeddings
+
+def cosine_similarity_chunked(batch1, batch2, chunk_size=16):
+    batch1_size = batch1.size(0)
+    batch2_size = batch2.size(0)
+
+    # Prepare an empty tensor to store results
+    cos_sim = torch.empty(batch1_size, batch2_size, device=batch1.device)
+
+    # Process batch1 in chunks
+    for i in range(0, batch1_size, chunk_size):
+        batch1_chunk = batch1[i:i + chunk_size]  # Get chunk of batch1
+
+        # Expand batch1 chunk and entire batch2 for comparison
+        batch1_chunk_exp = batch1_chunk.unsqueeze(1)  # Shape: (chunk_size, 1, seq_len)
+        batch2_exp = batch2.unsqueeze(0)  # Shape: (1, batch2_size, seq_len)
+
+        # Compute cosine similarity for the chunk and store it in the final tensor
+        cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1)
+
+    return cos_sim
+
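
Small usage sketch for the two helpers above (mean-pooled encoder embeddings, then chunked cosine similarity); the checkpoint path and the example strings are hypothetical, only the shapes matter:

    from utils import Retriever, cosine_similarity_chunked

    texts_a = ["centrifugal pump speed", "main engine exhaust temp"]
    texts_b = ["pump speed", "exhaust gas temperature"]
    ckpt = "checkpoint_fold_1/checkpoint-1000"  # hypothetical trained checkpoint
    retriever_a = Retriever(texts_a, ckpt)
    retriever_a.make_mean_embedding(batch_size=2)
    retriever_b = Retriever(texts_b, ckpt)
    retriever_b.make_mean_embedding(batch_size=2)
    sim = cosine_similarity_chunked(retriever_a.embeddings, retriever_b.embeddings, chunk_size=16)
    print(sim.shape)  # torch.Size([2, 2])
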
diff --git a/train/baseline/.gitignore b/train/baseline/.gitignore
new file mode 100644
index 0000000..2e7f3f7
--- /dev/null
+++ b/train/baseline/.gitignore
@@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log/
diff --git a/train/baseline/train.py b/train/baseline/train.py
new file mode 100644
index 0000000..f79f2fc
--- /dev/null
+++ b/train/baseline/train.py
@@ -0,0 +1,195 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+    DataCollatorForSeq2Seq,
+    Seq2SeqTrainer,
+    EarlyStoppingCallback,
+    Seq2SeqTrainingArguments
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+
+
+torch.set_float32_matmul_precision('high')
+
+# outputs a list of dictionaries
+def process_df_to_dict(df):
+    output_list = []
+    for _, row in df.iterrows():
+        desc = f"{row['tag_description']}"
+        element = {
+            'input' : f"{desc}",
+            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_split_dataset(fold):
+    # train
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # valid
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
+    validation_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    combined_data = DatasetDict({
+        'train': Dataset.from_list(process_df_to_dict(train_df)),
+        'validation' : Dataset.from_list(process_df_to_dict(validation_df)),
+    })
+    return combined_data
+
+
+# function to perform training for a given fold
+def train(fold):
+    save_path = f'checkpoint_fold_{fold}'
+    split_datasets = create_split_dataset(fold)
+
+    # prepare tokenizer
+
+    model_checkpoint = "t5-small"
+    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
+    # Add the additional special tokens to the tokenizer
+    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    max_length = 120
+
+    # given a dataset entry, run it through the tokenizer
+    def preprocess_function(example):
+        input = example['input']
+        target = example['output']
+        # text_target sets the corresponding label to inputs
+        # there is no need to create a separate 'labels'
+        model_inputs = tokenizer(
+            input,
+            text_target=target,
+            max_length=max_length,
+            truncation=True,
+            padding=True
+        )
+        return model_inputs
+
+    # map maps function to each "row" in the dataset
+    # aka the data in the immediate nesting
+    tokenized_datasets = split_datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns=split_datasets["train"].column_names,
+    )
+
+    # https://github.com/huggingface/transformers/pull/28414
+    # model_checkpoint = "google/t5-efficient-tiny"
+    # device_map set to auto to force it to load contiguous weights
+    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
+
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+    # important! after extending tokens vocab
+    model.resize_token_embeddings(len(tokenizer))
+
+    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+    metric = evaluate.load("sacrebleu")
+
+
+    def compute_metrics(eval_preds):
+        preds, labels = eval_preds
+        # In case the model returns more than the prediction logits
+        if isinstance(preds, tuple):
+            preds = preds[0]
+
+        decoded_preds = tokenizer.batch_decode(preds,
+                                               skip_special_tokens=False)
+
+        # Replace -100s in the labels as we can't decode them
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+        decoded_labels = tokenizer.batch_decode(labels,
+                                                skip_special_tokens=False)
+
+        # Remove padding tokens from decoded predictions and labels
+        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
+        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
+
+        # Some simple post-processing
+        # decoded_preds = [pred.strip() for pred in decoded_preds]
+        # decoded_labels = [[label.strip()] for label in decoded_labels]
+        # print(decoded_preds, decoded_labels)
+
+        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
+        return {"bleu": result["score"]}
+
+
+    # Generation Config
+    # from transformers import GenerationConfig
+    gen_config = model.generation_config
+    gen_config.max_length = 64
+
+    # compile
+    # model = torch.compile(model, backend="inductor", dynamic=True)
+
+
+    # Trainer
+
+    args = Seq2SeqTrainingArguments(
+        f"{save_path}",
+        eval_strategy="epoch",
+        logging_dir="tensorboard-log",
+        logging_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+        learning_rate=1e-3,
+        per_device_train_batch_size=64,
+        per_device_eval_batch_size=64,
+        auto_find_batch_size=False,
+        ddp_find_unused_parameters=False,
+        weight_decay=0.01,
+        save_total_limit=1,
+        num_train_epochs=40,
+        predict_with_generate=True,
+        bf16=True,
+        push_to_hub=False,
+        generation_config=gen_config,
+        remove_unused_columns=False,
+    )
+
+
+    trainer = Seq2SeqTrainer(
+        model,
+        args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["validation"],
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+    )
+
+    # uncomment to load training from checkpoint
+    # checkpoint_path = 'default_40_1/checkpoint-5600'
+    # trainer.train(resume_from_checkpoint=checkpoint_path)
+
+    trainer.train()
+
+# execute training
+for fold in [1,2,3,4,5]:
+    print(fold)
+    train(fold)
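
Sanity-check sketch (not part of the patch): the span-extraction ids 32100-32103 used in test/*/inference.py only line up if the thing/property markers are the first tokens added on top of the 32100-entry t5-small tokenizer vocabulary, and if the embedding matrix is resized after adding them, as train.py does above. The marker strings follow the token list used in this patch:

    from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM

    tokenizer = T5TokenizerFast.from_pretrained("t5-small")
    tokenizer.add_special_tokens({"additional_special_tokens":
        ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>"]})
    print(tokenizer.convert_tokens_to_ids("<THING_START>"))  # expected: 32100

    model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
    model.resize_token_embeddings(len(tokenizer))
    print(model.get_input_embeddings().weight.shape[0])  # now matches len(tokenizer): 32104
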