Feat: added more classification and mapping variations

Feat: added grid-search for threshold in similarity-classifier Feat: added more abbreviation rules
2024-11-25 18:15:28 +09:00 · 2024-11-25 18:15:28 +09:00 · ff6e11a3c0
parent 1f3970459f
commit ff6e11a3c0
43 changed files with 2905 additions and 50558 deletions
--- a/analysis/bert/find_closest_mapping_complete.py
+++ b/analysis/bert/find_closest_mapping_complete.py
@ -0,0 +1,293 @@
+
+# %%
+import pandas as pd
+from utils import Retriever, cosine_similarity_chunked
+import os
+import glob
+import numpy as np
+
+# %%
+fold = 5
+data_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
+df = pd.read_csv(data_path, skipinitialspace=True)
+
+# %%
+# subset to mdm
+df = df[df['MDM']]
+
+# create new fields 'mapping' and 'p_mapping'
+# these are analogous to 'pattern', where we combine 'thing' and 'property' without replacing the numbers
+df['mapping'] = df['thing'] + ' ' + df['property']
+df['p_mapping'] = df['p_thing'] + ' ' + df['p_property']
+
+
+thing_condition = df['p_thing'] == df['thing']
+error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]
+
+property_condition = df['p_property'] == df['property']
+error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]
+
+correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]
+
+test_df = df
+
+# %%
+print(len(error_thing_df))
+print(len(error_property_df))
+
+# %%
+# thing_df.to_html('thing_errors.html')
+# property_df.to_html('property_errors.html')
+
+##########################################
+# what we need now is to understand why the model is making these mispredictions
+# import train data and test data
+# %%
+class Embedder():
+    """Turn the rows of a DataFrame into model embeddings.
+
+    Every row is serialised as
+    ``<DESC>{tag_description}<DESC><UNIT>{unit}<UNIT>`` and the resulting
+    strings are passed to ``Retriever`` (imported from ``utils``).
+    """
+    input_df: pd.DataFrame
+    fold: int  # declared here but never assigned inside this class
+
+    def __init__(self, input_df):
+        self.input_df = input_df
+
+    def make_embedding(self, checkpoint_path):
+        """Embed all rows of ``self.input_df`` using ``checkpoint_path``.
+
+        Returns ``retriever.embeddings`` after moving it to the CPU.
+        """
+        # one "<DESC>..<DESC><UNIT>..<UNIT>" string per row, in row order
+        texts = [
+            f"<DESC>{row['tag_description']}<DESC>" + f"<UNIT>{row['unit']}<UNIT>"
+            for _, row in self.input_df.iterrows()
+        ]
+        retriever = Retriever(texts, checkpoint_path)
+        retriever.make_embedding(batch_size=64)
+        return retriever.embeddings.to('cpu')
+
+
+
+# %%
+data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+train_df = pd.read_csv(data_path, skipinitialspace=True)
+train_df['mapping'] = train_df['thing'] + ' ' + train_df['property']
+
+checkpoint_directory = "../../train/classification_bert_complete_desc_unit"
+directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+# Use glob to find matching paths
+# path is usually checkpoint_fold_1/checkpoint-<step number>
+# we are guaranteed to save only 1 checkpoint from training
+pattern = 'checkpoint-*'
+checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+train_embedder = Embedder(input_df=train_df)
+train_embeds = train_embedder.make_embedding(checkpoint_path)
+
+test_embedder = Embedder(input_df=test_df)
+test_embeds = test_embedder.make_embedding(checkpoint_path)
+
+
+
+# %%
+# test embeds are inputs since we are looking back at train data
+cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy()
+
+# %%
+# the following function takes in a full cos_sim_matrix
+# condition_source: boolean selectors of the source embedding
+# condition_target: boolean selectors of the target embedding
+def find_closest(cos_sim_matrix, condition_source, condition_target, top_k=3):
+    """Return the ``top_k`` most similar targets for every selected source row.
+
+    cos_sim_matrix: full 2D similarity matrix (rows = source, columns = target)
+    condition_source: boolean selector over the rows of ``cos_sim_matrix``
+    condition_target: boolean selector over the columns of ``cos_sim_matrix``
+    top_k: number of neighbours kept per row, must be >= 1
+
+    Returns ``(top_k_indices, top_k_values)``: two 2D arrays with one row per
+    selected source row, ordered from most to least similar. The indices are
+    positions inside the column subset chosen by ``condition_target``.
+
+    Raises ValueError when ``top_k`` is smaller than 1.
+    """
+    if top_k < 1:
+        # a "-0:" slice would silently return every column instead of none
+        raise ValueError("top_k must be >= 1")
+
+    # subset rows and columns of the 2D matrix at the same time
+    subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
+
+    # argsort is ascending, so the best matches sit at the end of each row;
+    # reverse EVERY row (previously only row 0 was flipped) to get best-first
+    top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:][:, ::-1]
+
+    # similarity scores belonging to those indices, in the same order
+    top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
+
+    return top_k_indices, top_k_values
+
+
+####################################################
+# special find-back code
+# %%
+def find_back_element_with_print(select_idx):
+    """Print the nearest train rows for a test row and report a mapping hit.
+
+    Looks up the test row labelled ``select_idx``, retrieves its closest train
+    rows through ``find_closest`` and prints both sides. Returns True when the
+    test 'mapping' string occurs inside at least one retrieved train 'mapping'
+    string, otherwise False.
+    """
+    row = test_df[test_df.index == select_idx]
+    description = row['tag_description'].tolist()[0]
+
+    # NOTE(review): this mask selects every test row sharing the description
+    # and only the first selected row ([0]) is used below - confirm intended
+    source_mask = test_df['tag_description'] == description
+    target_mask = np.ones(train_embeds.shape[0], dtype=bool)
+
+    neighbour_idx, neighbour_scores = find_closest(
+        cos_sim_matrix=cos_sim_matrix,
+        condition_source=source_mask,
+        condition_target=target_mask)
+
+    neighbours = train_df.iloc[neighbour_idx[0]]
+    train_mappings = neighbours['mapping'].to_list()
+    train_descs = neighbours['tag_description'].to_list()
+
+    test_mappings = row['mapping'].to_list()
+    test_descs = row['tag_description'].to_list()
+    ship_id = row['ships_idx'].to_list()[0]
+    predicted = row['p_mapping'].to_list()[0]
+
+    print("*" * 80)
+    print("idx:", select_idx)
+    print("train desc", train_descs)
+    print("train thing+property", train_mappings)
+    print("test desc", test_descs)
+    print("test thing+property", test_mappings)
+    print("predicted thing+property", predicted)
+    print("ships idx", ship_id)
+    print("score:", neighbour_scores[0])
+
+    expected = test_mappings[0]
+    # NOTE(review): `in` on two strings is a substring test, not equality
+    return any([expected in candidate for candidate in train_mappings])
+
+
+# %%
+def find_back_element(select_idx):
+    """Silent variant of the find-back check for one test row.
+
+    Returns True when the 'mapping' string of the test row labelled
+    ``select_idx`` occurs inside the 'mapping' of at least one train row
+    retrieved by ``find_closest``, otherwise False.
+    """
+    row = test_df[test_df.index == select_idx]
+    description = row['tag_description'].tolist()[0]
+
+    # NOTE(review): the mask covers all test rows with this description and
+    # only the first selected row ([0]) is used below - confirm intended
+    source_mask = test_df['tag_description'] == description
+    target_mask = np.ones(train_embeds.shape[0], dtype=bool)
+
+    neighbour_idx, _ = find_closest(
+        cos_sim_matrix=cos_sim_matrix,
+        condition_source=source_mask,
+        condition_target=target_mask)
+
+    train_mappings = train_df.iloc[neighbour_idx[0]]['mapping'].to_list()
+    expected = row['mapping'].to_list()[0]
+
+    # substring containment, evaluated for every retrieved train mapping
+    return any([expected in candidate for candidate in train_mappings])
+
+
+
+
+# %%
+# for error thing
+pattern_in_train = []
+for select_idx in error_thing_df.index:
+    result = find_back_element_with_print(select_idx)
+    print("status:", result)
+    pattern_in_train.append(result)
+
+sum(pattern_in_train)/len(pattern_in_train)
+
+###
+# for error property
+# %%
+pattern_in_train = []
+for select_idx in error_property_df.index:
+    result = find_back_element_with_print(select_idx)
+    print("status:", result)
+    pattern_in_train.append(result)
+
+sum(pattern_in_train)/len(pattern_in_train)
+
+    
+####################################################
+
+# %%
+# make function to compute similarity of closest retrieved result
+def compute_similarity(select_idx):
+    """Mean similarity between a test row and its retrieved train neighbours.
+
+    Uses ``find_closest`` on the module-level ``cos_sim_matrix`` and averages
+    the scores of the first selected source row.
+    """
+    description = test_df[test_df.index == select_idx]['tag_description'].tolist()[0]
+    source_mask = test_df['tag_description'] == description
+    # every train row is a candidate target
+    target_mask = np.ones(train_embeds.shape[0], dtype=bool)
+
+    _, neighbour_scores = find_closest(
+        cos_sim_matrix=cos_sim_matrix,
+        condition_source=source_mask,
+        condition_target=target_mask)
+
+    return np.mean(neighbour_scores[0])
+
+# %%
+def print_summary(similarity_scores):
+    """Print the mean and the 25th/50th/75th percentiles of the scores."""
+    scores = np.array(similarity_scores)
+
+    # compute both statistics before anything is printed
+    average, quartiles = np.mean(scores), np.percentile(scores, [25, 50, 75])
+
+    print("Mean:", average)
+    print("25th, 50th, 75th Percentiles:", quartiles)
+
+
+# %%
+##########################################
+# Analyze the degree of similarity differences between correct and incorrect results
+
+# %%
+# compute similarity scores for all values in error_thing_df
+similarity_thing_scores = []
+for idx in error_thing_df.index:
+    similarity_thing_scores.append(compute_similarity(idx))
+print_summary(similarity_thing_scores)
+
+
+# %%
+similarity_property_scores = []
+for idx in error_property_df.index:
+    similarity_property_scores.append(compute_similarity(idx))
+print_summary(similarity_property_scores)
+
+# %%
+similarity_correct_scores = []
+for idx in correct_df.index:
+    similarity_correct_scores.append(compute_similarity(idx))
+print_summary(similarity_correct_scores)
+
+
+
+# %%
+import matplotlib.pyplot as plt
+
+# Sample data
+list1 = similarity_thing_scores
+list2 = similarity_property_scores
+list3 = similarity_correct_scores
+
+# Plot histograms
+bins = 50
+plt.hist(list1, bins=bins, alpha=0.5, label='List 1', density=True)
+plt.hist(list2, bins=bins, alpha=0.5, label='List 2', density=True)
+plt.hist(list3, bins=bins, alpha=0.5, label='List 3', density=True)
+
+# Labels and legend
+plt.xlabel('Value')
+plt.ylabel('Frequency')
+plt.legend(loc='upper right')
+plt.title('Histograms of Three Lists')
+
+# Show plot
+plt.show()
+
+
+# %%
--- a/analysis/bert/realistic_labels.py
+++ b/analysis/bert/realistic_labels.py
@ -0,0 +1,320 @@
+
+# %%
+import pandas as pd
+from utils import Retriever, cosine_similarity_chunked
+import os
+import glob
+import numpy as np
+
+# %%
+fold = 5
+data_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
+df = pd.read_csv(data_path, skipinitialspace=True)
+
+# %%
+# subset to mdm
+df = df[df['MDM']]
+
+# create new fields 'mapping' and 'p_mapping'
+# these are analogous to 'pattern', where we combine 'thing' and 'property' without replacing the numbers
+df['mapping'] = df['thing'] + ' ' + df['property']
+df['p_mapping'] = df['p_thing'] + ' ' + df['p_property']
+
+
+thing_condition = df['p_thing'] == df['thing']
+error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]
+
+property_condition = df['p_property'] == df['property']
+error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]
+
+correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]
+
+test_df = df
+
+# %%
+print(len(error_thing_df))
+print(len(error_property_df))
+
+# %%
+# thing_df.to_html('thing_errors.html')
+# property_df.to_html('property_errors.html')
+
+##########################################
+# what we need now is to understand why the model is making these mispredictions
+# import train data and test data
+# %%
+class Embedder():
+    """Turn the rows of a DataFrame into model embeddings.
+
+    Every row is serialised as
+    ``<DESC>{tag_description}<DESC><UNIT>{unit}<UNIT>`` and the resulting
+    strings are passed to ``Retriever`` (imported from ``utils``).
+    """
+    input_df: pd.DataFrame
+    fold: int  # declared here but never assigned inside this class
+
+    def __init__(self, input_df):
+        self.input_df = input_df
+
+    def make_embedding(self, checkpoint_path):
+        """Embed all rows of ``self.input_df`` using ``checkpoint_path``.
+
+        Returns ``retriever.embeddings`` after moving it to the CPU.
+        """
+        # one "<DESC>..<DESC><UNIT>..<UNIT>" string per row, in row order
+        texts = [
+            f"<DESC>{row['tag_description']}<DESC>" + f"<UNIT>{row['unit']}<UNIT>"
+            for _, row in self.input_df.iterrows()
+        ]
+        retriever = Retriever(texts, checkpoint_path)
+        retriever.make_embedding(batch_size=64)
+        return retriever.embeddings.to('cpu')
+
+
+
+# %%
+data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+train_df = pd.read_csv(data_path, skipinitialspace=True)
+train_df['mapping'] = train_df['thing'] + ' ' + train_df['property']
+
+checkpoint_directory = "../../train/classification_bert_complete_desc_unit"
+directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+# Use glob to find matching paths
+# path is usually checkpoint_fold_1/checkpoint-<step number>
+# we are guaranteed to save only 1 checkpoint from training
+pattern = 'checkpoint-*'
+checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+train_embedder = Embedder(input_df=train_df)
+train_embeds = train_embedder.make_embedding(checkpoint_path)
+
+test_embedder = Embedder(input_df=test_df)
+test_embeds = test_embedder.make_embedding(checkpoint_path)
+
+
+
+# %%
+# test embeds are inputs since we are looking back at train data
+cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy()
+
+# %%
+# the following function takes in a full cos_sim_matrix
+# condition_source: boolean selectors of the source embedding
+# condition_target: boolean selectors of the target embedding
+def find_closest(cos_sim_matrix, condition_source, condition_target, top_k=10):
+    """Return the ``top_k`` most similar targets for every selected source row.
+
+    cos_sim_matrix: full 2D similarity matrix (rows = source, columns = target)
+    condition_source: boolean selector over the rows of ``cos_sim_matrix``
+    condition_target: boolean selector over the columns of ``cos_sim_matrix``
+    top_k: number of neighbours kept per row, must be >= 1
+
+    Returns ``(top_k_indices, top_k_values)``: two 2D arrays with one row per
+    selected source row, ordered from most to least similar. The indices are
+    positions inside the column subset chosen by ``condition_target``.
+
+    Raises ValueError when ``top_k`` is smaller than 1.
+    """
+    if top_k < 1:
+        # a "-0:" slice would silently return every column instead of none
+        raise ValueError("top_k must be >= 1")
+
+    # subset rows and columns of the 2D matrix at the same time
+    subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
+
+    # argsort is ascending, so the best matches sit at the end of each row;
+    # reverse EVERY row (previously only row 0 was flipped) to get best-first
+    top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:][:, ::-1]
+
+    # similarity scores belonging to those indices, in the same order
+    top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
+
+    return top_k_indices, top_k_values
+
+
+####################################################
+# special find-back code
+# %%
+def find_back_element_with_print(select_idx):
+    """Print the nearest train rows for a test row and report a mapping hit.
+
+    Looks up the test row labelled ``select_idx``, retrieves its closest train
+    rows through ``find_closest`` and prints both sides. Returns True when the
+    test 'mapping' string occurs inside at least one retrieved train 'mapping'
+    string, otherwise False.
+    """
+    row = test_df[test_df.index == select_idx]
+    description = row['tag_description'].tolist()[0]
+
+    # NOTE(review): this mask selects every test row sharing the description
+    # and only the first selected row ([0]) is used below - confirm intended
+    source_mask = test_df['tag_description'] == description
+    target_mask = np.ones(train_embeds.shape[0], dtype=bool)
+
+    neighbour_idx, neighbour_scores = find_closest(
+        cos_sim_matrix=cos_sim_matrix,
+        condition_source=source_mask,
+        condition_target=target_mask)
+
+    neighbours = train_df.iloc[neighbour_idx[0]]
+    train_mappings = neighbours['mapping'].to_list()
+    train_descs = neighbours['tag_description'].to_list()
+
+    test_mappings = row['mapping'].to_list()
+    test_descs = row['tag_description'].to_list()
+    ship_id = row['ships_idx'].to_list()[0]
+    predicted = row['p_mapping'].to_list()[0]
+
+    print("*" * 80)
+    print("idx:", select_idx)
+    print("train desc", train_descs)
+    print("train thing+property", train_mappings)
+    print("test desc", test_descs)
+    print("test thing+property", test_mappings)
+    print("predicted thing+property", predicted)
+    print("ships idx", ship_id)
+    print("score:", neighbour_scores[0])
+
+    expected = test_mappings[0]
+    # NOTE(review): `in` on two strings is a substring test, not equality
+    return any([expected in candidate for candidate in train_mappings])
+
+
+# %%
+def find_back_element(select_idx):
+    """Silent variant of the find-back check for one test row.
+
+    Returns True when the 'mapping' string of the test row labelled
+    ``select_idx`` occurs inside the 'mapping' of at least one train row
+    retrieved by ``find_closest``, otherwise False.
+    """
+    row = test_df[test_df.index == select_idx]
+    description = row['tag_description'].tolist()[0]
+
+    # NOTE(review): the mask covers all test rows with this description and
+    # only the first selected row ([0]) is used below - confirm intended
+    source_mask = test_df['tag_description'] == description
+    target_mask = np.ones(train_embeds.shape[0], dtype=bool)
+
+    neighbour_idx, _ = find_closest(
+        cos_sim_matrix=cos_sim_matrix,
+        condition_source=source_mask,
+        condition_target=target_mask)
+
+    train_mappings = train_df.iloc[neighbour_idx[0]]['mapping'].to_list()
+    expected = row['mapping'].to_list()[0]
+
+    # substring containment, evaluated for every retrieved train mapping
+    return any([expected in candidate for candidate in train_mappings])
+
+
+
+
+# %%
+# for entire test df
+pattern_in_train = []
+for select_idx in test_df.index:
+    result = find_back_element(select_idx)
+    # print("status:", result)
+    pattern_in_train.append(result)
+
+sum(pattern_in_train)/len(pattern_in_train)
+
+# %%
+# within pattern in train, what is the "correct" rate?
+sub_df = test_df[pattern_in_train]
+result = sub_df['mapping'] == sub_df['p_mapping']
+
+# this is the realistic label result
+print(sum(result)/len(result)) # this is the more realistic result
+
+# %%
+# for pattern not in training data, what is the "correct" rate?
+# (complement of the previous cell: rows whose pattern was NOT found among the retrieved train rows)
+sub_df = test_df[~np.array(pattern_in_train)]
+result = sub_df['mapping'] == sub_df['p_mapping']
+
+print(sum(result)/len(result))
+
+
+# %%
+# for error thing
+pattern_in_train = []
+for select_idx in error_thing_df.index:
+    result = find_back_element_with_print(select_idx)
+    print("status:", result)
+    pattern_in_train.append(result)
+
+sum(pattern_in_train)/len(pattern_in_train)
+
+###
+# for error property
+# %%
+pattern_in_train = []
+for select_idx in error_property_df.index:
+    result = find_back_element_with_print(select_idx)
+    print("status:", result)
+    pattern_in_train.append(result)
+
+sum(pattern_in_train)/len(pattern_in_train)
+
+    
+####################################################
+
+# %%
+# make function to compute similarity of closest retrieved result
+def compute_similarity(select_idx):
+    """Mean similarity between a test row and its retrieved train neighbours.
+
+    Uses ``find_closest`` on the module-level ``cos_sim_matrix`` and averages
+    the scores of the first selected source row.
+    """
+    description = test_df[test_df.index == select_idx]['tag_description'].tolist()[0]
+    source_mask = test_df['tag_description'] == description
+    # every train row is a candidate target
+    target_mask = np.ones(train_embeds.shape[0], dtype=bool)
+
+    _, neighbour_scores = find_closest(
+        cos_sim_matrix=cos_sim_matrix,
+        condition_source=source_mask,
+        condition_target=target_mask)
+
+    return np.mean(neighbour_scores[0])
+
+# %%
+def print_summary(similarity_scores):
+    """Print the mean and the 25th/50th/75th percentiles of the scores."""
+    scores = np.array(similarity_scores)
+
+    # compute both statistics before anything is printed
+    average, quartiles = np.mean(scores), np.percentile(scores, [25, 50, 75])
+
+    print("Mean:", average)
+    print("25th, 50th, 75th Percentiles:", quartiles)
+
+
+# %%
+##########################################
+# Analyze the degree of similarity differences between correct and incorrect results
+
+# %%
+# compute similarity scores for all values in error_thing_df
+similarity_thing_scores = []
+for idx in error_thing_df.index:
+    similarity_thing_scores.append(compute_similarity(idx))
+print_summary(similarity_thing_scores)
+
+
+# %%
+similarity_property_scores = []
+for idx in error_property_df.index:
+    similarity_property_scores.append(compute_similarity(idx))
+print_summary(similarity_property_scores)
+
+# %%
+similarity_correct_scores = []
+for idx in correct_df.index:
+    similarity_correct_scores.append(compute_similarity(idx))
+print_summary(similarity_correct_scores)
+
+
+
+# %%
+import matplotlib.pyplot as plt
+
+# Sample data
+list1 = similarity_thing_scores
+list2 = similarity_property_scores
+list3 = similarity_correct_scores
+
+# Plot histograms
+bins = 50
+plt.hist(list1, bins=bins, alpha=0.5, label='List 1', density=True)
+plt.hist(list2, bins=bins, alpha=0.5, label='List 2', density=True)
+plt.hist(list3, bins=bins, alpha=0.5, label='List 3', density=True)
+
+# Labels and legend
+plt.xlabel('Value')
+plt.ylabel('Frequency')
+plt.legend(loc='upper right')
+plt.title('Histograms of Three Lists')
+
+# Show plot
+plt.show()
+
+
+# %%
--- a/analysis/t5/find_closest.py
+++ b/analysis/t5/find_closest.py
@ -7,7 +7,7 @@ import glob
 import numpy as np

 # %%
-data_path = f'../data_preprocess/exports/preprocessed_data.csv'
+data_path = f'../../data_preprocess/exports/preprocessed_data.csv'
 df_pre = pd.read_csv(data_path, skipinitialspace=True)

 # %%
@ -18,8 +18,8 @@ desc_list = df_pre['tag_description'].to_list()
 [ elem for elem in desc_list if isinstance(elem, float)]
 ##########################################
 # %%
-fold = 1
-data_path = f'../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
+fold = 5
+data_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
 df = pd.read_csv(data_path, skipinitialspace=True)

 # %%
@ -74,10 +74,10 @@ class Embedder():


 # %%
-data_path = f"../data_preprocess/exports/dataset/group_{fold}/train.csv"
+data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
 train_df = pd.read_csv(data_path, skipinitialspace=True)

-checkpoint_directory = "../train/mapping_pattern"
+checkpoint_directory = "../../train/mapping_t5_complete_desc_unit"
 directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
 # Use glob to find matching paths
 # path is usually checkpoint_fold_1/checkpoint-<step number>
@ -199,12 +199,15 @@ for select_idx in error_thing_df.index:
    print("status:", result)
    pattern_in_train.append(result)

+
+# %%
+sum(pattern_in_train)/len(pattern_in_train)
 ###
 # for error property
 # %%
 pattern_in_train = []
 for select_idx in error_property_df.index:
-    result = find_back_element_with_print(select_idx)
+    result = find_back_element(select_idx)
    print("status:", result)
    pattern_in_train.append(result)

--- a/analysis/t5/find_closest_mapping_complete.py
+++ b/analysis/t5/find_closest_mapping_complete.py
@ -0,0 +1,334 @@
+
+# %%
+import pandas as pd
+from utils import Retriever, cosine_similarity_chunked
+import os
+import glob
+import numpy as np
+
+# %%
+data_path = f'../../data_preprocess/exports/preprocessed_data.csv'
+df_pre = pd.read_csv(data_path, skipinitialspace=True)
+
+# %%
+# remove nulls or NAs
+df_pre['tag_description'] = df_pre['tag_description'].fillna("NOVALUE")
+df_pre['tag_description'] = df_pre['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True)
+
+df_pre['unit'] = df_pre['unit'].fillna("NOVALUE")
+df_pre['unit'] = df_pre['unit'].replace(r'^\s*$', 'NOVALUE', regex=True)
+
+
+# %%
+# this should be >0 if we are using abbreviations processed data
+desc_list = df_pre['tag_description'].to_list()
+
+# check for floats
+# we have to eliminate presence of floats
+[ elem for elem in desc_list if isinstance(elem, float)]
+##########################################
+# %%
+fold = 5
+data_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
+df = pd.read_csv(data_path, skipinitialspace=True)
+
+# %%
+# subset to mdm
+df = df[df['MDM']]
+
+# create new fields 'mapping' and 'p_mapping'
+# these are analogous to 'pattern', where we combine 'thing' and 'property' without replacing the numbers
+df['mapping'] = df['thing'] + ' ' + df['property']
+df['p_mapping'] = df['p_thing'] + ' ' + df['p_property']
+
+
+thing_condition = df['p_thing'] == df['thing']
+error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]
+
+property_condition = df['p_property'] == df['property']
+error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]
+
+correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]
+
+test_df = df
+
+# %%
+# thing_df.to_html('thing_errors.html')
+# property_df.to_html('property_errors.html')
+print(len(error_thing_df))
+print(len(error_property_df))
+
+##########################################
+# what we need now is to understand why the model is making these mispredictions
+# import train data and test data
+# %%
+class Embedder():
+    """Turn the rows of a DataFrame into mean-pooled model embeddings.
+
+    Every row is serialised as
+    ``<DESC>{tag_description}<DESC><UNIT>{unit}<UNIT>`` and the resulting
+    strings are passed to ``Retriever`` (imported from ``utils``).
+    """
+    input_df: pd.DataFrame
+    fold: int  # declared here but never assigned inside this class
+
+    def __init__(self, input_df):
+        self.input_df = input_df
+
+    def make_embedding(self, checkpoint_path):
+        """Embed all rows of ``self.input_df`` using ``checkpoint_path``.
+
+        Calls ``Retriever.make_mean_embedding`` and returns
+        ``retriever.embeddings`` after moving it to the CPU.
+        """
+        # one "<DESC>..<DESC><UNIT>..<UNIT>" string per row, in row order
+        # (a "<NAME>{tag_name}<NAME>" prefix is currently not part of the input)
+        texts = [
+            f"<DESC>{row['tag_description']}<DESC>" + f"<UNIT>{row['unit']}<UNIT>"
+            for _, row in self.input_df.iterrows()
+        ]
+        retriever = Retriever(texts, checkpoint_path)
+        retriever.make_mean_embedding(batch_size=64)
+        return retriever.embeddings.to('cpu')
+
+
+
+# %%
+data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
+train_df = pd.read_csv(data_path, skipinitialspace=True)
+train_df['mapping'] = train_df['thing'] + ' ' + train_df['property']
+
+# remove NAs from train_df
+train_df['tag_description'] = train_df['tag_description'].fillna("NOVALUE")
+train_df['tag_description'] = train_df['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True)
+
+train_df['unit'] = train_df['unit'].fillna("NOVALUE")
+train_df['unit'] = train_df['unit'].replace(r'^\s*$', 'NOVALUE', regex=True)
+
+
+
+checkpoint_directory = "../../train/mapping_t5_complete_desc_unit"
+directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+# Use glob to find matching paths
+# path is usually checkpoint_fold_1/checkpoint-<step number>
+# we are guaranteed to save only 1 checkpoint from training
+pattern = 'checkpoint-*'
+checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+train_embedder = Embedder(input_df=train_df)
+train_embeds = train_embedder.make_embedding(checkpoint_path)
+
+test_embedder = Embedder(input_df=test_df)
+test_embeds = test_embedder.make_embedding(checkpoint_path)
+
+
+
+# %%
+# test embeds are inputs since we are looking back at train data
+cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy()
+
+# %%
+# the following function takes in a full cos_sim_matrix
+# condition_source: boolean selectors of the source embedding
+# condition_target: boolean selectors of the target embedding
+def find_closest(cos_sim_matrix, condition_source, condition_target, top_k=3):
+    """Return the ``top_k`` most similar targets for every selected source row.
+
+    cos_sim_matrix: full 2D similarity matrix (rows = source, columns = target)
+    condition_source: boolean selector over the rows of ``cos_sim_matrix``
+    condition_target: boolean selector over the columns of ``cos_sim_matrix``
+    top_k: number of neighbours kept per row, must be >= 1
+
+    Returns ``(top_k_indices, top_k_values)``: two 2D arrays with one row per
+    selected source row, ordered from most to least similar. The indices are
+    positions inside the column subset chosen by ``condition_target``.
+
+    Raises ValueError when ``top_k`` is smaller than 1.
+    """
+    if top_k < 1:
+        # a "-0:" slice would silently return every column instead of none
+        raise ValueError("top_k must be >= 1")
+
+    # subset rows and columns of the 2D matrix at the same time
+    subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
+
+    # argsort is ascending, so the best matches sit at the end of each row;
+    # reverse EVERY row (previously only row 0 was flipped) to get best-first
+    top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:][:, ::-1]
+
+    # similarity scores belonging to those indices, in the same order
+    top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
+
+    return top_k_indices, top_k_values
+
+# %%
+error_thing_df.index
+
+####################################################
+# special find-back code
+# %%
+def find_back_element_with_print(select_idx):
+    """Print the nearest train rows for a test row and report a mapping hit.
+
+    Looks up the test row labelled ``select_idx``, retrieves its closest train
+    rows through ``find_closest`` and prints both sides. Returns True when the
+    test 'mapping' string occurs inside at least one retrieved train 'mapping'
+    string, otherwise False.
+    """
+    row = test_df[test_df.index == select_idx]
+    description = row['tag_description'].tolist()[0]
+
+    # NOTE(review): this mask selects every test row sharing the description
+    # and only the first selected row ([0]) is used below - confirm intended
+    source_mask = test_df['tag_description'] == description
+    target_mask = np.ones(train_embeds.shape[0], dtype=bool)
+
+    neighbour_idx, _ = find_closest(
+        cos_sim_matrix=cos_sim_matrix,
+        condition_source=source_mask,
+        condition_target=target_mask)
+
+    neighbours = train_df.iloc[neighbour_idx[0]]
+    train_mappings = neighbours['mapping'].to_list()
+    train_descs = neighbours['tag_description'].to_list()
+
+    test_mappings = row['mapping'].to_list()
+    test_descs = row['tag_description'].to_list()
+    predicted = row['p_mapping'].to_list()[0]
+
+    print("*" * 80)
+    print("idx:", select_idx)
+    print("train desc", train_descs)
+    print("train thing+property", train_mappings)
+    print("test desc", test_descs)
+    print("test thing+property", test_mappings)
+    print("predicted thing+property", predicted)
+
+    expected = test_mappings[0]
+    # NOTE(review): `in` on two strings is a substring test, not equality
+    return any([expected in candidate for candidate in train_mappings])
+
+find_back_element_with_print(0)
+
+# %%
+def find_back_element(select_idx):
+    """Silent variant of the find-back check for one test row.
+
+    Returns True when the 'mapping' string of the test row labelled
+    ``select_idx`` occurs inside the 'mapping' of at least one train row
+    retrieved by ``find_closest``, otherwise False.
+    """
+    row = test_df[test_df.index == select_idx]
+    description = row['tag_description'].tolist()[0]
+
+    # NOTE(review): the mask covers all test rows with this description and
+    # only the first selected row ([0]) is used below - confirm intended
+    source_mask = test_df['tag_description'] == description
+    target_mask = np.ones(train_embeds.shape[0], dtype=bool)
+
+    neighbour_idx, _ = find_closest(
+        cos_sim_matrix=cos_sim_matrix,
+        condition_source=source_mask,
+        condition_target=target_mask)
+
+    train_mappings = train_df.iloc[neighbour_idx[0]]['mapping'].to_list()
+    expected = row['mapping'].to_list()[0]
+
+    # substring containment, evaluated for every retrieved train mapping
+    return any([expected in candidate for candidate in train_mappings])
+
+find_back_element(2884)
+
+
+
+# %%
+# for error thing
+pattern_in_train = []
+for select_idx in error_thing_df.index:
+    result = find_back_element_with_print(select_idx)
+    print("status:", result)
+    pattern_in_train.append(result)
+
+
+# %%
+sum(pattern_in_train)/len(pattern_in_train)
+###
+# for error property
+# %%
+pattern_in_train = []
+for select_idx in error_property_df.index:
+    result = find_back_element(select_idx)
+    print("status:", result)
+    pattern_in_train.append(result)
+
+# %%
+sum(pattern_in_train)/len(pattern_in_train)
+
+    
+####################################################
+
+# %%
+# make function to compute similarity of closest retrieved result
+def compute_similarity(select_idx):
+    """Mean similarity between a test row and its retrieved train neighbours.
+
+    Uses ``find_closest`` on the module-level ``cos_sim_matrix`` and averages
+    the scores of the first selected source row.
+    """
+    description = test_df[test_df.index == select_idx]['tag_description'].tolist()[0]
+    source_mask = test_df['tag_description'] == description
+    # every train row is a candidate target
+    target_mask = np.ones(train_embeds.shape[0], dtype=bool)
+
+    _, neighbour_scores = find_closest(
+        cos_sim_matrix=cos_sim_matrix,
+        condition_source=source_mask,
+        condition_target=target_mask)
+
+    return np.mean(neighbour_scores[0])
+
+# %%
+def print_summary(similarity_scores):
+    """Print the mean and the 25th/50th/75th percentiles of the scores."""
+    scores = np.array(similarity_scores)
+
+    # compute both statistics before anything is printed
+    average, quartiles = np.mean(scores), np.percentile(scores, [25, 50, 75])
+
+    print("Mean:", average)
+    print("25th, 50th, 75th Percentiles:", quartiles)
+
+
+# %%
+##########################################
+# Analyze the degree of similarity differences between correct and incorrect results
+
+# %%
+# compute similarity scores for all values in error_thing_df
+similarity_thing_scores = []
+for idx in error_thing_df.index:
+    similarity_thing_scores.append(compute_similarity(idx))
+print_summary(similarity_thing_scores)
+
+
+# %%
+similarity_property_scores = []
+for idx in error_property_df.index:
+    similarity_property_scores.append(compute_similarity(idx))
+print_summary(similarity_property_scores)
+
+# %%
+similarity_correct_scores = []
+for idx in correct_df.index:
+    similarity_correct_scores.append(compute_similarity(idx))
+print_summary(similarity_correct_scores)
+
+
+
+# %%
+import matplotlib.pyplot as plt
+
+# Sample data
+list1 = similarity_thing_scores
+list2 = similarity_property_scores
+list3 = similarity_correct_scores
+
+# Plot histograms
+bins = 50
+plt.hist(list1, bins=bins, alpha=0.5, label='List 1', density=True)
+plt.hist(list2, bins=bins, alpha=0.5, label='List 2', density=True)
+plt.hist(list3, bins=bins, alpha=0.5, label='List 3', density=True)
+
+# Labels and legend
+plt.xlabel('Value')
+plt.ylabel('Frequency')
+plt.legend(loc='upper right')
+plt.title('Histograms of Three Lists')
+
+# Show plot
+plt.show()
+
+###########################################
+# %%
+# why do similarities of 97% still map correctly?
+score_array = np.array(similarity_correct_scores)
+# %%
+sum(score_array < 0.95)
+# %%
+correct_df[score_array < 0.95]['tag_description'].index.to_list()
+# %%
--- a/data_preprocess/abbreviations/abbreviations_replacer.py
+++ b/data_preprocess/abbreviations/abbreviations_replacer.py
@ -5,7 +5,7 @@ Modified by: Richard Wong
 # %%
 import re
 import pandas as pd
-from replacement_dict import desc_replacement_dict, unit_replacement_dict
+from replacement_dict_new import desc_replacement_dict, unit_replacement_dict

 # %%
 def count_abbreviation_occurrences(tag_descriptions, abbreviation):
@ -48,20 +48,23 @@ df = pd.read_csv(file_path)
 # %%
 # Replace abbreviations
 print("running substitution for descriptions")
-df['tag_description']= df['tag_description'].fillna("NOVALUE")
+# strip leading and trailing whitespace
+df['tag_description'] = df['tag_description'].str.strip()
+# normalize to uppercase before substitution: the replacement patterns are written in upper case
+df['tag_description'] = df['tag_description'].str.upper()
 # Replace whitespace-only entries with "NOVALUE"
 # note that "N/A" can be read as nan
 # replace whitespace only values as NOVALUE
+df['tag_description']= df['tag_description'].fillna("NOVALUE")
 df['tag_description'] = df['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True)
+
+# perform actual substitution
 tag_descriptions = df['tag_description']
 replaced_descriptions = replace_abbreviations(tag_descriptions, desc_replacement_dict)
 replaced_descriptions = cleanup_spaces(replaced_descriptions)
 replaced_descriptions = cleanup_dots(replaced_descriptions)
 df["tag_description"] = replaced_descriptions
 # print("Descriptions after replacement:", replaced_descriptions)
-# strip trailing whitespace
-df['tag_description'] = df['tag_description'].str.rstrip()
-df['tag_description'] = df['tag_description'].str.upper()

 # %%
 print("running substitutions for units")
--- a/data_preprocess/abbreviations/replacement_dict.py
+++ b/data_preprocess/abbreviations/replacement_dict.py
@ -70,7 +70,8 @@ desc_replacement_dict = {
    r'\bD/G\b': 'GENERATOR_ENGINE',
    r'\bGEN\.\b': 'GENERATOR_ENGINE',
    r'\bGENERATOR ENGINE\b': 'GENERATOR_ENGINE',
-    r'\b(\d+)MGE\b': r'NO\1 GENERATOR_ENGINE',
+    # MGE?
+    r'\b(\d+)MGE\b': r'NO\1 MAIN_GENERATOR_ENGINE',
    r'\bGEN\.WIND\.TEMP\b': 'GENERATOR WINDING TEMPERATURE',
    r'\bENGINE ROOM\b': 'ENGINE ROOM',
    r'\bE/R\b': 'ENGINE ROOM',
@ -213,4 +214,4 @@ unit_replacement_dict = {
    r'\b°C\b': 'TEMPERATURE',
    r'\bºC\b': 'TEMPERATURE',
    r'\b℃\b': 'TEMPERATURE'
-}
+}
--- a/data_preprocess/abbreviations/replacement_dict_new.py
+++ b/data_preprocess/abbreviations/replacement_dict_new.py
@ -0,0 +1,291 @@
+# substitution mapping for descriptions
+# Abbreviations and their replacements
+desc_replacement_dict = {
+    r'\bLIST\b': 'LIST',
+    # exhaust gas
+    r'\bE\. GAS\b': 'EXHAUST GAS',
+    r'\bEXH\.\b': 'EXHAUST',
+    r'\bEXH\b': 'EXHAUST',
+    r'\bEXHAUST\.\b': 'EXHAUST',
+    r'\bEXHAUST\b': 'EXHAUST',
+    r'\bBLR\.EXH\.\b': 'BOILER EXHAUST',
+    # temperature
+    r'\bTEMP\.\b': 'TEMPERATURE',
+    r'\bTEMP\b': 'TEMPERATURE',
+    r'\bTEMPERATURE\.\b': 'TEMPERATURE',
+    r'\bTEMPERATURE\b': 'TEMPERATURE',
+    # cylinder
+    r'\bCYL(\d+)\b': r'CYLINDER\1',
+    r'\bCYL\.(\d+)\b': r'CYLINDER\1',
+    r'\bCYL(?=\d|\W|$)\b': 'CYLINDER',
+    r'\bCYL\.\b': 'CYLINDER',
+    r'\bCYL\b': 'CYLINDER',
+    # cooling
+    r'\bCOOL\.\b': 'COOLING',
+    r'\bCOOLING\b': 'COOLING',
+    r'\bCOOLER\b': 'COOLER',
+    r'\bCW\b': 'COOLING WATER',
+    r'\bC\.W\b': 'COOLING WATER',
+    r'\bJ\.C\.F\.W\b': 'JACKET COOLING FEED WATER',
+    r'\bJ\.C F\.W\b': 'JACKET COOLING FEED WATER',
+    r'\bJACKET C\.F\.W\b': 'JACKET COOLING FEED WATER',
+    r'\bCOOL\. F\.W\b': 'COOLING FEED WATER',
+    r'\bC\.F\.W\b': 'COOLING FEED WATER',
+    # sea water
+    r'\bC\.S\.W\b': 'COOLING SEA WATER',
+    r'\bCSW\b': 'COOLING SEA WATER',
+    r'\bC.S.W\b': 'COOLING SEA WATER',
+    # water
+    r'\bFEED W\.\b': 'FEED WATER',
+    r'\bFEED W\b': 'FEED WATER',
+    r'\bF\.W\b': 'FEED WATER',
+    r'\bF\.W\.\b': 'FEED WATER',
+    r'\bFW\b': 'FEED WATER',
+    # r'\bWATER\b': 'WATER',
+    r'\bSCAV\.\b': 'SCAVENGE',
+    r'\bSCAV\b': 'SCAVENGE',
+    r'\bINL\.\b': 'INLET',
+    r'\bINLET\b': 'INLET',
+    r'\bOUT\.\b': 'OUTLET',
+    r'\bOUTL\.\b': 'OUTLET',
+    r'\bOUTLET\b': 'OUTLET',
+    # tank
+    r'\bSTOR\.TK\b': 'STORAGE TANK',
+    r'\bSTOR\. TK\b': 'STORAGE TANK',
+    r'\bSERV\. TK\b': 'SERVICE TANK',
+    r'\bSETT\. TK\b': 'SETTLING TANK',
+    r'\bBK\b': 'BUNKER',
+    r'\bTK\b': 'TANK',
+    # PRESSURE
+    r'\bPRESS\b': 'PRESSURE',
+    r'\bPRESS\.\b': 'PRESSURE',
+    r'\bPRESSURE\b': 'PRESSURE',
+    r'PRS\b': 'PRESSURE',  # this is a special replacement - it is safe to replace PRS w/o checks
+    # ENGINE
+    r'\bENG\.\b': 'ENGINE',
+    r'\bENG\b': 'ENGINE',
+    r'\bENGINE\b': 'ENGINE',
+    r'\bENGINE SPEED\b': 'ENGINE SPEED',
+    r'\bENGINE RUNNING\b': 'ENGINE RUNNING',
+    r'\bENGINE RPM PICKUP\b': 'ENGINE RPM PICKUP',
+    r'\bENGINE ROOM\b': 'ENGINE ROOM',
+    r'\bE/R\b': 'ENGINE ROOM',
+    # MAIN ENGINE
+    r'\bM/E NO.(\d+)\b': r'NO\1 MAIN_ENGINE',
+    r'\bM/E NO(\d+)\b': r'NO\1 MAIN_ENGINE',
+    r'\bM/E  NO.(\d+)\b': r'NO\1 MAIN_ENGINE',
+    r'\bME NO.(\d+)\b': r'NO\1 MAIN_ENGINE',
+    r'\bM/E\b': 'MAIN_ENGINE',
+    r'\bM/E(.)\b': r'MAIN_ENGINE \1', # M/E(S/P)
+    r'\bME(.)\b': r'MAIN_ENGINE \1', # ME(S/P)
+    r'\bM_E\b': 'MAIN_ENGINE',
+    r'\bME(?=\d|\W|$)\b': 'MAIN_ENGINE',
+    r'\bMAIN ENGINE\b': 'MAIN_ENGINE',
+    # ENGINE variants
+    r'\bM_E_RPM\b': 'MAIN ENGINE RPM',
+    r'\bM/E_M\.G\.O\.\b': 'MAIN ENGINE MARINE GAS OIL',
+    r'\bM/E_H\.F\.O\.\b': 'MAIN ENGINE HEAVY FUEL OIL',
+    # GENERATOR ENGINE
+    r'\bGEN(\d+)\b': r'NO\1 GENERATOR_ENGINE',
+    r'\bGE(\d+)\b': r'NO\1 GENERATOR_ENGINE',
+    # ensure that we substitute only for terms where following GE is num or special
+    r'\bGE(?=\d|\W|$)\b': 'GENERATOR_ENGINE',
+    r'\bG/E(\d+)\b': r'NO\1 GENERATOR_ENGINE',
+    r'\bG/E\b': r'GENERATOR_ENGINE',
+    r'\bG_E(\d+)\b': r'NO\1 GENERATOR_ENGINE',
+    r'\bG_E\b': 'GENERATOR_ENGINE',
+    r'\bGENERATOR ENGINE\b': 'GENERATOR_ENGINE',
+    r'\bG/E_M\.G\.O\b': 'GENERATOR_ENGINE MARINE GAS OIL',
+    # DG
+    r'\bDG(\d+)\b': r'NO\1 GENERATOR_ENGINE',
+    r'\bDG\b': 'GENERATOR_ENGINE',
+    r'\bD/G\b': 'GENERATOR_ENGINE',
+    r'\bDG(\d+)\((.)\)\b': r'NO\1\2 GENERATOR_ENGINE', # handle DG2(A)
+    r'\bDG(\d+[A-Za-z])\b': r'NO\1 GENERATOR_ENGINE', # handle DG2A
+    # DG variants
+    r'\bDG_CURRENT\b': 'GENERATOR_ENGINE CURRENT',
+    r'\bDG_LOAD\b': 'GENERATOR_ENGINE LOAD',
+    r'\bDG_FREQUENCY\b': 'GENERATOR_ENGINE FREQUENCY',
+    r'\bDG_VOLTAGE\b': 'GENERATOR_ENGINE VOLTAGE',
+    r'\bDG_CLOSED\b': 'GENERATOR_ENGINE CLOSED',
+    r'\bD/G_CURRENT\b': 'GENERATOR_ENGINE CURRENT',
+    r'\bD/G_LOAD\b': 'GENERATOR_ENGINE LOAD',
+    r'\bD/G_FREQUENCY\b': 'GENERATOR_ENGINE FREQUENCY',
+    r'\bD/G_VOLTAGE\b': 'GENERATOR_ENGINE VOLTAGE',
+    r'\bD/G_CLOSED\b': 'GENERATOR_ENGINE CLOSED',
+    # MGE
+    r'\b(\d+)MGE\b': r'NO\1 MAIN_GENERATOR_ENGINE',
+    # generator engine and mgo
+    r'\bG/E_M\.G\.O\.\b': r'GENERATOR_ENGINE MARINE GAS OIL',
+    r'\bG/E_H\.F\.O\.\b': r'GENERATOR_ENGINE HEAVY FUEL OIL',
+    # ultra low sulfur fuel oil
+    r'\bU\.L\.S\.F\.O\b': 'ULTRA LOW SULFUR FUEL OIL',
+    r'\bULSFO\b': 'ULTRA LOW SULFUR FUEL OIL',
+    # marine gas oil
+    r'\bM\.G\.O\b': 'MARINE GAS OIL',
+    r'\bMGO\b': 'MARINE GAS OIL',
+    r'\bMDO\b': 'MARINE DIESEL OIL',
+    # light fuel oil
+    r'\bL\.F\.O\b': 'LIGHT FUEL OIL',
+    r'\bLFO\b': 'LIGHT FUEL OIL',
+    # heavy fuel oil
+    r'\bHFO\b': 'HEAVY FUEL OIL',
+    r'\bH\.F\.O\b': 'HEAVY FUEL OIL',
+    # piston cooling oil
+    r'\bPCO\b': 'PISTON COOLING OIL',
+    r'\bP\.C\.O\.\b': 'PISTON COOLING OIL',
+    r'\bP\.C\.O\b': 'PISTON COOLING OIL',
+    r'PISTION C.O': 'PISTON COOLING OIL',
+    # diesel oil
+    r'\bD.O\b': 'DIESEL OIL',
+    # for remaining fuel oil that couldn't be substituted
+    r'\bF\.O\b': 'FUEL OIL',
+    r'\bFO\b': 'FUEL OIL',
+    # lubricant
+    r'\bLUB\.\b': 'LUBRICANT',
+    r'\bLUBE\b': 'LUBRICANT',
+    r'\bLUBR\.\b': 'LUBRICANT',
+    r'\bLUBRICATING\.\b': 'LUBRICANT',
+    r'\bLUBRICATION\.\b': 'LUBRICANT',
+    # lubricating oil
+    r'\bL\.O\b': 'LUBRICATING OIL',
+    r'\bLO\b': 'LUBRICATING OIL',
+    # lubricating oil pressure
+    r'\bLO_PRESS\b': 'LUBRICATING OIL PRESSURE',
+    r'\bLO_PRESSURE\b': 'LUBRICATING OIL PRESSURE',
+    # temperature
+    r'\bL\.T\b': 'LOW TEMPERATURE',
+    r'\bLT\b': 'LOW TEMPERATURE',
+    r'\bH\.T\b': 'HIGH TEMPERATURE',
+    r'\bHT\b': 'HIGH TEMPERATURE',
+    # BOILER
+    # auxiliary boiler
+    # replace these first before replacing AUXILIARY only
+    r'\bAUX\.BOILER\b': 'AUXILIARY BOILER',
+    r'\bAUX\. BOILER\b': 'AUXILIARY BOILER',
+    r'\bAUX BLR\b': 'AUXILIARY BOILER',
+    r'\bAUX\.\b': 'AUXILIARY',
+    r'\bAUX\b': 'AUXILIARY',
+    # composite boiler
+    r'\bCOMP\. BOILER\b': 'COMPOSITE BOILER',
+    r'\bCOMP\.BOILER\b': 'COMPOSITE BOILER',
+    r'\bCOMP BOILER\b': 'COMPOSITE BOILER',
+    r'\bCOMP\b': 'COMPOSITE',
+    r'\bCMPS\b': 'COMPOSITE',
+    # any other boiler
+    r'\bBLR\.\b': 'BOILER',
+    r'\bBLR\b': 'BOILER',
+    r'\bBOILER W.CIRC.P/P\b': 'BOILER WATER CIRC P/P',
+    # winding
+    r'\bWIND\.\b': 'WINDING',
+    r'\bWINDING\b': 'WINDING',
+    # VOLTAGE/FREQ/CURRENT
+    r'\bVLOT\.': 'VOLTAGE', # correct spelling
+    r'\bVOLT\.': 'VOLTAGE',
+    r'\bVOLTAGE\b': 'VOLTAGE',
+    r'\bFREQ\.': 'FREQUENCY',
+    r'\bFREQUENCY\b': 'FREQUENCY',
+    r'\bCURR\.': 'CURRENT',
+    r'\bCURRENT\b': 'CURRENT',
+    # TURBOCHARGER
+    r'\bTCA\b': 'TURBOCHARGER',
+    r'\bTCB\b': 'TURBOCHARGER',
+    r'\bT/C\b': 'TURBOCHARGER',
+    r'\bT_C\b': 'TURBOCHARGER',
+    r'\bT/C_RPM\b': 'TURBOCHARGER RPM',
+    r'\bTC(\d+)\b': r'TURBOCHARGER\1',
+    r'\bT/C(\d+)\b': r'TURBOCHARGER\1',
+    r'\bTC(?=\d|\W|$)\b': 'TURBOCHARGER',
+    r'\bTURBOCHAGER\b': 'TURBOCHARGER',
+    r'\bTURBOCHARGER\b': 'TURBOCHARGER',
+    r'\bTURBOCHG\b': 'TURBOCHARGER',
+    # misc spelling errors
+    r'\bOPERATOIN\b': 'OPERATION',
+    # wrongly attached terms
+    r'\bBOILERMGO\b': 'BOILER MGO',
+    # additional standardizing replacement
+    # replace # followed by a number with NO
+    r'#(?=\d)\b': 'NO',
+    r'\bNO\.(?=\d)\b': 'NO',
+    r'\bNO\.\.(?=\d)\b': 'NO',
+    # others:
+    # generator
+    r'\bGEN\.\b': 'GENERATOR',
+    # others
+    r'\bGEN\.WIND\.TEMP\b': 'GENERATOR WINDING TEMPERATURE',
+    r'\bFLTR\b': 'FILTER',
+    r'\bCLR\b': 'CLEAR',
+}
+
+# substitution mapping for units
+# Abbreviations and their replacements
+unit_replacement_dict = {
+    r'\b%\b': 'PERCENT',
+    r'\b-\b': '',
+    r'\b-  \b': '',
+    # ensure no character after A
+    r'\bA(?!\w|/)': 'CURRENT',
+    r'\bAmp(?!\w|/)': 'CURRENT',
+    r'\bHz\b': 'HERTZ',
+    r'\bKG/CM2\b': 'PRESSURE',
+    r'\bKG/H\b': 'KILOGRAM PER HOUR',
+    r'\bKNm\b': 'RPM',
+    r'\bKW\b': 'POWER',
+    r'\bKg(?!\w|/)': 'MASS',
+    r'\bKw\b': 'POWER',
+    r'\bL(?!\w|/)': 'VOLUME',
+    r'\bMT/h\b': 'METRIC TONNES PER HOUR',
+    r'\bMpa\b': 'PRESSURE',
+    r'\bPF\b': 'POWER FACTOR',
+    r'\bRPM\b': 'RPM',
+    r'\bV(?!\w|/)': 'VOLTAGE',
+    r'\bbar(?!\w|/)': 'PRESSURE',
+    r'\bbarA\b': 'SCAVENGE PRESSURE',
+    r'\bcST\b': 'VISCOSITY',
+    r'\bcSt\b': 'VISCOSITY',
+    r'\bcst\b': 'VISCOSITY',
+    r'\bdeg(?!\w|/|\.)': 'DEGREE',
+    r'\bdeg.C\b': 'TEMPERATURE',
+    r'\bdegC\b': 'TEMPERATURE',
+    r'\bdegree\b': 'DEGREE',
+    r'\bdegreeC\b': 'TEMPERATURE',
+    r'\bhPa\b': 'PRESSURE',
+    r'\bhours\b': 'HOURS',
+    r'\bkN\b': 'THRUST',
+    r'\bkNm\b': 'TORQUE',
+    r'\bkW\b': 'POWER',
+    # ensure that kg is not followed by anything
+    r'\bkg(?!\w|/)': 'FLOW', # somehow in the data its flow
+    r'\bkg/P\b': 'MASS FLOW',
+    r'\bkg/cm2\b': 'PRESSURE',
+    r'\bkg/cm²\b': 'PRESSURE',
+    r'\bkg/h\b': 'MASS FLOW',
+    r'\bkg/hr\b': 'MASS FLOW',
+    r'\bkg/pulse\b': '',
+    r'\bkgf/cm2\b': 'PRESSURE',
+    r'\bkgf/cm²\b': 'PRESSURE',
+    r'\bkgf/㎠\b': 'PRESSURE',
+    r'\bknots\b': 'SPEED',
+    r'\bkw\b': 'POWER',
+    r'\bl/Hr\b': 'VOLUME FLOW',
+    r'\bl/h\b': 'VOLUME FLOW',
+    r'\bl_Hr\b': 'VOLUME FLOW',
+    r'\bl_hr\b': 'VOLUME FLOW',
+    r'\bM\b': 'DRAFT', # for wind draft
+    r'm': 'm', # wind draft and trim - not useful
+    r'\bm/s\b': 'SPEED',
+    r'\bm3\b': 'VOLUME',
+    r'\bmH2O\b': 'DRAFT',
+    r'\bmWC\b': 'DRAFT',
+    r'\bmbar\b': 'PRESSURE',
+    r'\bmg\b': 'ACCELERATION',
+    r'\bmin-¹\b': '', # data too varied
+    r'\bmm\b': '', # data too varied
+    r'\bmmH2O\b': 'WATER DRUM LEVEL',
+    r'\brev\b': 'RPM',
+    r'\brpm\b': 'RPM',
+    r'\bx1000min-¹\b': '',
+    r'\b°C\b': 'TEMPERATURE',
+    r'\bºC\b': 'TEMPERATURE',
+    r'\b℃\b': 'TEMPERATURE'
+}
--- a/data_preprocess/check_data/.gitignore
+++ b/data_preprocess/check_data/.gitignore
@ -0,0 +1 @@
+*.csv
--- a/data_preprocess/check_data/check.py
+++ b/data_preprocess/check_data/check.py
@ -53,6 +53,17 @@ with open('output.txt', 'w') as file:


 # %%
-test = 'kg/cm3'
-print(re.sub(r'kg(?!\w|/)', 'flow', test))
+test = 'M/E(S) something'
+print(re.sub(r'\bM/E(.)', r'MAIN_ENGINE \1', test))
 # %%
+test = 'NO.345A ENGINE'
+print(re.sub(r'\bNO\.(?=\d)\b', r'NO', test))
+
+
+
+# %%
+test = 'S/G VLOT.'
+print(re.sub(r'VLOT\.', 'VOLT', test))
+# %%
+description = 'NO3 GENERATOR WINDING TEMPERATURE(T)'
+re.sub(r'\s+', ' ', description)
--- a/data_preprocess/check_data/desc.csv
+++ b/data_preprocess/check_data/desc.csv
--- a/post_process/binary_classifier/classification_prediction/output.txt
+++ b/post_process/binary_classifier/classification_prediction/output.txt
@ -1,31 +1,31 @@

 ********************************************************************************
 Fold: 1
-Accuracy: 0.95342
-F1 Score: 0.91344
-Precision: 0.91643
-Recall: 0.91052
+Accuracy: 0.95174
+F1 Score: 0.90912
+Precision: 0.91788
+Recall: 0.90092
 ********************************************************************************
 Fold: 2
-Accuracy: 0.95402
-F1 Score: 0.92950
-Precision: 0.92122
-Recall: 0.93848
+Accuracy: 0.95159
+F1 Score: 0.92593
+Precision: 0.91697
+Recall: 0.93574
 ********************************************************************************
 Fold: 3
-Accuracy: 0.95200
-F1 Score: 0.92726
-Precision: 0.91825
-Recall: 0.93712
+Accuracy: 0.95373
+F1 Score: 0.93021
+Precision: 0.91935
+Recall: 0.94233
 ********************************************************************************
 Fold: 4
-Accuracy: 0.96473
-F1 Score: 0.92708
-Precision: 0.91566
-Recall: 0.93950
+Accuracy: 0.96524
+F1 Score: 0.92902
+Precision: 0.91306
+Recall: 0.94702
 ********************************************************************************
 Fold: 5
-Accuracy: 0.95605
-F1 Score: 0.92244
-Precision: 0.91755
-Recall: 0.92754
+Accuracy: 0.95643
+F1 Score: 0.92319
+Precision: 0.91793
+Recall: 0.92869
--- a/post_process/binary_classifier/classification_prediction/predict.py
+++ b/post_process/binary_classifier/classification_prediction/predict.py
@ -98,7 +98,7 @@ def test(fold):

    # %%

-    max_length = 64
+    max_length = 128

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
--- a/post_process/binary_classifier/train.py
+++ b/post_process/binary_classifier/train.py
@ -74,6 +74,15 @@ def create_split_dataset(fold):
    full_df = pd.read_csv(data_path, skipinitialspace=True)
    train_df = full_df[~full_df['ships_idx'].isin(ships_list)]

+    # sanity check: the train and test splits must not share any ship
+    train_ships_list = sorted(set(train_df['ships_idx']))
+    train_ships_set = set(train_ships_list)
+    test_ships_set = set(ships_list)
+
+    # a ship present in both splits would leak test data into training
+    assert train_ships_set.isdisjoint(test_ships_set)
+
+
    # valid
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
    validation_df = pd.read_csv(data_path, skipinitialspace=True)
--- a/post_process/similarity_classifier/output.txt
+++ b/post_process/similarity_classifier/output.txt
@ -0,0 +1,31 @@
+
+Fold: 1
+Best threshold: 0.9775
+Accuracy: 0.92512
+F1 Score: 0.76313
+Precision: 0.78069
+Recall: 0.74633
+Fold: 2
+Best threshold: 0.9775
+Accuracy: 0.92054
+F1 Score: 0.81117
+Precision: 0.77150
+Recall: 0.85514
+Fold: 3
+Best threshold: 0.985
+Accuracy: 0.93201
+F1 Score: 0.83578
+Precision: 0.81657
+Recall: 0.85592
+Fold: 4
+Best threshold: 0.9924999999999999
+Accuracy: 0.95334
+F1 Score: 0.82722
+Precision: 0.83341
+Recall: 0.82112
+Fold: 5
+Best threshold: 0.9924999999999999
+Accuracy: 0.92968
+F1 Score: 0.77680
+Precision: 0.83395
+Recall: 0.72698
--- a/post_process/similarity_classifier/run.py
+++ b/post_process/similarity_classifier/run.py
@ -50,7 +50,8 @@ class Embedder():
            for _, row in df.iterrows():
                desc = f"<DESC>{row['tag_description']}<DESC>"
                unit = f"<UNIT>{row['unit']}<UNIT>"
-                element = f"{desc}{unit}"
+                name = f"<NAME>{row['tag_name']}<NAME>"  # close the tag like <DESC>/<UNIT>
+                element = f"{name}{desc}{unit}"
                input_list.append(element)
            return input_list

@ -64,7 +65,7 @@ class Embedder():


 def run_similarity_classifier(fold):
-    data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
+    data_path = f'../../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv'
    test_df = pd.read_csv(data_path, skipinitialspace=True)


@ -72,7 +73,7 @@ def run_similarity_classifier(fold):
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)

-    checkpoint_directory = "../../train/classification_bert_complete_desc_unit"
+    checkpoint_directory = "../../train/classification_bert_complete_desc_unit_name"
    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
@ -109,26 +110,54 @@ def run_similarity_classifier(fold):
        sim_list.append(top_sim_value)

    # analysis 1: using threshold to perform find-back prediction success
-    threshold = 0.90
-    predict_list = [ elem > threshold for elem in sim_list ]
+    # grid-search the similarity threshold that maximises F1 for this fold
+    # 21 evenly spaced candidates (20 intervals of 0.0075) over [0.85, 1.00]
+    # NOTE(review): the threshold is picked against test_df's own labels, so
+    # the metrics reported for it are optimistic - confirm this is intended
+    threshold_values = np.linspace(0.85, 1.00, 21)
+    # ground truth does not depend on the threshold: build it once
+    y_true = test_df['MDM'].to_list()
+    best_threshold = 0
+    best_f1 = 0
+    for threshold in threshold_values:
+        y_pred = [ elem > threshold for elem in sim_list ]
+
+        # only F1 drives the selection; zero_division=0 keeps the score at 0
+        # (without a warning) when no element exceeds the threshold
+        f1 = f1_score(y_true, y_pred, zero_division=0)
+
+        if f1 > best_f1:
+            best_threshold = threshold
+            best_f1 = f1

+    # compute metrics again with best threshold
+    predict_list = [ elem > best_threshold for elem in sim_list ]
    y_true = test_df['MDM'].to_list()
    y_pred = predict_list
-
    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

-    # Print the results
-    print(f'Accuracy: {accuracy:.5f}')
-    print(f'F1 Score: {f1:.5f}')
-    print(f'Precision: {precision:.5f}')
-    print(f'Recall: {recall:.5f}')
+
+
+    with open("output.txt", "a") as f:
+
+        print(f'Fold: {fold}', file=f)
+        print(f'Best threshold: {best_threshold}', file=f)
+        # Print the results
+        print(f'Accuracy: {accuracy:.5f}', file=f)
+        print(f'F1 Score: {f1:.5f}', file=f)
+        print(f'Precision: {precision:.5f}', file=f)
+        print(f'Recall: {recall:.5f}', file=f)
+


 # %%
+# reset file before writing to it
+with open("output.txt", "w") as f:
+    print('', file=f)

 for fold in [1,2,3,4,5]:
+    print(fold)
    run_similarity_classifier(fold)
--- a/train/classification_bert_complete_desc/classification_prediction/output.txt
+++ b/train/classification_bert_complete_desc/classification_prediction/output.txt
@ -1,31 +1,31 @@

 ********************************************************************************
 Fold: 1
-Accuracy: 0.76337
-F1 Score: 0.37980
-Precision: 0.36508
-Recall: 0.41523
+Accuracy: 0.78277
+F1 Score: 0.73629
+Precision: 0.71419
+Recall: 0.78277
 ********************************************************************************
 Fold: 2
-Accuracy: 0.77430
-F1 Score: 0.40473
-Precision: 0.39528
-Recall: 0.43303
+Accuracy: 0.78598
+F1 Score: 0.73708
+Precision: 0.71578
+Recall: 0.78598
 ********************************************************************************
 Fold: 3
-Accuracy: 0.77259
-F1 Score: 0.39538
-Precision: 0.37761
-Recall: 0.43633
+Accuracy: 0.79819
+F1 Score: 0.74411
+Precision: 0.71749
+Recall: 0.79819
 ********************************************************************************
 Fold: 4
-Accuracy: 0.77545
-F1 Score: 0.39792
-Precision: 0.38636
-Recall: 0.43003
+Accuracy: 0.79543
+F1 Score: 0.73902
+Precision: 0.71094
+Recall: 0.79543
 ********************************************************************************
 Fold: 5
-Accuracy: 0.74897
-F1 Score: 0.38827
-Precision: 0.37680
-Recall: 0.42382
+Accuracy: 0.77279
+F1 Score: 0.72098
+Precision: 0.69817
+Recall: 0.77279
--- a/train/classification_bert_complete_desc/classification_prediction/predict.py
+++ b/train/classification_bert_complete_desc/classification_prediction/predict.py
@ -27,6 +27,9 @@ from tqdm import tqdm

 torch.set_float32_matmul_precision('high')

+
+BATCH_SIZE = 256
+
 # %%

 # we need to create the mdm_list
@ -185,7 +188,6 @@ def test(fold):
    actual_labels = []


-    BATCH_SIZE = 64
    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
            # Inference in batches
@ -217,9 +219,11 @@ def test(fold):

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
-    f1 = f1_score(y_true, y_pred, average='macro')
-    precision = precision_score(y_true, y_pred, average='macro')
-    recall = recall_score(y_true, y_pred, average='macro')
+    average_parameter = 'weighted'
+    zero_division_parameter = 0
+    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)

    with open("output.txt", "a") as f:

--- a/train/classification_bert_complete_desc/train.py
+++ b/train/classification_bert_complete_desc/train.py
@ -57,7 +57,7 @@ for idx, val in enumerate(mdm_list):
 def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
-        desc = f"{row['tag_description']}"
+        desc = f"<DESC>{row['tag_description']}<DESC>"
        pattern = f"{row['thing'] + row['property']}"
        try:
            index = mdm_list.index(pattern)
@ -100,7 +100,7 @@ def train(fold):
    # prepare tokenizer

    # model_checkpoint = "distilbert/distilbert-base-uncased"
-    model_checkpoint = 'google-bert/bert-base-uncased'
+    model_checkpoint = 'google-bert/bert-base-cased'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
@ -177,8 +177,8 @@ def train(fold):
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-5,
-        per_device_train_batch_size=64,
-        per_device_eval_batch_size=64,
+        per_device_train_batch_size=128,
+        per_device_eval_batch_size=128,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
--- a/train/classification_bert_complete_desc_unit/classification_prediction/output.txt
+++ b/train/classification_bert_complete_desc_unit/classification_prediction/output.txt
@ -1,31 +1,31 @@

 ********************************************************************************
 Fold: 1
-Accuracy: 0.77946
-F1 Score: 0.40686
-Precision: 0.39833
-Recall: 0.43814
+Accuracy: 0.78940
+F1 Score: 0.73284
+Precision: 0.70389
+Recall: 0.78940
 ********************************************************************************
 Fold: 2
-Accuracy: 0.78271
-F1 Score: 0.42730
-Precision: 0.42002
-Recall: 0.45670
+Accuracy: 0.78411
+F1 Score: 0.73695
+Precision: 0.71914
+Recall: 0.78411
 ********************************************************************************
 Fold: 3
-Accuracy: 0.78715
-F1 Score: 0.41108
-Precision: 0.39829
-Recall: 0.44992
+Accuracy: 0.80522
+F1 Score: 0.75406
+Precision: 0.72847
+Recall: 0.80522
 ********************************************************************************
 Fold: 4
-Accuracy: 0.79115
-F1 Score: 0.41810
-Precision: 0.40095
-Recall: 0.45760
+Accuracy: 0.80780
+F1 Score: 0.75361
+Precision: 0.72432
+Recall: 0.80780
 ********************************************************************************
 Fold: 5
-Accuracy: 0.76271
-F1 Score: 0.41752
-Precision: 0.41156
-Recall: 0.44899
+Accuracy: 0.76958
+F1 Score: 0.71912
+Precision: 0.69965
+Recall: 0.76958
--- a/train/classification_bert_complete_desc_unit/classification_prediction/predict.py
+++ b/train/classification_bert_complete_desc_unit/classification_prediction/predict.py
@ -27,6 +27,9 @@ from tqdm import tqdm

 torch.set_float32_matmul_precision('high')

+
+BATCH_SIZE = 128
+
 # %%

 # we need to create the mdm_list
@ -123,7 +126,7 @@ def test(fold):

    # %%

-    max_length = 64
+    max_length = 128

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
@ -185,7 +188,6 @@ def test(fold):
    actual_labels = []


-    BATCH_SIZE = 64
    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
            # Inference in batches
@ -217,9 +219,13 @@ def test(fold):

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
-    f1 = f1_score(y_true, y_pred, average='macro')
-    precision = precision_score(y_true, y_pred, average='macro')
-    recall = recall_score(y_true, y_pred, average='macro')
+    average_parameter = 'weighted'
+    zero_division_parameter = 0
+    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+
+

    with open("output.txt", "a") as f:

--- a/train/classification_bert_complete_desc_unit/train.py
+++ b/train/classification_bert_complete_desc_unit/train.py
@ -57,9 +57,9 @@ for idx, val in enumerate(mdm_list):
 def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
-        desc = f"{row['tag_description']}"
-        pattern = f"{row['thing'] + row['property']}"
+        desc = f"<DESC>{row['tag_description']}<DESC>"
        unit = f"<UNIT>{row['unit']}<UNIT>"
+        pattern = f"{row['thing'] + row['property']}"
        try:
            index = mdm_list.index(pattern)
        except ValueError:
@ -101,7 +101,7 @@ def train(fold):
    # prepare tokenizer

    # model_checkpoint = "distilbert/distilbert-base-uncased"
-    model_checkpoint = 'google-bert/bert-base-uncased'
+    model_checkpoint = 'google-bert/bert-base-cased'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
@ -178,8 +178,8 @@ def train(fold):
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-5,
-        per_device_train_batch_size=64,
-        per_device_eval_batch_size=64,
+        per_device_train_batch_size=128,
+        per_device_eval_batch_size=128,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
--- a/train/classification_bert_complete_desc_unit_name/.gitignore
+++ b/train/classification_bert_complete_desc_unit_name/.gitignore
@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log
--- a/train/classification_bert_complete_desc_unit_name/classification_prediction/output.txt
+++ b/train/classification_bert_complete_desc_unit_name/classification_prediction/output.txt
@ -0,0 +1,31 @@
+
+********************************************************************************
+Fold: 1
+Accuracy: 0.68859
+F1 Score: 0.62592
+Precision: 0.60775
+Recall: 0.68859
+********************************************************************************
+Fold: 2
+Accuracy: 0.72150
+F1 Score: 0.65739
+Precision: 0.63652
+Recall: 0.72150
+********************************************************************************
+Fold: 3
+Accuracy: 0.72038
+F1 Score: 0.65781
+Precision: 0.63249
+Recall: 0.72038
+********************************************************************************
+Fold: 4
+Accuracy: 0.74167
+F1 Score: 0.68167
+Precision: 0.65489
+Recall: 0.74167
+********************************************************************************
+Fold: 5
+Accuracy: 0.67705
+F1 Score: 0.61273
+Precision: 0.59472
+Recall: 0.67705
--- a/train/classification_bert_complete_desc_unit_name/classification_prediction/predict.py
+++ b/train/classification_bert_complete_desc_unit_name/classification_prediction/predict.py
@ -0,0 +1,248 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+import glob
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from torch.utils.data import DataLoader
+
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding,
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+from tqdm import tqdm
+
+torch.set_float32_matmul_precision('high')
+
+
+# number of samples per batch handed to the DataLoader in test()
+BATCH_SIZE = 256
+
+# %%
+
+# build the list of target classes (mdm_list) from the full mdm-only export
+data_path = '../../../data_import/exports/data_mapping_mdm.csv'
+full_df = pd.read_csv(data_path, skipinitialspace=True)
+# a class name is the thing string directly followed by the property string
+# (the 'pattern' column is not used)
+thing_property = (full_df['thing'] + full_df['property']).to_list()
+# de-duplicate, then sort so that class ids come out in a fixed order
+mdm_list = sorted(set(thing_property))
+
+
+# %%
+# lookup tables between class ids and class names
+id2label = dict(enumerate(mdm_list))
+label2id = {name: class_id for class_id, name in id2label.items()}
+
+# %%
+
+def process_df_to_dict(df, mdm_list):
+    """Convert dataframe rows into classifier samples.
+
+    Each row becomes a dictionary with
+    - 'text': tag name, tag description and unit, each wrapped in its marker
+      token (<NAME>, <DESC>, <UNIT>)
+    - 'label': position of thing + property in mdm_list, or -1 when the
+      combination is not in mdm_list
+
+    *arguments*
+    - df: dataframe with columns 'tag_name', 'tag_description', 'unit',
+      'thing' and 'property'
+    - mdm_list: list of known class names
+
+    Returns a list of dictionaries, one per row, in row order.
+    """
+    # build the name -> position table once instead of scanning mdm_list for
+    # every row; setdefault keeps the first occurrence, like list.index does
+    label_lookup = {}
+    for position, class_name in enumerate(mdm_list):
+        label_lookup.setdefault(class_name, position)
+
+    output_list = []
+    for _, row in df.iterrows():
+        name = f"<NAME>{row['tag_name']}<NAME>"
+        desc = f"<DESC>{row['tag_description']}<DESC>"
+        unit = f"<UNIT>{row['unit']}<UNIT>"
+
+        pattern = f"{row['thing'] + row['property']}"
+        element = {
+            'text' : f"{name}{desc}{unit}",
+            # unknown combinations are marked with -1
+            'label': label_lookup.get(pattern, -1),
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_dataset(fold, mdm_list):
+    """Load the test split of a fold and turn its MDM rows into a Dataset.
+
+    *arguments*
+    - fold: fold number used to locate group_<fold>/test_all.csv
+    - mdm_list: list of known class names, forwarded to process_df_to_dict
+    """
+    frame = pd.read_csv(
+        f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv",
+        skipinitialspace=True,
+    )
+    # rows outside the mdm subset are dropped before the samples are built
+    mdm_rows = frame[frame['MDM']].reset_index(drop=True)
+    samples = process_df_to_dict(mdm_rows, mdm_list)
+    return Dataset.from_list(samples)
+
+
+# %%
+
+# function to perform evaluation for a given fold
+def test(fold):
+    """Evaluate the checkpoint of one fold on its test split.
+
+    Predicts a class for every sample produced by create_dataset and appends
+    accuracy plus weighted F1 / precision / recall to output.txt.
+
+    *arguments*
+    - fold: fold number; selects both the test data and ../checkpoint_fold_<fold>
+    """
+    test_dataset = create_dataset(fold, mdm_list)
+
+    # prepare tokenizer
+
+    checkpoint_directory = f'../checkpoint_fold_{fold}'
+    # Use glob to find matching paths
+    # path is usually checkpoint_fold_1/checkpoint-<step number>
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
+    # Add the additional special tokens to the tokenizer
+    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    # %%
+    # report the longest tokenized sample so that any truncation is visible
+    longest = 0
+    for sample in test_dataset['text']:
+        input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
+        longest = max(longest, len(input_ids))
+
+    print(longest)
+
+    # %%
+
+    max_length = 128
+    if longest > max_length:
+        print(f"Warning: samples longer than {max_length} tokens will be truncated")
+
+    # given a batch of dataset entries, run the text through the tokenizer
+    def preprocess_function(example):
+        # the 'label' column is left as-is; only 'text' is tokenized
+        return tokenizer(
+            example['text'],
+            max_length=max_length,
+            # every row must have exactly max_length tokens to be batched;
+            # without truncation a longer sample would produce a longer row
+            truncation=True,
+            padding='max_length',
+        )
+
+    # map maps function to each "row" in the dataset
+    # aka the data in the immediate nesting
+    datasets = test_dataset.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns="text",
+    )
+
+    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+    # %%
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_checkpoint,
+        num_labels=len(mdm_list),
+        id2label=id2label,
+        label2id=label2id)
+    # important! after extending tokens vocab
+    model.resize_token_embeddings(len(tokenizer))
+
+    model = model.eval()
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model.to(device)
+
+    # plain python ints, one per sample, in dataset order
+    pred_labels = []
+    actual_labels = []
+
+    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
+    for batch in tqdm(dataloader):
+        # save labels too
+        actual_labels.extend(batch['label'].tolist())
+
+        # Move to GPU if available
+        input_ids = batch['input_ids'].to(device)
+        attention_mask = batch['attention_mask'].to(device)
+
+        # Perform inference
+        with torch.no_grad():
+            logits = model(
+                input_ids=input_ids,
+                attention_mask=attention_mask).logits
+        pred_labels.extend(logits.argmax(dim=1).to("cpu").tolist())
+
+    # %%
+    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+    y_true = actual_labels
+    y_pred = pred_labels
+
+    # Compute metrics
+    accuracy = accuracy_score(y_true, y_pred)
+    average_parameter = 'weighted'
+    zero_division_parameter = 0
+    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
+
+    # append, so that the results of all folds end up in the same file
+    with open("output.txt", "a") as f:
+
+        print('*' * 80, file=f)
+        print(f'Fold: {fold}', file=f)
+        # Print the results
+        print(f'Accuracy: {accuracy:.5f}', file=f)
+        print(f'F1 Score: {f1:.5f}', file=f)
+        print(f'Precision: {precision:.5f}', file=f)
+        print(f'Recall: {recall:.5f}', file=f)
+
+
+# %%
+# start from a file that holds a single blank line; test() appends to it
+with open("output.txt", "w") as f:
+    f.write('\n')
+
+# evaluate every fold in order
+for fold in (1, 2, 3, 4, 5):
+    test(fold)
--- a/train/classification_bert_complete_desc_unit_name/train.py
+++ b/train/classification_bert_complete_desc_unit_name/train.py
@ -0,0 +1,218 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding,
+    Trainer,
+    EarlyStoppingCallback,
+    TrainingArguments
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+
+
+torch.set_float32_matmul_precision('high')
+
+# %%
+
+# build the list of target classes (mdm_list) from the full mdm-only export
+data_path = '../../data_import/exports/data_mapping_mdm.csv'
+full_df = pd.read_csv(data_path, skipinitialspace=True)
+# a class name is the thing string directly followed by the property string
+# (the 'pattern' column is not used)
+thing_property = (full_df['thing'] + full_df['property']).to_list()
+# de-duplicate, then sort so that class ids come out in a fixed order
+mdm_list = sorted(set(thing_property))
+
+
+# %%
+# lookup tables between class ids and class names
+id2label = dict(enumerate(mdm_list))
+label2id = {name: class_id for class_id, name in id2label.items()}
+
+# %%
+
+def process_df_to_dict(df, mdm_list):
+    """Convert dataframe rows into classifier samples.
+
+    Each row becomes a dictionary with
+    - 'text': tag name, tag description and unit, each wrapped in its marker
+      token (<NAME>, <DESC>, <UNIT>)
+    - 'label': position of thing + property in mdm_list, or -1 when the
+      combination is not in mdm_list (a message is printed in that case)
+
+    *arguments*
+    - df: dataframe with columns 'tag_name', 'tag_description', 'unit',
+      'thing' and 'property'
+    - mdm_list: list of known class names
+
+    Returns a list of dictionaries, one per row, in row order.
+    """
+    # build the name -> position table once instead of scanning mdm_list for
+    # every row; setdefault keeps the first occurrence, like list.index does
+    label_lookup = {}
+    for position, class_name in enumerate(mdm_list):
+        label_lookup.setdefault(class_name, position)
+
+    output_list = []
+    for _, row in df.iterrows():
+        name = f"<NAME>{row['tag_name']}<NAME>"
+        desc = f"<DESC>{row['tag_description']}<DESC>"
+        unit = f"<UNIT>{row['unit']}<UNIT>"
+        pattern = f"{row['thing'] + row['property']}"
+        index = label_lookup.get(pattern, -1)
+        if index == -1:
+            print("Error: value not found in MDM list")
+        element = {
+            'text' : f"{name}{desc}{unit}",
+            'label': index,
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_split_dataset(fold, mdm_list):
+    """Load the train and validation csv files of a fold as a DatasetDict.
+
+    *arguments*
+    - fold: fold number used to locate the group_<fold> directory
+    - mdm_list: list of known class names, forwarded to process_df_to_dict
+    """
+    train_frame = pd.read_csv(
+        f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv",
+        skipinitialspace=True,
+    )
+    validation_frame = pd.read_csv(
+        f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv",
+        skipinitialspace=True,
+    )
+
+    # both splits go through the same row -> sample conversion
+    train_samples = process_df_to_dict(train_frame, mdm_list)
+    validation_samples = process_df_to_dict(validation_frame, mdm_list)
+
+    return DatasetDict({
+        'train': Dataset.from_list(train_samples),
+        'validation' : Dataset.from_list(validation_samples),
+    })
+
+
+# %%
+
+# function to perform training for a given fold
+def train(fold):
+    """Fine-tune a BERT sequence classifier on the data of one fold.
+
+    The base model is google-bert/bert-base-cased with one output per entry
+    of mdm_list; checkpoints are written to checkpoint_fold_<fold>.
+    """
+    output_directory = f'checkpoint_fold_{fold}'
+    raw_datasets = create_split_dataset(fold, mdm_list)
+
+    # tokenizer of the base model, extended with the marker tokens
+    # (alternative base model: "distilbert/distilbert-base-uncased")
+    model_checkpoint = 'google-bert/bert-base-cased'
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_checkpoint,
+        return_tensors="pt",
+        clean_up_tokenization_spaces=True,
+    )
+    tokenizer.add_special_tokens({
+        "additional_special_tokens": [
+            "<THING_START>", "<THING_END>",
+            "<PROPERTY_START>", "<PROPERTY_END>",
+            "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>",
+        ],
+    })
+
+    max_length = 120
+
+    def preprocess_function(example):
+        # only the 'text' column is tokenized; 'label' is carried over as-is
+        return tokenizer(
+            example['text'],
+            max_length=max_length,
+            truncation=True,
+            padding=True,
+        )
+
+    # tokenize both splits in batches and drop the raw text afterwards
+    tokenized_datasets = raw_datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns="text",
+    )
+
+    # %%
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+    # %%
+    # accuracy over argmax predictions
+    metric = evaluate.load("accuracy")
+
+    def compute_metrics(eval_preds):
+        logits, references = eval_preds
+        predictions = np.argmax(logits, axis=1)
+        return metric.compute(predictions=predictions, references=references)
+
+    # %%
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_checkpoint,
+        num_labels=len(mdm_list),
+        id2label=id2label,
+        label2id=label2id,
+    )
+    # important! after extending tokens vocab
+    model.resize_token_embeddings(len(tokenizer))
+
+    # %%
+    # Trainer
+    trainer_arguments = TrainingArguments(
+        output_dir=output_directory,
+        # no evaluation pass is scheduled during training
+        eval_strategy="no",
+        logging_dir="tensorboard-log",
+        logging_strategy="epoch",
+        load_best_model_at_end=False,
+        learning_rate=1e-5,
+        per_device_train_batch_size=128,
+        per_device_eval_batch_size=128,
+        auto_find_batch_size=False,
+        ddp_find_unused_parameters=False,
+        weight_decay=0.01,
+        save_total_limit=1,
+        num_train_epochs=80,
+        bf16=True,
+        push_to_hub=False,
+        remove_unused_columns=False,
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=trainer_arguments,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["validation"],
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        compute_metrics=compute_metrics,
+    )
+
+    # to continue an earlier run instead, pass
+    # resume_from_checkpoint=<checkpoint path> to trainer.train()
+    trainer.train()
+
+# train one model per fold, announcing the fold number first
+for fold in range(1, 6):
+    print(fold)
+    train(fold)
+
+
+# %%
--- a/train/classification_bert_pattern_desc/train.py
+++ b/train/classification_bert_pattern_desc/train.py
@ -52,7 +52,7 @@ for idx, val in enumerate(mdm_list):
 def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
-        desc = f"{row['tag_description']}"
+        desc = f"<DESC>{row['tag_description']}<DESC>"
        pattern = row['pattern']
        try:
            index = mdm_list.index(pattern)
--- a/train/mapping_t5_complete_desc/.gitignore
+++ b/train/mapping_t5_complete_desc/.gitignore
@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log
--- a/train/mapping_t5_complete_desc/mapping_prediction/.gitignore
+++ b/train/mapping_t5_complete_desc/mapping_prediction/.gitignore
@ -0,0 +1,2 @@
+__pycache__
+exports/
--- a/train/mapping_t5_complete_desc/mapping_prediction/inference.py
+++ b/train/mapping_t5_complete_desc/mapping_prediction/inference.py
@ -0,0 +1,168 @@
+import torch
+from torch.utils.data import DataLoader
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+)
+import os
+from tqdm import tqdm
+from datasets import Dataset
+import numpy as np
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+
+class Inference():
+    """Runs a fine-tuned T5 checkpoint over a dataframe and decodes the
+    generated thing / property strings."""
+    tokenizer: T5TokenizerFast
+    model: torch.nn.Module
+    dataloader: DataLoader
+
+    def __init__(self, checkpoint_path):
+        """Create the tokenizer and load the model stored at checkpoint_path."""
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+
+
+    def _create_tokenizer(self):
+        """Load the t5-small tokenizer and register the marker tokens."""
+        # load tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # Define additional special tokens
+        # the list has to be the same as the one used for training: a marker
+        # written without angle brackets (e.g. "UNIT") would turn that plain
+        # word inside a description into a special token
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
+        # Add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        """Load the seq2seq model from checkpoint_path in eval mode."""
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # set model to eval
+        self.model = model.eval()
+
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """
+        Tokenize input_df and store the resulting DataLoader in self.dataloader
+
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output
+        """
+        print("preparing dataloader")
+
+        # convert each dataframe row into a dictionary
+        # outputs a list of dictionaries
+        def _process_df(df):
+            output_list = []
+            for _, row in df.iterrows():
+                # the model of this directory is trained on the description
+                # only, so the inference input is built the same way
+                desc = f"<DESC>{row['tag_description']}<DESC>"
+                element = {
+                    'input' : f"{desc}",
+                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+                }
+                output_list.append(element)
+
+            return output_list
+
+        def _preprocess_function(example):
+            # text_target sets the corresponding label to inputs
+            # there is no need to create a separate 'labels'
+            return self.tokenizer(
+                example['input'],
+                text_target=example['output'],
+                max_length=max_length,
+                return_tensors="pt",
+                padding='max_length',
+                truncation=True,
+            )
+
+        test_dataset = Dataset.from_list(_process_df(input_df))
+
+        # map maps function to each "row" in the dataset
+        # aka the data in the immediate nesting
+        datasets = test_dataset.map(
+            _preprocess_function,
+            batched=True,
+            num_proc=1,
+            remove_columns=test_dataset.column_names,
+        )
+        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+        # create dataloader
+        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+
+
+    def generate(self):
+        """
+        Generate predictions for every batch of self.dataloader
+
+        prepare_dataloader has to be called first.
+
+        *returns*
+        - thing_prediction_list: decoded text between <THING_START> and
+          <THING_END> per sample, None when either marker is missing
+        - property_prediction_list: same for <PROPERTY_START> / <PROPERTY_END>
+        """
+        # NOTE(review): the second GPU is hard-coded here - confirm that this
+        # is intended on machines with a single GPU
+        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
+        MAX_GENERATE_LENGTH = 128
+
+        # the model only has to be moved once, not for every batch
+        self.model.to(device)
+
+        pred_generations = []
+
+        print("start generation")
+        for batch in tqdm(self.dataloader):
+            # Move to GPU if available
+            input_ids = batch['input_ids'].to(device)
+            attention_mask = batch['attention_mask'].to(device)
+
+            # Perform inference
+            with torch.no_grad():
+                outputs = self.model.generate(input_ids,
+                                        attention_mask=attention_mask,
+                                        max_length=MAX_GENERATE_LENGTH)
+
+                # keep the generated token ids, one tensor per sample
+                pred_generations.extend(outputs.to("cpu"))
+
+        # extract sequence and decode
+        def extract_seq(tokens, start_value, end_value):
+            # None marks a generation that lacks one of the two markers
+            if start_value not in tokens or end_value not in tokens:
+                return None
+            start_id = np.where(tokens == start_value)[0][0]
+            end_id = np.where(tokens == end_value)[0][0]
+
+            return tokens[start_id+1:end_id]
+
+
+        def process_tensor_output(tokens):
+            thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END>
+            property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
+            p_thing = None
+            p_property = None
+            if thing_seq is not None:
+                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
+            if property_seq is not None:
+                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
+            return p_thing, p_property
+
+        # decode prediction labels
+        def decode_preds(tokens_list):
+            thing_prediction_list = []
+            property_prediction_list = []
+            for tokens in tokens_list:
+                p_thing, p_property = process_tensor_output(tokens)
+                thing_prediction_list.append(p_thing)
+                property_prediction_list.append(p_property)
+            return thing_prediction_list, property_prediction_list
+
+        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
+        return thing_prediction_list, property_prediction_list
+
--- a/train/mapping_t5_complete_desc/mapping_prediction/output.txt
+++ b/train/mapping_t5_complete_desc/mapping_prediction/output.txt
@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9455750118315192
+Accuracy for fold 2: 0.8864485981308411
+Accuracy for fold 3: 0.9558232931726908
+Accuracy for fold 4: 0.9686013320647003
+Accuracy for fold 5: 0.896930829134219
--- a/train/mapping_t5_complete_desc/mapping_prediction/output_with_abbreviation.txt
+++ b/train/mapping_t5_complete_desc/mapping_prediction/output_with_abbreviation.txt
@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9588263132986276
+Accuracy for fold 2: 0.9182242990654206
+Accuracy for fold 3: 0.9633534136546185
+Accuracy for fold 4: 0.9809705042816366
+Accuracy for fold 5: 0.8891433806688044
--- a/train/mapping_t5_complete_desc/mapping_prediction/predict.py
+++ b/train/mapping_t5_complete_desc/mapping_prediction/predict.py
@ -0,0 +1,73 @@
+
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+# directory that holds the checkpoint_fold_<fold> folders
+checkpoint_directory =  '../'
+
+# batch size passed to Inference.prepare_dataloader
+BATCH_SIZE = 512
+
+def infer_and_select(fold):
+    """Run T5 inference on the test split of a fold and record the accuracy.
+
+    The test dataframe extended with the columns 'p_thing' and 'p_property'
+    is written to exports/result_group_<fold>.csv, and the share of MDM rows
+    with both thing and property predicted correctly is appended to
+    output.txt.
+    """
+    print(f"Inference for fold {fold}")
+    # import test data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+
+    ##########################################
+    # run inference
+    # checkpoint
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+    # Use glob to find matching paths
+    # path is usually checkpoint_fold_1/checkpoint-<step number>
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # Convert the lists to a Pandas DataFrame and attach them column-wise
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    df = pd.concat([df, df_out], axis=1)
+
+    # we can save the t5 generation output here
+    # exports/ is git-ignored, so it may not exist on a fresh checkout
+    os.makedirs("exports", exist_ok=True)
+    df.to_csv(f"exports/result_group_{fold}.csv", index=False)
+
+    # here we want to evaluate mapping accuracy within the valid in mdm data only
+    in_mdm = df['MDM']
+    condition_correct_thing = df['p_thing'] == df['thing']
+    condition_correct_property = df['p_property'] == df['property']
+    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm)
+    pred_correct_proportion = prediction_mdm_correct/sum(in_mdm)
+
+    # write output to file output.txt
+    with open("output.txt", "a") as f:
+        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
+
+###########################################  
+# Execute for all folds
+
+# start from a file that holds a single blank line; each fold appends to it
+with open("output.txt", "w") as f:
+    f.write('\n')
+
+# run inference for every fold in order
+for fold in (1, 2, 3, 4, 5):
+    infer_and_select(fold)
--- a/train/mapping_t5_complete_desc/train.py
+++ b/train/mapping_t5_complete_desc/train.py
@ -0,0 +1,196 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+    DataCollatorForSeq2Seq,
+    Seq2SeqTrainer,
+    EarlyStoppingCallback,
+    Seq2SeqTrainingArguments
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+
+
+torch.set_float32_matmul_precision('high')
+
+def process_df_to_dict(df):
+    """Convert dataframe rows into seq2seq samples.
+
+    Each row becomes a dictionary with
+    - 'input': the tag description wrapped in <DESC> markers
+    - 'output': thing and property wrapped in their start / end markers
+
+    Returns a list of dictionaries, one per row, in row order.
+    """
+    return [
+        {
+            'input' : f"<DESC>{row['tag_description']}<DESC>",
+            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+        }
+        for _, row in df.iterrows()
+    ]
+
+
+def create_split_dataset(fold):
+    """Load the train and validation csv files of a fold as a DatasetDict.
+
+    *arguments*
+    - fold: fold number used to locate the group_<fold> directory
+    """
+    train_frame = pd.read_csv(
+        f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv",
+        skipinitialspace=True,
+    )
+    validation_frame = pd.read_csv(
+        f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv",
+        skipinitialspace=True,
+    )
+
+    # both splits go through the same row -> sample conversion
+    train_samples = process_df_to_dict(train_frame)
+    validation_samples = process_df_to_dict(validation_frame)
+
+    return DatasetDict({
+        'train': Dataset.from_list(train_samples),
+        'validation' : Dataset.from_list(validation_samples),
+    })
+
+
+# function to perform training for a given fold
+def train(fold):
+    """Fine-tune t5-small on the description -> thing / property task.
+
+    Uses the train and validation splits of the given fold; checkpoints are
+    written to checkpoint_fold_<fold>.
+    """
+    output_directory = f'checkpoint_fold_{fold}'
+    raw_datasets = create_split_dataset(fold)
+
+    # tokenizer of the base model, extended with the marker tokens
+    model_checkpoint = "t5-small"
+    tokenizer = T5TokenizerFast.from_pretrained(
+        model_checkpoint,
+        return_tensors="pt",
+        clean_up_tokenization_spaces=True,
+    )
+    tokenizer.add_special_tokens({
+        "additional_special_tokens": [
+            "<THING_START>", "<THING_END>",
+            "<PROPERTY_START>", "<PROPERTY_END>",
+            "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>",
+        ],
+    })
+
+    max_length = 120
+
+    def preprocess_function(example):
+        # text_target makes the tokenizer emit 'labels' next to the inputs,
+        # so no separate label column has to be built
+        return tokenizer(
+            example['input'],
+            text_target=example['output'],
+            max_length=max_length,
+            truncation=True,
+            padding=True,
+        )
+
+    # tokenize both splits in batches and drop the raw columns afterwards
+    tokenized_datasets = raw_datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns=raw_datasets["train"].column_names,
+    )
+
+    # https://github.com/huggingface/transformers/pull/28414
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+    # important! after extending tokens vocab
+    model.resize_token_embeddings(len(tokenizer))
+
+    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+    metric = evaluate.load("sacrebleu")
+
+    def compute_metrics(eval_preds):
+        generated, references = eval_preds
+        # In case the model returns more than the prediction logits
+        if isinstance(generated, tuple):
+            generated = generated[0]
+
+        def without_padding(text):
+            # drop pad tokens from a decoded string and trim whitespace
+            return text.replace(tokenizer.pad_token, '').strip()
+
+        generated_texts = tokenizer.batch_decode(generated, skip_special_tokens=False)
+
+        # -100 cannot be decoded, so it is swapped for the pad token id
+        references = np.where(references != -100, references, tokenizer.pad_token_id)
+        reference_texts = tokenizer.batch_decode(references, skip_special_tokens=False)
+
+        cleaned_predictions = [without_padding(text) for text in generated_texts]
+        # each prediction gets a list holding its single reference
+        cleaned_references = [[without_padding(text)] for text in reference_texts]
+
+        result = metric.compute(predictions=cleaned_predictions, references=cleaned_references)
+        return {"bleu": result["score"]}
+
+    # Generation Config
+    gen_config = model.generation_config
+    gen_config.max_length = 64
+
+    # Trainer
+    trainer_arguments = Seq2SeqTrainingArguments(
+        output_dir=output_directory,
+        # no evaluation pass is scheduled during training
+        eval_strategy="no",
+        logging_dir="tensorboard-log",
+        logging_strategy="epoch",
+        load_best_model_at_end=False,
+        learning_rate=1e-3,
+        per_device_train_batch_size=128,
+        per_device_eval_batch_size=128,
+        auto_find_batch_size=False,
+        ddp_find_unused_parameters=False,
+        weight_decay=0.01,
+        save_total_limit=1,
+        num_train_epochs=40,
+        predict_with_generate=True,
+        bf16=True,
+        push_to_hub=False,
+        generation_config=gen_config,
+        remove_unused_columns=False,
+    )
+
+    trainer = Seq2SeqTrainer(
+        model=model,
+        args=trainer_arguments,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["validation"],
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+    )
+
+    # to continue an earlier run instead, pass
+    # resume_from_checkpoint=<checkpoint path> to trainer.train()
+    trainer.train()
+
+# train one model per fold, announcing the fold number first
+for fold in range(1, 6):
+    print(fold)
+    train(fold)
+
--- a/train/mapping_t5_complete_desc_unit/mapping_prediction/output.txt
+++ b/train/mapping_t5_complete_desc_unit/mapping_prediction/output.txt
@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9522006625650734
+Accuracy for fold 2: 0.9093457943925234
+Accuracy for fold 3: 0.9678714859437751
+Accuracy for fold 4: 0.9814462416745956
+Accuracy for fold 5: 0.890975721484196
--- a/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py
+++ b/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py
@ -6,6 +6,8 @@ from inference import Inference

 checkpoint_directory =  '../'

+BATCH_SIZE = 512
+
 def infer_and_select(fold):
    print(f"Inference for fold {fold}")
    # import test data
@ -32,7 +34,7 @@ def infer_and_select(fold):


    infer = Inference(checkpoint_path)
-    infer.prepare_dataloader(df, batch_size=256, max_length=128)
+    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
    thing_prediction_list, property_prediction_list = infer.generate()

    # add labels too
--- a/train/mapping_t5_complete_desc_unit_name/.gitignore
+++ b/train/mapping_t5_complete_desc_unit_name/.gitignore
@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log
--- a/train/mapping_t5_complete_desc_unit_name/mapping_prediction/.gitignore
+++ b/train/mapping_t5_complete_desc_unit_name/mapping_prediction/.gitignore
@ -0,0 +1,2 @@
+__pycache__
+exports/
--- a/train/mapping_t5_complete_desc_unit_name/mapping_prediction/inference.py
+++ b/train/mapping_t5_complete_desc_unit_name/mapping_prediction/inference.py
@ -0,0 +1,169 @@
+import torch
+from torch.utils.data import DataLoader
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+)
+import os
+from tqdm import tqdm
+from datasets import Dataset
+import numpy as np
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+
+class Inference():
+    """Batch inference wrapper around a fine-tuned T5 mapping checkpoint.
+
+    Usage: construct with a checkpoint path, call `prepare_dataloader` with the
+    rows to map, then call `generate` to obtain the predicted 'thing' and
+    'property' strings.
+    """
+    # t5-small tokenizer extended with the task's marker tokens
+    tokenizer: T5TokenizerFast
+    # compiled seq2seq model loaded from the checkpoint, in eval mode
+    model: torch.nn.Module
+    # created by prepare_dataloader and consumed by generate
+    dataloader: DataLoader
+
+    def __init__(self, checkpoint_path):
+        """Create the tokenizer and load the model stored at `checkpoint_path`."""
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+
+
+    def _create_tokenizer(self):
+        """Create the t5-small tokenizer and register the marker tokens."""
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # These must be the same tokens, in the same order, as the ones added in
+        # train.py: the ids assigned here have to match the embedding rows the
+        # checkpoint was trained with, and the input markers (<NAME>, <DESC>,
+        # <UNIT>) must be tokenized exactly as they were during training.
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        """Load the seq2seq model from `checkpoint_path`, compile it, set eval mode."""
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # eval mode: inference only
+        self.model = model.eval()
+
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """Tokenize `input_df` and store a DataLoader over it in `self.dataloader`.
+
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_name',
+          'tag_description', 'unit', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output; inputs and labels are
+          truncated and padded to exactly this length
+        """
+        print("preparing dataloader")
+
+        def _process_df(df):
+            # one {'input', 'output'} dictionary per dataframe row
+            output_list = []
+            for _, row in df.iterrows():
+                name = f"<NAME>{row['tag_name']}<NAME>"
+                desc = f"<DESC>{row['tag_description']}<DESC>"
+                unit = f"<UNIT>{row['unit']}<UNIT>"
+                output_list.append({
+                    'input' : f"{name}{desc}{unit}",
+                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+                })
+            return output_list
+
+        def _preprocess_function(example):
+            # text_target makes the tokenizer emit 'labels' alongside the
+            # inputs, so no separate label encoding step is needed
+            return self.tokenizer(
+                example['input'],
+                text_target=example['output'],
+                max_length=max_length,
+                return_tensors="pt",
+                padding='max_length',
+                truncation=True,
+            )
+
+        test_dataset = Dataset.from_list(_process_df(input_df))
+
+        # batched map: the tokenizer receives lists of rows at a time
+        datasets = test_dataset.map(
+            _preprocess_function,
+            batched=True,
+            num_proc=1,
+            remove_columns=test_dataset.column_names,
+        )
+        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+        # no shuffling: predictions must stay aligned with the input rows
+        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+
+
+    def generate(self):
+        """Generate predictions for the prepared dataloader and decode them.
+
+        `prepare_dataloader` must have been called first.
+
+        Returns a tuple (thing_prediction_list, property_prediction_list) with
+        one entry per input row, in dataloader order. An entry is None when the
+        generated sequence lacks the corresponding start or end marker.
+        """
+        if torch.cuda.is_available():
+            # keep the historical preference for the second GPU, but fall back
+            # to the first one when only a single device is visible
+            gpu_index = 1 if torch.cuda.device_count() > 1 else 0
+            device = torch.device(f'cuda:{gpu_index}')
+        else:
+            device = torch.device('cpu')
+        MAX_GENERATE_LENGTH = 128
+
+        # move the model once, not once per batch
+        self.model.to(device)
+
+        pred_generations = []
+
+        print("start generation")
+        with torch.no_grad():
+            for batch in tqdm(self.dataloader):
+                input_ids = batch['input_ids'].to(device)
+                attention_mask = batch['attention_mask'].to(device)
+
+                outputs = self.model.generate(input_ids,
+                                        attention_mask=attention_mask,
+                                        max_length=MAX_GENERATE_LENGTH)
+
+                # one 1-D token tensor per generated row
+                pred_generations.extend(outputs.to("cpu"))
+
+        # look the marker ids up instead of hard-coding them, so they always
+        # agree with the tokenizer built in _create_tokenizer
+        to_id = self.tokenizer.convert_tokens_to_ids
+        thing_start, thing_end = to_id("<THING_START>"), to_id("<THING_END>")
+        property_start, property_end = to_id("<PROPERTY_START>"), to_id("<PROPERTY_END>")
+
+        def extract_seq(tokens, start_value, end_value):
+            # tokens strictly between the first start marker and the first end
+            # marker, or None when either marker is missing
+            if start_value not in tokens or end_value not in tokens:
+                return None
+            start_id = np.where(tokens == start_value)[0][0]
+            end_id = np.where(tokens == end_value)[0][0]
+            return tokens[start_id+1:end_id]
+
+        def decode_or_none(seq):
+            if seq is None:
+                return None
+            return self.tokenizer.decode(seq, skip_special_tokens=False)
+
+        thing_prediction_list = []
+        property_prediction_list = []
+        for tokens in pred_generations:
+            thing_prediction_list.append(
+                decode_or_none(extract_seq(tokens, thing_start, thing_end)))
+            property_prediction_list.append(
+                decode_or_none(extract_seq(tokens, property_start, property_end)))
+
+        return thing_prediction_list, property_prediction_list
+
--- a/train/mapping_t5_complete_desc_unit_name/mapping_prediction/output.txt
+++ b/train/mapping_t5_complete_desc_unit_name/mapping_prediction/output.txt
@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9465215333648841
+Accuracy for fold 2: 0.9102803738317757
+Accuracy for fold 3: 0.9728915662650602
+Accuracy for fold 4: 0.9843006660323501
+Accuracy for fold 5: 0.8996793403573065
--- a/train/mapping_t5_complete_desc_unit_name/mapping_prediction/predict.py
+++ b/train/mapping_t5_complete_desc_unit_name/mapping_prediction/predict.py
@ -0,0 +1,73 @@
+
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+checkpoint_directory =  '../'
+
+BATCH_SIZE = 512
+
+def infer_and_select(fold):
+    """Run T5 inference on one fold's test set and record its MDM accuracy.
+
+    Reads the fold's test CSV, generates 'thing'/'property' predictions with
+    the fold's checkpoint, saves the rows plus predictions to
+    exports/result_group_{fold}.csv and appends the accuracy to output.txt.
+
+    Args:
+        fold: fold number; selects both the dataset group and the checkpoint
+            directory.
+
+    Raises:
+        FileNotFoundError: if no 'checkpoint-*' entry exists for the fold.
+    """
+    print(f"Inference for fold {fold}")
+    # import test data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+
+    ##########################################
+    # run inference
+    # path is usually checkpoint_fold_<fold>/checkpoint-<step number>;
+    # training keeps a single checkpoint (save_total_limit=1)
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+    checkpoint_candidates = glob.glob(os.path.join(directory, 'checkpoint-*'))
+    if not checkpoint_candidates:
+        raise FileNotFoundError(f"no checkpoint-* found in {directory}")
+    checkpoint_path = checkpoint_candidates[0]
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # attach the predictions as new columns next to the original rows
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    df = pd.concat([df, df_out], axis=1)
+
+    # save the t5 generation output; exports/ is git-ignored, so it may not
+    # exist on a fresh checkout
+    os.makedirs("exports", exist_ok=True)
+    df.to_csv(f"exports/result_group_{fold}.csv", index=False)
+
+    # mapping accuracy is evaluated on the rows flagged as in-MDM only:
+    # a row counts as correct when both thing and property match
+    in_mdm = df['MDM']
+    fully_correct = (df['p_thing'] == df['thing']) & (df['p_property'] == df['property'])
+    mdm_total = in_mdm.sum()
+    mdm_correct = (fully_correct & in_mdm).sum()
+    # no in-MDM rows: report nan instead of dividing by zero
+    pred_correct_proportion = mdm_correct / mdm_total if mdm_total else float('nan')
+
+    # append the result to output.txt
+    with open("output.txt", "a") as f:
+        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
+
+###########################################  
+# Execute for all folds
+
+# start from a fresh output.txt holding a single blank line; the per-fold
+# results are appended after it
+with open("output.txt", "w") as report:
+    report.write('\n')
+
+for fold_id in range(1, 6):
+    infer_and_select(fold_id)
--- a/train/mapping_t5_complete_desc_unit_name/train.py
+++ b/train/mapping_t5_complete_desc_unit_name/train.py
@ -0,0 +1,198 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+
+# NOTE(review): these variables are set before `import torch` below, presumably
+# so they are in place before CUDA/NCCL initialise; confirm before moving them.
+# Going by the variable names, the first two switch off NCCL's peer-to-peer and
+# InfiniBand transports; verify against the NCCL documentation.
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+# Order GPUs by PCI bus id and expose devices 0-3 to this process.
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+    DataCollatorForSeq2Seq,
+    Seq2SeqTrainer,
+    EarlyStoppingCallback,
+    Seq2SeqTrainingArguments
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+
+
+torch.set_float32_matmul_precision('high')
+
+def process_df_to_dict(df):
+    """Turn every dataframe row into an {'input', 'output'} example.
+
+    'input' is the tag name, description and unit, each wrapped in its marker
+    token; 'output' is the thing and property wrapped in start/end markers.
+    Returns a list with one dictionary per row, in row order.
+    """
+    def _to_example(row):
+        tagged_fields = (
+            f"<NAME>{row['tag_name']}<NAME>",
+            f"<DESC>{row['tag_description']}<DESC>",
+            f"<UNIT>{row['unit']}<UNIT>",
+        )
+        return {
+            'input': "".join(tagged_fields),
+            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+        }
+
+    return [_to_example(row) for _, row in df.iterrows()]
+
+
+def create_split_dataset(fold):
+    """Load one fold's train and validation CSVs into a DatasetDict.
+
+    The result has the keys 'train' (from train_all.csv) and 'validation'
+    (from valid.csv); each split holds the examples produced by
+    process_df_to_dict.
+    """
+    csv_by_split = {
+        'train': f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv",
+        'validation': f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv",
+    }
+
+    splits = {}
+    for split_name, csv_path in csv_by_split.items():
+        frame = pd.read_csv(csv_path, skipinitialspace=True)
+        splits[split_name] = Dataset.from_list(process_df_to_dict(frame))
+
+    return DatasetDict(splits)
+
+
+# function to perform training for a given fold
+def train(fold):
+    """Fine-tune t5-small on one fold and save checkpoints to checkpoint_fold_{fold}.
+
+    Steps: build the fold's datasets, extend the tokenizer with the marker
+    tokens, tokenize, resize the model embeddings to the new vocabulary, then
+    run Seq2SeqTrainer for 40 epochs.
+    """
+    save_path = f'checkpoint_fold_{fold}'
+    split_datasets = create_split_dataset(fold)
+
+    # prepare tokenizer
+
+    model_checkpoint = "t5-small"
+    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
+    # Add the additional special tokens to the tokenizer
+    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    # truncation limit passed to the tokenizer below
+    max_length = 120
+
+    # given a dataset entry, run it through the tokenizer
+    def preprocess_function(example):
+        """Tokenize a batch of examples; 'labels' come from text_target."""
+        input = example['input']
+        target = example['output']
+        # text_target sets the corresponding label to inputs
+        # there is no need to create a separate 'labels'
+        # NOTE(review): padding=True pads within each map batch, so the label
+        # padding is presumably pad_token_id rather than -100 -- confirm
+        # whether the loss is meant to include those positions.
+        model_inputs = tokenizer(
+            input,
+            text_target=target, 
+            max_length=max_length,
+            truncation=True,
+            padding=True
+        )
+        return model_inputs
+
+    # map maps function to each "row" in the dataset
+    # aka the data in the immediate nesting
+    tokenized_datasets = split_datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns=split_datasets["train"].column_names,
+    )
+
+    # https://github.com/huggingface/transformers/pull/28414
+    # model_checkpoint = "google/t5-efficient-tiny"
+    # device_map set to auto to force it to load contiguous weights 
+    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
+
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+    # important! after extending tokens vocab
+    model.resize_token_embeddings(len(tokenizer))
+
+    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+    metric = evaluate.load("sacrebleu")
+
+
+    def compute_metrics(eval_preds):
+        """Decode predictions and labels and return their sacreBLEU score."""
+        preds, labels = eval_preds
+        # In case the model returns more than the prediction logits
+        if isinstance(preds, tuple):
+            preds = preds[0]
+
+        decoded_preds = tokenizer.batch_decode(preds, 
+                                            skip_special_tokens=False)
+
+        # Replace -100s in the labels as we can't decode them
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+        decoded_labels = tokenizer.batch_decode(labels,
+                                                skip_special_tokens=False)
+
+        # Remove <PAD> tokens from decoded predictions and labels
+        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
+        # each reference is wrapped in its own list, as passed to metric.compute
+        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
+
+        # Some simple post-processing
+        # decoded_preds = [pred.strip() for pred in decoded_preds]
+        # decoded_labels = [[label.strip()] for label in decoded_labels]
+        # print(decoded_preds, decoded_labels)
+
+        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
+        return {"bleu": result["score"]}
+
+
+    # Generation Config
+    # from transformers import GenerationConfig
+    # reuse the model's own generation config, capping generation at 64 tokens
+    gen_config = model.generation_config
+    gen_config.max_length = 64
+
+    # compile
+    # model = torch.compile(model, backend="inductor", dynamic=True)
+
+
+    # Trainer
+
+    # NOTE(review): eval_strategy is "no", so eval_dataset and compute_metrics
+    # below are configured but presumably never used by trainer.train().
+    args = Seq2SeqTrainingArguments(
+        f"{save_path}",
+        # eval_strategy="epoch",
+        eval_strategy="no",
+        logging_dir="tensorboard-log",
+        logging_strategy="epoch",
+        # save_strategy="epoch",
+        load_best_model_at_end=False,
+        learning_rate=1e-3,
+        per_device_train_batch_size=128,
+        per_device_eval_batch_size=128,
+        auto_find_batch_size=False,
+        ddp_find_unused_parameters=False,
+        weight_decay=0.01,
+        save_total_limit=1,
+        num_train_epochs=40,
+        predict_with_generate=True,
+        bf16=True,
+        push_to_hub=False,
+        generation_config=gen_config,
+        remove_unused_columns=False,
+    )
+
+
+    trainer = Seq2SeqTrainer(
+        model,
+        args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["validation"],
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+    )
+
+    # uncomment to load training from checkpoint
+    # checkpoint_path = 'default_40_1/checkpoint-5600'
+    # trainer.train(resume_from_checkpoint=checkpoint_path)
+
+    trainer.train()
+
+# run the training routine once per cross-validation fold (1 through 5)
+for fold_id in range(1, 6):
+    print(fold_id)
+    train(fold_id)
+
--- a/train/predict.bash
+++ b/train/predict.bash
@ -0,0 +1,27 @@
+#!/bin/bash
+#
+# Run predict.py for every enabled model variant.
+# Each variant is run from inside its own prediction directory. A failing
+# variant is reported and the remaining ones still run; the exit status is 1
+# if any variant failed.
+
+# resolve the variant directories relative to this script, not the caller's cwd
+cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
+
+failures=0
+
+# run predict.py inside directory $1; the subshell keeps the cwd change local,
+# so a failed cd can never shift the directory used by later variants
+run_predict() {
+    (
+        cd "$1" || exit 1
+        micromamba run -n hug python predict.py
+    ) || {
+        echo "predict failed in $1" >&2
+        failures=$((failures + 1))
+    }
+}
+
+run_predict classification_bert_complete_desc/classification_prediction
+run_predict classification_bert_complete_desc_unit/classification_prediction
+run_predict classification_bert_complete_desc_unit_name/classification_prediction
+
+# disabled variants: uncomment to include them
+# run_predict mapping_t5_complete_desc/mapping_prediction
+# run_predict mapping_t5_complete_desc_unit/mapping_prediction
+# run_predict mapping_t5_complete_desc_unit_name/mapping_prediction
+
+exit "$((failures > 0))"
+
--- a/train/train.bash
+++ b/train/train.bash
@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Launch train.py through accelerate for every enabled model variant.
+# Each variant is run from inside its own directory. A failing variant is
+# reported and the remaining ones still run; the exit status is 1 if any
+# variant failed.
+
+# resolve the variant directories relative to this script, not the caller's cwd
+cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
+
+failures=0
+
+# launch train.py inside directory $1; the subshell keeps the cwd change local,
+# so a failed cd can never shift the directory used by later variants
+run_train() {
+    (
+        cd "$1" || exit 1
+        micromamba run -n hug accelerate launch train.py
+    ) || {
+        echo "training failed in $1" >&2
+        failures=$((failures + 1))
+    }
+}
+
+# disabled variants: uncomment to include them
+# run_train classification_bert_complete_desc
+# run_train classification_bert_complete_desc_unit
+
+run_train classification_bert_complete_desc_unit_name
+
+# run_train mapping_t5_complete_desc
+# run_train mapping_t5_complete_desc_unit
+# run_train mapping_t5_complete_desc_unit_name
+
+exit "$((failures > 0))"