From c5760d127de6cac0f92713476f9f13d1ab6083d9 Mon Sep 17 00:00:00 2001 From: Richard Wong Date: Wed, 18 Dec 2024 13:43:56 +0900 Subject: [PATCH] Feat: added post_processing based on rules others: - added basic data analysis to get histograms of text differences - added new final delivery model --- .gitignore | 3 +- analysis/categories/label_print.py | 17 ++ .../basic_eda.py | 2 +- analysis/data_properties/ship_counts.py | 71 ++++++ analysis/split_analysis/bgkf_vs_gkf.py | 38 +++ .../between_ship_and_platform.py | 72 ++++++ .../string_levenshtein/within_same_class.py | 88 +++++++ data_preprocess/split_data.py | 6 +- interpretation/fold_analysis_t5.py | 2 +- post_process/rule_based_correction/.gitignore | 3 + .../post_processing_rules.py | 154 ++++++++++++ .../post_processor_gawi.py | 233 ++++++++++++++++++ {end_to_end => production}/.gitignore | 0 end_to_end/.README.md => production/README.md | 0 .../end_to_end}/deduplication.py | 2 +- .../end_to_end}/mapper.py | 0 .../end_to_end/post_processing_rules.py | 158 ++++++++++++ .../end_to_end}/preprocess.py | 2 +- .../end_to_end}/replacement_dict.py | 0 .../end_to_end/rule_based_correction.py | 122 +++++++++ {end_to_end => production}/run.py | 17 +- .../.gitignore | 2 + .../mapping_t5_final_delivery_model/train.py | 196 +++++++++++++++ 23 files changed, 1178 insertions(+), 10 deletions(-) rename analysis/{ship_data_list => data_properties}/basic_eda.py (83%) create mode 100644 analysis/data_properties/ship_counts.py create mode 100644 analysis/split_analysis/bgkf_vs_gkf.py create mode 100644 analysis/string_levenshtein/between_ship_and_platform.py create mode 100644 analysis/string_levenshtein/within_same_class.py create mode 100644 post_process/rule_based_correction/.gitignore create mode 100644 post_process/rule_based_correction/post_processing_rules.py create mode 100644 post_process/rule_based_correction/post_processor_gawi.py rename {end_to_end => production}/.gitignore (100%) rename end_to_end/.README.md => production/README.md (100%) rename {end_to_end => production/end_to_end}/deduplication.py (99%) rename {end_to_end => production/end_to_end}/mapper.py (100%) create mode 100644 production/end_to_end/post_processing_rules.py rename {end_to_end => production/end_to_end}/preprocess.py (97%) rename {end_to_end => production/end_to_end}/replacement_dict.py (100%) create mode 100644 production/end_to_end/rule_based_correction.py rename {end_to_end => production}/run.py (82%) create mode 100644 train/mapping_t5_final_delivery_model/.gitignore create mode 100644 train/mapping_t5_final_delivery_model/train.py diff --git a/.gitignore b/.gitignore index 6f66c74..6282acb 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -*.zip \ No newline at end of file +*.zip +post_processor \ No newline at end of file diff --git a/analysis/categories/label_print.py b/analysis/categories/label_print.py index a69711c..29de0ec 100644 --- a/analysis/categories/label_print.py +++ b/analysis/categories/label_print.py @@ -7,6 +7,14 @@ full_df = pd.read_csv(data_path, skipinitialspace=True) mdm_list = sorted(list((set(full_df['pattern'])))) +# %% +full_df + +# %% +mask1 = full_df['thing'] == 'ME1TurboCharger1' +mask2 = full_df['property'] == 'LOInletPress' +mask = mask1 & mask2 +full_df[mask] # %% len(mdm_list) # %% @@ -16,3 +24,12 @@ tp_list = sorted(list(set(thing_property))) # %% len(tp_list) # %% +data_path = '../../data_import/exports/raw_data.csv' +df = pd.read_csv(data_path, skipinitialspace=True) + +# %% +bad_df = df[~df['MDM']] +# %% 
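+# a quick, illustrative look at how the non-MDM rows are labelled; this assumes the
+# unmapped points carry the literal placeholder '$UNMAPPED' in the 'thing' column
+bad_df['thing'].value_counts().head(20)
+
+# %%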
+bad_df[bad_df['thing'] == '$UNMAPPED'] + +# %% diff --git a/analysis/ship_data_list/basic_eda.py b/analysis/data_properties/basic_eda.py similarity index 83% rename from analysis/ship_data_list/basic_eda.py rename to analysis/data_properties/basic_eda.py index daa4224..3b0bf98 100644 --- a/analysis/ship_data_list/basic_eda.py +++ b/analysis/data_properties/basic_eda.py @@ -9,5 +9,5 @@ df = pd.read_csv(data_path) df # %% -set(df['signal_type']) +len(set(df['ships_idx'])) # %% diff --git a/analysis/data_properties/ship_counts.py b/analysis/data_properties/ship_counts.py new file mode 100644 index 0000000..6f46042 --- /dev/null +++ b/analysis/data_properties/ship_counts.py @@ -0,0 +1,71 @@ +# %% +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import os + +# note: we assume that you will execute from the directory of this code +# check your current directory +print("Current Working Directory:", os.getcwd()) + + +# %% +# plt.rcParams.update({'font.size': 18}) + +df = pd.read_csv('../../data_import/exports/raw_data.csv') +total_counts = df['ships_idx'].value_counts().sort_index() + +mdm_true_counts = df[df['MDM']]['ships_idx'].value_counts().sort_index() + +summary_df = pd.DataFrame({ + 'SD': total_counts, + 'PD': mdm_true_counts +}).fillna(0) + +total_SD = summary_df['SD'].sum() +total_PD = summary_df['PD'].sum() + +print(f"Total SD: {total_SD}") +print(f"Total PD: {total_PD}") + +# %% + +plt.figure(figsize=(8, 6)) +fig, ax = plt.subplots(figsize=(8, 6)) + +summary_df['SD'].plot( + kind='bar', + ax=ax, + color='orange', + alpha=0.5, + label='Ship Domain', + width=0.8) + +summary_df['PD'].plot( + kind='bar', + ax=ax, + color='blue', + alpha=0.7, + label='Platform Domain', + width=0.8) + +x_labels = ax.get_xticks() +ax.set_xticks(np.arange(min(x_labels), max(x_labels) + 1, 10)) +ax.set_xticklabels( + [int(label) for label in np.arange(min(x_labels), max(x_labels) + 1, 10)], + rotation=0, +) + +ax.grid(True) + +# plt.legend(prop={'size': 18}) +plt.legend() +plt.ylabel('Counts') +plt.xlabel('Ships') + +plt.savefig('count_statistics_of_each_ship.png') + +plt.show() + + +# %% diff --git a/analysis/split_analysis/bgkf_vs_gkf.py b/analysis/split_analysis/bgkf_vs_gkf.py new file mode 100644 index 0000000..692e1be --- /dev/null +++ b/analysis/split_analysis/bgkf_vs_gkf.py @@ -0,0 +1,38 @@ +# %% +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import os + +# note: we assume that you will execute from the directory of this code +# check your current directory +print("Current Working Directory:", os.getcwd()) + + + +# %% +data_path = "../../data_preprocess/exports/combined_group_allocation.csv" +df = pd.read_csv(data_path) + +# %% +df +# %% +print('mean', df[df['Allocation'] == 'BGKF']['Comb_count'].mean()) +print('std', df[df['Allocation'] == 'BGKF']['Comb_count'].std()) +max = df[df['Allocation'] == 'BGKF']['Comb_count'].max() +min = df[df['Allocation'] == 'BGKF']['Comb_count'].min() +print('max', max) +print('max', min) +print('max - min', max - min) + +# %% +print('mean', df[df['Allocation'] == 'GKF']['Comb_count'].mean()) +print('std', df[df['Allocation'] == 'GKF']['Comb_count'].std()) +max = df[df['Allocation'] == 'GKF']['Comb_count'].max() +min = df[df['Allocation'] == 'GKF']['Comb_count'].min() +print('max', max) +print('max', min) +print('max - min', max - min) + + +# %% diff --git a/analysis/string_levenshtein/between_ship_and_platform.py b/analysis/string_levenshtein/between_ship_and_platform.py new file mode 100644 index 0000000..02c796f --- 
/dev/null +++ b/analysis/string_levenshtein/between_ship_and_platform.py @@ -0,0 +1,72 @@ +# %% +import pandas as pd +import Levenshtein +import numpy as np +import matplotlib.pyplot as plt +from tqdm import tqdm + + +# %% +data_path = '../../data_import/exports/data_mapping_mdm.csv' +df = pd.read_csv(data_path, skipinitialspace=True) +mdm_list = sorted(list((set(df['pattern'])))) + + +# %% +df['thing_property'] = df['thing'] + df['property'] + +# %% +def compute_norm_leven(string1, string2): + max_distance = max(len(string1), len(string2)) + leven_distance = Levenshtein.distance(string1, string2) + norm_leven = leven_distance / max_distance + return norm_leven + +# %% +n = len(df) +distance_array = np.zeros((n), dtype=float) + +desc_array = df['tag_description'] +thing_property_array = df['thing_property'] + +# %% +# compute normalized levenshtein distance +for i in tqdm(range(n)): + string1 = desc_array[i] + string2 = thing_property_array[i] + distance_array[i] = compute_norm_leven(string1, string2) + +# %% +distance_array + + +# %% +plt.figure(figsize=(8, 6)) +plt.hist(distance_array, bins=30, color='steelblue', edgecolor='black', alpha=0.7) +plt.xlabel("Normalized Levenshtein Distance") +plt.ylabel("Count") +plt.tight_layout() +plt.savefig("histogram.png", dpi=300) +# +# %% +# summary statistics of computed levenshtein distance +def summary_stats(arr): + return { + "Mean": np.mean(arr), + "Median": np.median(arr), + "Standard Deviation": np.std(arr), + "Variance": np.var(arr), + "Min": np.min(arr), + "Max": np.max(arr), + "Range": np.ptp(arr), + "25th Percentile": np.percentile(arr, 25), + "75th Percentile": np.percentile(arr, 75), + "Sum": np.sum(arr), + } + +stats = summary_stats(distance_array) + +for key, value in stats.items(): + print(f"{key}: {value}") + +# %% diff --git a/analysis/string_levenshtein/within_same_class.py b/analysis/string_levenshtein/within_same_class.py new file mode 100644 index 0000000..eaf51d6 --- /dev/null +++ b/analysis/string_levenshtein/within_same_class.py @@ -0,0 +1,88 @@ +# %% +import pandas as pd +import Levenshtein +import numpy as np +import matplotlib.pyplot as plt +from tqdm import tqdm + + +# %% +data_path = '../../data_import/exports/data_mapping_mdm.csv' +df = pd.read_csv(data_path, skipinitialspace=True) +df['thing_property'] = df['thing'] + df['property'] +mdm_list = sorted(list((set(df['thing_property'])))) + + +# %% +def compute_norm_leven(string1, string2): + max_distance = max(len(string1), len(string2)) + leven_distance = Levenshtein.distance(string1, string2) + norm_leven = leven_distance / max_distance + return norm_leven + +def compute_avg_score(strings): + n = len(strings) + + # if group only has 1 string, then it is fully similar to itself + if n == 1: + return 0 + + # Create an empty matrix + distance_matrix = np.zeros((n, n), dtype=float) + + # Fill only the upper triangular part + for i in range(n): + for j in range(i + 1, n): + dist = compute_norm_leven(strings[i], strings[j]) + distance_matrix[i, j] = dist + + upper_triangular_distances = distance_matrix[np.triu_indices(n, k=1)] + mean_distance = np.mean(upper_triangular_distances) + return mean_distance + + +# %% +# we want to subset to each class +n = len(mdm_list) +score_list = np.zeros((n), dtype=float) + +for i in range(n): + df_subset = df[df['thing_property'] == mdm_list[i]] + strings = df_subset['tag_description'].to_numpy() + score_list[i] = compute_avg_score(strings) + + +# %% +score_list + + +# %% +# plt.hist(score_list, bins=50) +plt.figure(figsize=(8, 6)) 
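+# each entry of score_list is the mean pairwise normalized Levenshtein distance of the
+# tag_descriptions inside one thing+property class: 0.0 means every description in the
+# class is identical, values near 1.0 mean they share almost no characters.
+# minimal sanity check of compute_avg_score (illustrative strings only):
+#   compute_avg_score(np.array(['ABC', 'ABC']))  # identical strings    -> 0.0
+#   compute_avg_score(np.array(['ABC', 'XYZ']))  # no shared characters -> 1.0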
+plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7) +plt.xlabel("Normalized Levenshtein Distance") +plt.ylabel("Platform Domain Class Count") +plt.tight_layout() +plt.savefig("histogram.png", dpi=300) +# %% +# summary statistics of computed levenshtein distance +def summary_stats(arr): + return { + "Mean": np.mean(arr), + "Median": np.median(arr), + "Standard Deviation": np.std(arr), + "Variance": np.var(arr), + "Min": np.min(arr), + "Max": np.max(arr), + "Range": np.ptp(arr), + "25th Percentile": np.percentile(arr, 25), + "75th Percentile": np.percentile(arr, 75), + "Sum": np.sum(arr), + } + +stats = summary_stats(score_list) + +for key, value in stats.items(): + print(f"{key}: {value}") + +# %% diff --git a/data_preprocess/split_data.py b/data_preprocess/split_data.py index 50576e7..32d2326 100644 --- a/data_preprocess/split_data.py +++ b/data_preprocess/split_data.py @@ -181,6 +181,7 @@ remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values # Allocate remaining ships to the groups while len(remaining_ships) > 0: + # re-compute the counts for each group group_comb_counts = [] for g in range(num_groups): group_ships = groups[g] @@ -190,18 +191,21 @@ while len(remaining_ships) > 0: group_comb_counts.sort(key=lambda x: x[1]) + # reset the remaining_group list remaining_group = [] + # g is the identifier for the group for g, _ in group_comb_counts: if len(remaining_ships) == 0: break + # compute for each group, the selected ship, and the combined count increase if group_comb_counts.index((g, _)) == 0: selected_ship_idx, comb_increase = find_max_increase_ship(groups, g, remaining_ships, mdm_true) - else: max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true) selected_ship_idx, comb_increase = find_closest_comb_count_ship(groups, g, remaining_ships, mdm_true, max_comb_count) + # if the combined increase is 0, then we process it in a special manner if comb_increase == 0: remaining_group.append(g) else: diff --git a/interpretation/fold_analysis_t5.py b/interpretation/fold_analysis_t5.py index 9144972..8900b8d 100644 --- a/interpretation/fold_analysis_t5.py +++ b/interpretation/fold_analysis_t5.py @@ -2,7 +2,7 @@ # %% import pandas as pd import os -from inference import Inference, Embedder_t5_encoder, Embedder_t5_decoder +from inference import Inference, Embedder_t5_encoder import numpy as np from sklearn.manifold import TSNE import matplotlib.pyplot as plt diff --git a/post_process/rule_based_correction/.gitignore b/post_process/rule_based_correction/.gitignore new file mode 100644 index 0000000..11fdf6f --- /dev/null +++ b/post_process/rule_based_correction/.gitignore @@ -0,0 +1,3 @@ +*.csv +fold_* +__pycache__ \ No newline at end of file diff --git a/post_process/rule_based_correction/post_processing_rules.py b/post_process/rule_based_correction/post_processing_rules.py new file mode 100644 index 0000000..bbf723f --- /dev/null +++ b/post_process/rule_based_correction/post_processing_rules.py @@ -0,0 +1,154 @@ +# Shaft# class post-processing +shaft_rules = [ + { "conditions": [ + lambda x: "NO.1" in x, + lambda x: "SHAFT" in x + ], + "action": 'Shaft1' + }, + { "conditions": [ + lambda x: "NO.1" in x, + lambda x: "Shaft" in x + ], + "action": 'Shaft1' + }, + { "conditions": [ + lambda x: "NO.2" in x, + lambda x: "Shaft" in x + ], + "action": 'Shaft2' + }, + { "conditions": [ + lambda x: "NO.2" in x, + lambda x: "SHAFT" in x + ], + "action": 'Shaft2' + }, + + { "conditions": [ + lambda x: "NO.1" not in x, + lambda x: "NO.2" not 
in x, + lambda x: "SHAFT" not in x + ], + "action": 'Shaft1' + }, + + { "conditions": [ + lambda x: "NO.1" not in x, + lambda x: "NO.2" not in x, + lambda x: "SHAFT" in x, + lambda x: "(P)" in x + ], + "action": 'Shaft2' + }, + + { "conditions": [ + lambda x: "NO.1" not in x, + lambda x: "NO.2" not in x, + lambda x: "SHAFT" in x, + lambda x: "(S)" in x + ], + "action": 'Shaft1' + }, + + { "conditions": [ + lambda x: "NO.1" not in x, + lambda x: "NO.2" not in x, + lambda x: "SHAFT" in x, + lambda x: "(S)" not in x, + lambda x: "(P)" not in x + ], + "action": 'Shaft1' + }, +] + + +# ME# class post-processing +ME_rules = [ + { "conditions": [ + lambda x: "ME" in x, + lambda x: "(P)" not in x, + lambda x: "(S)" not in x, + lambda x: "GE" not in x, + lambda x: "FLOW" in x, + ], + "action": 'ME1Flow' + }, + { "conditions": [ + lambda x: "ME" in x, + lambda x: "(P)" in x, + lambda x: "FLOW" in x, + ], + "action": 'ME2Flow' + }, + + { "conditions": [ + lambda x: "ME" in x, + lambda x: "(S)" in x, + lambda x: "FLOW" in x, + ], + "action": 'ME1Flow' + }, + + + { "conditions": [ + lambda x: "ME" not in x, + lambda x: "GE" not in x, + lambda x: "(P)" not in x, + lambda x: "(S)" not in x, + lambda x: "FLOW" in x, + ], + "action": 'ME1Flow' + }, + { "conditions": [ + lambda x: "ME" in x, + lambda x: "GE" not in x, + lambda x: "(P)" not in x, + lambda x: "(S)" not in x, + lambda x: "FLOW" not in x, + lambda x: "CONSUMPTION" in x, + ], + "action": 'ME1Flow' + }, +] + +# GEFlow rules +GEFlow_rules = [ + { "conditions": [lambda x: "NO." not in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE1Flow' + }, + { "conditions": [ lambda x: "NO.1" in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE1Flow' + }, + + { "conditions": [ lambda x: "NO.2" in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE2Flow' + }, + { "conditions": [ lambda x: "NO.3" in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE3Flow' + }, + + { "conditions": [ lambda x: "NO." 
not in x, + lambda x: "GE" in x, + lambda x: "CONSUMPTION" in x, + + ], + "action": 'GE1Flow' + }, +] \ No newline at end of file diff --git a/post_process/rule_based_correction/post_processor_gawi.py b/post_process/rule_based_correction/post_processor_gawi.py new file mode 100644 index 0000000..6529a52 --- /dev/null +++ b/post_process/rule_based_correction/post_processor_gawi.py @@ -0,0 +1,233 @@ +# %% +import os +import re + +import pandas as pd +from sklearn.metrics import classification_report + +from post_processing_rules import shaft_rules, ME_rules, GEFlow_rules + + +# %% +# Function to print classification metrics +def print_classification_metrics(df): + print("Results before post-processing") + # print(classification_report(df["thing"], df["p_thing"].fillna(""))) + report = classification_report(df["thing"], df["p_thing"].fillna(""), output_dict=True) + # Extracting the weighted average values for precision, recall, and F1-score + precision_avg = report['weighted avg']['precision'] + recall_avg = report['weighted avg']['recall'] + f1_avg = report['weighted avg']['f1-score'] + # Print the averages + print("Average Precision:", precision_avg) + print("Average Recall:", recall_avg) + print("Average F1-Score:", f1_avg) + + print("**************") + + print("Results after post-processing") + # print(classification_report(df["thing"], df["edited_p_thing"].fillna(""))) + report = classification_report(df["thing"], df["edited_p_thing"].fillna(""), output_dict=True) + # Extracting the weighted average values for precision, recall, and F1-score + precision_avg = report['weighted avg']['precision'] + recall_avg = report['weighted avg']['recall'] + f1_avg = report['weighted avg']['f1-score'] + # Print the averages + print("Average Precision:", precision_avg) + print("Average Recall:", recall_avg) + print("Average F1-Score:", f1_avg) + + +def read_data(): + BASE_FOLDER = "fold" + BASE_FILE_NAME = "result_group" + NUM_FOLDS = 5 + + # List for storing all DataFrames + dataframes = [] + + # Iterate over all fold + for i in range(NUM_FOLDS): + fold_folder = f"{BASE_FOLDER}_{i+1}" + desired_file_name = f"{BASE_FILE_NAME}_{i+1}.csv" + for file_name in os.listdir(fold_folder): + if file_name==desired_file_name: + file_path = os.path.join(fold_folder, file_name) + + df = pd.read_csv(file_path, index_col=0) + dataframes.append(df) + + # Combine all DataFrames into one + final_dataframe = pd.concat(dataframes) + final_dataframe = final_dataframe[final_dataframe["MDM"]].reset_index(drop=True) + # assign a copy + final_dataframe["edited_p_thing"] = final_dataframe["p_thing"] + return final_dataframe + + +def update_shipboiler_p_thing(df, tag_column='tag_description', thing_pred_column='edited_p_thing'): + """ + Update the 'thing' column in the DataFrame based on rules applied to 'tag_description' column. + + Parameters: + - df: DataFrame to apply the function on. + - tag_column: Column name containing descriptions to base the logic on. + - thing_pred_column: Column to be updated with new values based on conditions. + + Returns: + - DataFrame with updated 'thing' column. 
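+
+    Notes:
+    - The rules overwrite `thing_pred_column`; NaNs in `tag_column` are first
+      replaced with '' so the string matching below does not fail on NaN.
+    - `str.contains` treats the pattern as a regular expression, so "NO.1"
+      also matches variants such as "NO 1" or "NO-1".
+    - Rows containing "COMPOSITE" are assigned ShipBoiler<max+1>, one above the
+      highest boiler number found in the non-composite rows.
+
+    Example (illustrative call with the default column names):
+    >>> df = update_shipboiler_p_thing(df)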
+ """ + # Fill NaN values in tag column to avoid errors with ~ operator + df[tag_column] = df[tag_column].fillna('') + + # Apply rules for "NO.1", "NO.2", and "AUXILIARY" in tag_column + df.loc[df[tag_column].str.contains("NO.1", case=False), thing_pred_column] = "ShipBoiler1" + df.loc[df[tag_column].str.contains("NO.2", case=False), thing_pred_column] = "ShipBoiler2" + df.loc[(df[tag_column].str.contains("AUXILIARY", case=False)) & (~df[tag_column].str.contains("NO.", case=False)), thing_pred_column] = "ShipBoiler1" + + # Determine the highest number for "thing" in rows without "COMPOSITE" and assign to "COMPOSITE" rows + max_boiler_number = df.loc[~df[tag_column].str.contains("COMPOSITE", case=False) & df[thing_pred_column].str.contains("ShipBoiler", na=False), thing_pred_column] \ + .str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1 + composite_boiler = f"ShipBoiler{int(max_boiler_number)}" + + # Apply composite_boiler to rows containing "COMPOSITE" + df.loc[df[tag_column].str.contains("COMPOSITE", case=False), thing_pred_column] = composite_boiler + + return df + + +def process_boiler_data(boiler_data): + updated_dataframes = [] + for _, group in boiler_data.groupby('ships_idx'): + group_copy = group.copy() + + group_copy["tag_description"] = group_copy["tag_description"].fillna('') + + contains_no1_cond = group_copy["tag_description"].str.contains("NO.1") + contains_no2_cond = group_copy["tag_description"].str.contains("NO.2") + contains_aux_cond = group_copy["tag_description"].str.contains("AUXILIARY") + doesnt_contain_no_cond = ~group_copy["tag_description"].str.contains("NO.") + contains_comp_cond = group_copy["tag_description"].str.contains("COMPOSITE") + contains_shipboiler_cond = group_copy["edited_p_thing"].str.contains("ShipBoiler") + + group_copy.loc[contains_no1_cond, "edited_p_thing"] = "ShipBoiler1" + group_copy.loc[contains_no2_cond, "edited_p_thing"] = "ShipBoiler2" + group_copy.loc[contains_aux_cond&doesnt_contain_no_cond, "edited_p_thing"] = "ShipBoiler1" + + if ((~contains_comp_cond) & (contains_shipboiler_cond)).any(): + max_boiler_number = group_copy.loc[(~contains_comp_cond)&(contains_shipboiler_cond), "edited_p_thing"].str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1 + composite_boiler = f"ShipBoiler{int(max_boiler_number)}" + + if max_boiler_number > 3: + max_boiler_number = 3 + + group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = composite_boiler + else: + group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = "ShipBoiler1" + + updated_dataframes.append(group_copy) # Collect updated group + + # Step 2: Concatenate all updated groups + updated_boiler_data = pd.concat(updated_dataframes) + return updated_boiler_data + + +def check_conditions(value,conditions): +# Check if a value satisfies all conditions + return all(condition(value) for condition in conditions) + + +def apply_rules(description, thing, rules): +#Processes the description according to the rule table and returns the replacement value if the condition is met, otherwise returns the thing value + for rule in rules: + if check_conditions(description, rule["conditions"]): #Check that all conditions are met + return rule["action"] #Execute the action and return the result + return thing #Returns the value of the thing column if it doesn't match any of the rules + + +# %% +# if __name__ == "__main__": + +# %% +# Read and preprocess data +final_dataframe = read_data() + +# %% +final_dataframe +# %% +# 
Hwanggawi main function
+#Get partial columns
+TP_df = final_dataframe.loc[:, ['thing', 'property','p_thing','p_property','tag_description','MDM']].copy()
+
+#Shaft
+SF_df = TP_df[TP_df['thing'].str.contains(('Shaft'), case=False, na=False)]
+SF_df_in_MDM = SF_df[(SF_df['MDM'])]
+
+#ME
+ME_df = TP_df[TP_df['thing'].str.contains(('ME1Flow'), case=False, na=False)|
+    TP_df['thing'].str.contains(('ME2Flow'), case=False, na=False)|TP_df['thing'].str.contains(('ME3Flow'), case=False, na=False)]
+ME_df_in_MDM = ME_df[(ME_df['MDM'])]
+
+#GE
+GE_df = TP_df[TP_df['thing'].str.contains(('GE1Flow'), case=False, na=False)|
+    TP_df['thing'].str.contains(('GE2Flow'), case=False, na=False)|TP_df['thing'].str.contains(('GE3Flow'), case=False, na=False)]
+GE_df_in_MDM = GE_df[(GE_df['MDM'])]
+
+SF_df_in_MDM['standardize_desc'] = SF_df_in_MDM['tag_description'].copy()
+GE_df_in_MDM['standardize_desc'] = GE_df_in_MDM['tag_description'].copy()
+ME_df_in_MDM['standardize_desc'] = ME_df_in_MDM['tag_description'].copy()
+
+# ShipBoiler class post-processing
+mdm = final_dataframe[final_dataframe["MDM"]].copy()
+boiler_data = mdm[mdm["thing"].str.contains("Boiler")].copy()
+
+blr_cond = boiler_data["tag_description"].str.lower().str.contains("blr")
+boiler_cond = boiler_data["tag_description"].str.lower().str.contains("boiler")
+boiler_data.shape[0]-(boiler_data[blr_cond].shape[0]+boiler_data[boiler_cond].shape[0])
+different_cond = boiler_data[~(blr_cond|boiler_cond)].copy()
+
+unique_ships_idxs = boiler_data["ships_idx"].unique()
+
+boiler_data["edited_p_thing"] = boiler_data["p_thing"]
+
+updated_boiler_data = process_boiler_data(boiler_data)
+
+# Save updated data back to the original DataFrame
+final_dataframe.loc[updated_boiler_data.index, "edited_p_thing"] = updated_boiler_data["edited_p_thing"]
+
+
+result = SF_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], shaft_rules),axis=1)
+SF_df_in_MDM['edited_p_thing'] = result
+
+# Save updated data back to the original DataFrame
+final_dataframe.loc[SF_df_in_MDM.index, "edited_p_thing"] = SF_df_in_MDM['edited_p_thing']
+
+
+result = ME_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], ME_rules),axis=1)
+ME_df_in_MDM['edited_p_thing'] = result
+final_dataframe.loc[ME_df_in_MDM.index, "edited_p_thing"] = ME_df_in_MDM['edited_p_thing']
+
+
+result = GE_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], GEFlow_rules),axis=1)
+GE_df_in_MDM['edited_p_thing'] = result
+final_dataframe.loc[GE_df_in_MDM.index, "edited_p_thing"] = GE_df_in_MDM['edited_p_thing']
+
+
+# output final dataframe
+final_dataframe.to_csv("post_processed_df.csv", index=False)
+print("Saved output to post_processed_df.csv")
+# %%
+# print results
+print("ShipBoiler post-processing results")
+print_classification_metrics(updated_boiler_data)
+print("----------------------------")
+
+print("ME post-processing results")
+print_classification_metrics(ME_df_in_MDM)
+print("----------------------------")
+
+print("GE post-processing results")
+print_classification_metrics(GE_df_in_MDM)
+print("----------------------------")
+
+# %%
diff --git a/end_to_end/.gitignore b/production/.gitignore
similarity index 100%
rename from end_to_end/.gitignore
rename to production/.gitignore
diff --git a/end_to_end/.README.md b/production/README.md
similarity index 100%
rename from end_to_end/.README.md
rename to production/README.md
diff --git a/end_to_end/deduplication.py b/production/end_to_end/deduplication.py
similarity index 
99% rename from end_to_end/deduplication.py rename to production/end_to_end/deduplication.py index 37073ad..802e468 100644 --- a/end_to_end/deduplication.py +++ b/production/end_to_end/deduplication.py @@ -265,7 +265,7 @@ def run_deduplication( print('generate train embeddings') train_embedder = Embedder(input_df=train_df, batch_size=batch_size) tensor = train_embedder.make_embedding(checkpoint_path) - torch.save(tensor, file_path, weights_only=True) + torch.save(tensor, file_path) print("Tensor saved to file.") train_embeds = tensor diff --git a/end_to_end/mapper.py b/production/end_to_end/mapper.py similarity index 100% rename from end_to_end/mapper.py rename to production/end_to_end/mapper.py diff --git a/production/end_to_end/post_processing_rules.py b/production/end_to_end/post_processing_rules.py new file mode 100644 index 0000000..c20bfeb --- /dev/null +++ b/production/end_to_end/post_processing_rules.py @@ -0,0 +1,158 @@ +# Shaft# class post-processing +shaft_rules = [ + { "conditions": [ + lambda x: "NO1" in x, + lambda x: "SHAFT" in x + ], + "action": 'Shaft1' + }, + { "conditions": [ + lambda x: "NO1" in x, + lambda x: "Shaft" in x + ], + "action": 'Shaft1' + }, + { "conditions": [ + lambda x: "NO2" in x, + lambda x: "Shaft" in x + ], + "action": 'Shaft2' + }, + { "conditions": [ + lambda x: "NO2" in x, + lambda x: "SHAFT" in x + ], + "action": 'Shaft2' + }, + + { "conditions": [ + lambda x: "NO1" not in x, + lambda x: "NO2" not in x, + lambda x: "SHAFT" not in x + ], + "action": 'Shaft1' + }, + + { "conditions": [ + lambda x: "NO1" not in x, + lambda x: "NO2" not in x, + lambda x: "SHAFT" in x, + lambda x: "(P)" in x + ], + "action": 'Shaft2' + }, + + { "conditions": [ + lambda x: "NO1" not in x, + lambda x: "NO2" not in x, + lambda x: "SHAFT" in x, + lambda x: "(S)" in x + ], + "action": 'Shaft1' + }, + + { "conditions": [ + lambda x: "NO1" not in x, + lambda x: "NO2" not in x, + lambda x: "SHAFT" in x, + lambda x: "(S)" not in x, + lambda x: "(P)" not in x + ], + "action": 'Shaft1' + }, +] + + +# ME# class post-processing +ME_rules = [ + { "conditions": [ + lambda x: "ME" in x, + lambda x: "(P)" not in x, + lambda x: "(S)" not in x, + lambda x: "GE" not in x, + lambda x: "FLOW" in x, + ], + "action": 'ME1Flow' + }, + { "conditions": [ + lambda x: "ME" in x, + lambda x: "(P)" in x, + lambda x: "FLOW" in x, + ], + "action": 'ME2Flow' + }, + + { "conditions": [ + lambda x: "ME" in x, + lambda x: "(S)" in x, + lambda x: "FLOW" in x, + ], + "action": 'ME1Flow' + }, + + + { "conditions": [ + lambda x: "ME" not in x, + lambda x: "GE" not in x, + lambda x: "(P)" not in x, + lambda x: "(S)" not in x, + lambda x: "FLOW" in x, + ], + "action": 'ME1Flow' + }, + { "conditions": [ + lambda x: "ME" in x, + lambda x: "GE" not in x, + lambda x: "(P)" not in x, + lambda x: "(S)" not in x, + lambda x: "FLOW" not in x, + lambda x: "CONSUMPTION" in x, + ], + "action": 'ME1Flow' + }, +] + +# GEFlow rules +GEFlow_rules = [ + { "conditions": [ + lambda x: "NO" not in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + ], + "action": 'GE1Flow' + }, + { "conditions": [ + lambda x: "NO1" in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE1Flow' + }, + + { "conditions": [ + lambda x: "NO2" in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE2Flow' + }, + { "conditions": [ + lambda x: "NO3" in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE3Flow' + }, + + { "conditions": [ + lambda x: "NO" not in x, + lambda x: "GE" in x, + lambda x: 
"CONSUMPTION" in x, + + ], + "action": 'GE1Flow' + }, +] \ No newline at end of file diff --git a/end_to_end/preprocess.py b/production/end_to_end/preprocess.py similarity index 97% rename from end_to_end/preprocess.py rename to production/end_to_end/preprocess.py index a21f61a..d85120c 100644 --- a/end_to_end/preprocess.py +++ b/production/end_to_end/preprocess.py @@ -1,6 +1,6 @@ # %% import re -from replacement_dict import desc_replacement_dict, unit_replacement_dict +from end_to_end.replacement_dict import desc_replacement_dict, unit_replacement_dict class Abbreviator: diff --git a/end_to_end/replacement_dict.py b/production/end_to_end/replacement_dict.py similarity index 100% rename from end_to_end/replacement_dict.py rename to production/end_to_end/replacement_dict.py diff --git a/production/end_to_end/rule_based_correction.py b/production/end_to_end/rule_based_correction.py new file mode 100644 index 0000000..1b74a0f --- /dev/null +++ b/production/end_to_end/rule_based_correction.py @@ -0,0 +1,122 @@ +import os +import re + +import pandas as pd + +from end_to_end.post_processing_rules import shaft_rules, ME_rules, GEFlow_rules + +class Corrector(): + + def __init__(self, df): + # copy over the existing + df['edited_p_thing'] = df['p_thing'].copy() + self.df = df + + + def _process_boiler_data(self, boiler_data): + updated_dataframes = [] + for _, group in boiler_data.groupby('ships_idx'): + group_copy = group.copy() + + group_copy["tag_description"] = group_copy["tag_description"].fillna('') + + contains_no1_cond = group_copy["tag_description"].str.contains("NO1") + contains_no2_cond = group_copy["tag_description"].str.contains("NO2") + contains_aux_cond = group_copy["tag_description"].str.contains("AUXILIARY") + doesnt_contain_no_cond = ~group_copy["tag_description"].str.contains("NO") + contains_comp_cond = group_copy["tag_description"].str.contains("COMPOSITE") + contains_shipboiler_cond = group_copy["edited_p_thing"].str.contains("ShipBoiler") + + group_copy.loc[contains_no1_cond, "edited_p_thing"] = "ShipBoiler1" + group_copy.loc[contains_no2_cond, "edited_p_thing"] = "ShipBoiler2" + group_copy.loc[contains_aux_cond&doesnt_contain_no_cond, "edited_p_thing"] = "ShipBoiler1" + + if ((~contains_comp_cond) & (contains_shipboiler_cond)).any(): + max_boiler_number = group_copy.loc[(~contains_comp_cond)&(contains_shipboiler_cond), "edited_p_thing"].str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1 + composite_boiler = f"ShipBoiler{int(max_boiler_number)}" + + if max_boiler_number > 3: + max_boiler_number = 3 + + group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = composite_boiler + else: + group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = "ShipBoiler1" + + updated_dataframes.append(group_copy) # Collect updated group + + # Step 2: Concatenate all updated groups + if (len(updated_dataframes) == 0): + return boiler_data + updated_boiler_data = pd.concat(updated_dataframes) + return updated_boiler_data + + def _check_conditions(self, value, conditions): + # Check if a value satisfies all conditions + return all(condition(value) for condition in conditions) + + def _apply_rules(self, description, thing, rules): + #Processes the description according to the rule table and returns the replacement value if the condition is met, otherwise returns the thing value + for rule in rules: + if self._check_conditions(description, rule["conditions"]): #Check that all conditions are met + return rule["action"] 
#Execute the action and return the result + return thing #Returns the value of the thing column if it doesn't match any of the rules + + def run_correction(self): + final_dataframe = self.df.copy() + # Hwanggawi main function + #Get partial columns + TP_df = final_dataframe.loc[:, ['thing', 'property','p_thing','p_property','tag_description','MDM']].copy() + + #Shaft + SF_df = TP_df[TP_df['thing'].str.contains(('Shaft'), case=False, na=False)] + SF_df_in_MDM = SF_df[(SF_df['MDM'])] + + #ME + ME_df = TP_df[TP_df['thing'].str.contains(('ME1Flow'), case=False, na=False)| + TP_df['thing'].str.contains(('ME2Flow'), case=False, na=False)| + TP_df['thing'].str.contains(('ME3Flow'), case=False, na=False)] + ME_df_in_MDM = ME_df[(ME_df['MDM'])] + + #GE + GE_df = TP_df[TP_df['thing'].str.contains(('GE1Flow'), case=False, na=False)| + TP_df['thing'].str.contains(('GE2Flow'), case=False, na=False)| + TP_df['thing'].str.contains(('GE3Flow'), case=False, na=False)] + GE_df_in_MDM = GE_df[(GE_df['MDM'])] + + SF_df_in_MDM['standardize_desc'] = SF_df_in_MDM['tag_description'].copy() + GE_df_in_MDM['standardize_desc'] = GE_df_in_MDM['tag_description'].copy() + ME_df_in_MDM['standardize_desc'] = ME_df_in_MDM['tag_description'].copy() + + # ShipBoiler class post-processing + mdm = final_dataframe[final_dataframe["MDM"]].copy() + boiler_data = mdm[mdm["thing"].str.contains("BOILER")].copy() + + # blr_cond = boiler_data["tag_description"].str.lower().str.contains("BOILER") + # boiler_cond = boiler_data["tag_description"].str.lower().str.contains("BOILER") + # boiler_data.shape[0]-(boiler_data[blr_cond].shape[0]+boiler_data[boiler_cond].shape[0]) + + # different_cond = boiler_data[~(blr_cond|boiler_cond)].copy() + + # unique_ships_idxs = boiler_data["ships_idx"].unique() + + boiler_data["edited_p_thing"] = boiler_data["p_thing"] + updated_boiler_data = self._process_boiler_data(boiler_data) + final_dataframe.loc[updated_boiler_data.index, "edited_p_thing"] = updated_boiler_data["edited_p_thing"] + + result = SF_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], shaft_rules),axis=1) + SF_df_in_MDM['edited_p_thing'] = result + final_dataframe.loc[SF_df_in_MDM.index, "edited_p_thing"] = SF_df_in_MDM['edited_p_thing'] + + result = ME_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], ME_rules),axis=1) + ME_df_in_MDM['edited_p_thing'] = result + final_dataframe.loc[ME_df_in_MDM.index, "edited_p_thing"] = ME_df_in_MDM['edited_p_thing'] + + + result = GE_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], GEFlow_rules),axis=1) + GE_df_in_MDM['edited_p_thing'] = result + final_dataframe.loc[GE_df_in_MDM.index, "edited_p_thing"] = GE_df_in_MDM['edited_p_thing'] + + # override p_thing with edited_p_thing + final_dataframe['p_thing'] = final_dataframe['edited_p_thing'].copy() + + return final_dataframe \ No newline at end of file diff --git a/end_to_end/run.py b/production/run.py similarity index 82% rename from end_to_end/run.py rename to production/run.py index df921ac..1474474 100644 --- a/end_to_end/run.py +++ b/production/run.py @@ -2,13 +2,16 @@ import pandas as pd import os import glob -from mapper import Mapper -from preprocess import Abbreviator -from deduplication import run_deduplication +from end_to_end.mapper import Mapper +from end_to_end.preprocess import Abbreviator +from end_to_end.deduplication import run_deduplication +from end_to_end.rule_based_correction import Corrector + # global config BATCH_SIZE = 512 SHIPS_LIST = 
[1000,1001,1003,1004] +# SHIPS_LIST = [1000] # %% # START: we import the raw data csv and extract only a few ships from it to simulate incoming json @@ -47,6 +50,12 @@ df_out = pd.DataFrame({ # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] df = pd.concat([df, df_out], axis=1) +# %% +################################### +# run rule-based correction +corrector = Corrector(df) +df = corrector.run_correction() + # %% #################################### @@ -59,7 +68,7 @@ df = run_deduplication( test_df=df, train_df=train_df, batch_size=BATCH_SIZE, - threshold=0.85, + threshold=0.9, diagnostic=True) # %% diff --git a/train/mapping_t5_final_delivery_model/.gitignore b/train/mapping_t5_final_delivery_model/.gitignore new file mode 100644 index 0000000..2e7f3f7 --- /dev/null +++ b/train/mapping_t5_final_delivery_model/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log/ diff --git a/train/mapping_t5_final_delivery_model/train.py b/train/mapping_t5_final_delivery_model/train.py new file mode 100644 index 0000000..7b6227e --- /dev/null +++ b/train/mapping_t5_final_delivery_model/train.py @@ -0,0 +1,196 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, + DataCollatorForSeq2Seq, + Seq2SeqTrainer, + EarlyStoppingCallback, + Seq2SeqTrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# outputs a list of dictionaries +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + +def create_split_dataset(): + # train + data_path = "../../data_preprocess/exports/preprocessed_data.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + train_df = train_df[train_df['MDM']].reset_index(drop=True) + + # valid + data_path = "../../data_preprocess/exports/dataset/group_1/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(): + save_path = 'checkpoint' + split_datasets = create_split_dataset() + + # prepare tokenizer + + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + 
max_length=max_length, + truncation=True, + padding=True + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns=split_datasets["train"].column_names, + ) + + # https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) + metric = evaluate.load("sacrebleu") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + decoded_preds = tokenizer.batch_decode(preds, + skip_special_tokens=False) + + # Replace -100s in the labels as we can't decode them + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=False) + + # Remove tokens from decoded predictions and labels + decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] + decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] + + # Some simple post-processing + # decoded_preds = [pred.strip() for pred in decoded_preds] + # decoded_labels = [[label.strip()] for label in decoded_labels] + # print(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": result["score"]} + + + # Generation Config + # from transformers import GenerationConfig + gen_config = model.generation_config + gen_config.max_length = 64 + + # compile + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # Trainer + + args = Seq2SeqTrainingArguments( + f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=128, + per_device_eval_batch_size=128, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=80, + predict_with_generate=True, + bf16=True, + push_to_hub=False, + generation_config=gen_config, + remove_unused_columns=False, + ) + + + trainer = Seq2SeqTrainer( + model, + args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() +
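+
+# minimal usage sketch for the trained model (the checkpoint path and sample input
+# below are illustrative; the real input string should be built exactly as in
+# process_df_to_dict, i.e. tag_description followed by unit):
+#   from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM
+#   tokenizer = T5TokenizerFast.from_pretrained('checkpoint/<final-checkpoint>')
+#   model = AutoModelForSeq2SeqLM.from_pretrained('checkpoint/<final-checkpoint>')
+#   ids = tokenizer('NO.1 G/E FUEL OIL FLOW' + 'kg/h', return_tensors='pt')
+#   pred = model.generate(**ids, max_length=64)
+#   print(tokenizer.decode(pred[0], skip_special_tokens=True))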