Feat: added rule-based post-processing

Others:
- added basic data analysis to get histograms of text differences
- added new final delivery model
Richard Wong 2024-12-18 13:43:56 +09:00
parent 481bcf88b7
commit c5760d127d
23 changed files with 1178 additions and 10 deletions

.gitignore vendored
View File

@ -1 +1,2 @@
*.zip
post_processor

View File

@ -7,6 +7,14 @@ full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(full_df['pattern']))))
# %%
full_df
# %%
mask1 = full_df['thing'] == 'ME1TurboCharger1'
mask2 = full_df['property'] == 'LOInletPress'
mask = mask1 & mask2
full_df[mask]
# %%
len(mdm_list)
# %%
@ -16,3 +24,12 @@ tp_list = sorted(list(set(thing_property)))
# %%
len(tp_list)
# %%
data_path = '../../data_import/exports/raw_data.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# %%
bad_df = df[~df['MDM']]
# %%
bad_df[bad_df['thing'] == '$UNMAPPED']
# %%

View File

@ -9,5 +9,5 @@ df = pd.read_csv(data_path)
df
# %%
-set(df['signal_type'])
+len(set(df['ships_idx']))
# %%

View File

@ -0,0 +1,71 @@
# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
# note: we assume that you will execute from the directory of this code
# check your current directory
print("Current Working Directory:", os.getcwd())
# %%
# plt.rcParams.update({'font.size': 18})
df = pd.read_csv('../../data_import/exports/raw_data.csv')
total_counts = df['ships_idx'].value_counts().sort_index()
mdm_true_counts = df[df['MDM']]['ships_idx'].value_counts().sort_index()
summary_df = pd.DataFrame({
'SD': total_counts,
'PD': mdm_true_counts
}).fillna(0)
total_SD = summary_df['SD'].sum()
total_PD = summary_df['PD'].sum()
print(f"Total SD: {total_SD}")
print(f"Total PD: {total_PD}")
# %%
plt.figure(figsize=(8, 6))
fig, ax = plt.subplots(figsize=(8, 6))
summary_df['SD'].plot(
kind='bar',
ax=ax,
color='orange',
alpha=0.5,
label='Ship Domain',
width=0.8)
summary_df['PD'].plot(
kind='bar',
ax=ax,
color='blue',
alpha=0.7,
label='Platform Domain',
width=0.8)
x_labels = ax.get_xticks()
ax.set_xticks(np.arange(min(x_labels), max(x_labels) + 1, 10))
ax.set_xticklabels(
[int(label) for label in np.arange(min(x_labels), max(x_labels) + 1, 10)],
rotation=0,
)
ax.grid(True)
# plt.legend(prop={'size': 18})
plt.legend()
plt.ylabel('Counts')
plt.xlabel('Ships')
plt.savefig('count_statistics_of_each_ship.png')
plt.show()
# %%

View File

@ -0,0 +1,38 @@
# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
# note: we assume that you will execute from the directory of this code
# check your current directory
print("Current Working Directory:", os.getcwd())
# %%
data_path = "../../data_preprocess/exports/combined_group_allocation.csv"
df = pd.read_csv(data_path)
# %%
df
# %%
print('mean', df[df['Allocation'] == 'BGKF']['Comb_count'].mean())
print('std', df[df['Allocation'] == 'BGKF']['Comb_count'].std())
max = df[df['Allocation'] == 'BGKF']['Comb_count'].max()
min = df[df['Allocation'] == 'BGKF']['Comb_count'].min()
print('max', max)
print('min', min)
print('max - min', max - min)
# %%
print('mean', df[df['Allocation'] == 'GKF']['Comb_count'].mean())
print('std', df[df['Allocation'] == 'GKF']['Comb_count'].std())
max = df[df['Allocation'] == 'GKF']['Comb_count'].max()
min = df[df['Allocation'] == 'GKF']['Comb_count'].min()
print('max', max)
print('min', min)
print('max - min', max - min)
# %%

View File

@ -0,0 +1,72 @@
# %%
import pandas as pd
import Levenshtein
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(df['pattern']))))
# %%
df['thing_property'] = df['thing'] + df['property']
# %%
def compute_norm_leven(string1, string2):
max_distance = max(len(string1), len(string2))
leven_distance = Levenshtein.distance(string1, string2)
norm_leven = leven_distance / max_distance
return norm_leven
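# %%
# Quick sanity check of the normalization (illustrative strings, not from the dataset):
# "PUMP" vs "PUMS" differ by one edit over a maximum length of 4, so the
# normalized distance should be 0.25; identical strings should give 0.0.
assert compute_norm_leven("PUMP", "PUMS") == 0.25
assert compute_norm_leven("PUMP", "PUMP") == 0.0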
# %%
n = len(df)
distance_array = np.zeros((n), dtype=float)
desc_array = df['tag_description']
thing_property_array = df['thing_property']
# %%
# compute normalized levenshtein distance
for i in tqdm(range(n)):
string1 = desc_array[i]
string2 = thing_property_array[i]
distance_array[i] = compute_norm_leven(string1, string2)
# %%
distance_array
# %%
plt.figure(figsize=(8, 6))
plt.hist(distance_array, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)
#
# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):
return {
"Mean": np.mean(arr),
"Median": np.median(arr),
"Standard Deviation": np.std(arr),
"Variance": np.var(arr),
"Min": np.min(arr),
"Max": np.max(arr),
"Range": np.ptp(arr),
"25th Percentile": np.percentile(arr, 25),
"75th Percentile": np.percentile(arr, 75),
"Sum": np.sum(arr),
}
stats = summary_stats(distance_array)
for key, value in stats.items():
print(f"{key}: {value}")
# %%

View File

@ -0,0 +1,88 @@
# %%
import pandas as pd
import Levenshtein
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
df['thing_property'] = df['thing'] + df['property']
mdm_list = sorted(list((set(df['thing_property']))))
# %%
def compute_norm_leven(string1, string2):
max_distance = max(len(string1), len(string2))
leven_distance = Levenshtein.distance(string1, string2)
norm_leven = leven_distance / max_distance
return norm_leven
def compute_avg_score(strings):
n = len(strings)
# if group only has 1 string, then it is fully similar to itself
if n == 1:
return 0
# Create an empty matrix
distance_matrix = np.zeros((n, n), dtype=float)
# Fill only the upper triangular part
for i in range(n):
for j in range(i + 1, n):
dist = compute_norm_leven(strings[i], strings[j])
distance_matrix[i, j] = dist
upper_triangular_distances = distance_matrix[np.triu_indices(n, k=1)]
mean_distance = np.mean(upper_triangular_distances)
return mean_distance
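# %%
# Illustrative check of the pairwise averaging (hypothetical strings): for
# ["ABCD", "ABCE", "ABCF"] every pair differs by one edit over length 4, so
# all three pairwise normalized distances are 0.25 and the mean is 0.25.
example_strings = np.array(["ABCD", "ABCE", "ABCF"])
print(compute_avg_score(example_strings))  # expected: 0.25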
# %%
# we want to subset to each class
n = len(mdm_list)
score_list = np.zeros((n), dtype=float)
for i in range(n):
df_subset = df[df['thing_property'] == mdm_list[i]]
strings = df_subset['tag_description'].to_numpy()
score_list[i] = compute_avg_score(strings)
# %%
score_list
# %%
# plt.hist(score_list, bins=50)
plt.figure(figsize=(8, 6))
plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Platform Domain Class Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)
# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):
return {
"Mean": np.mean(arr),
"Median": np.median(arr),
"Standard Deviation": np.std(arr),
"Variance": np.var(arr),
"Min": np.min(arr),
"Max": np.max(arr),
"Range": np.ptp(arr),
"25th Percentile": np.percentile(arr, 25),
"75th Percentile": np.percentile(arr, 75),
"Sum": np.sum(arr),
}
stats = summary_stats(score_list)
for key, value in stats.items():
print(f"{key}: {value}")
# %%

View File

@ -181,6 +181,7 @@ remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values
# Allocate remaining ships to the groups
while len(remaining_ships) > 0:
# re-compute the counts for each group
group_comb_counts = []
for g in range(num_groups):
group_ships = groups[g]
@ -190,18 +191,21 @@ while len(remaining_ships) > 0:
group_comb_counts.sort(key=lambda x: x[1])
# reset the remaining_group list
remaining_group = []
# g is the identifier for the group
for g, _ in group_comb_counts:
if len(remaining_ships) == 0:
break
# compute for each group, the selected ship, and the combined count increase
if group_comb_counts.index((g, _)) == 0:
selected_ship_idx, comb_increase = find_max_increase_ship(groups, g, remaining_ships, mdm_true)
else:
max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true)
selected_ship_idx, comb_increase = find_closest_comb_count_ship(groups, g, remaining_ships, mdm_true, max_comb_count)
# if the combined increase is 0, then we process it in a special manner
if comb_increase == 0:
remaining_group.append(g)
else:

View File

@ -2,7 +2,7 @@
# %%
import pandas as pd
import os
-from inference import Inference, Embedder_t5_encoder, Embedder_t5_decoder
+from inference import Inference, Embedder_t5_encoder
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

View File

@ -0,0 +1,3 @@
*.csv
fold_*
__pycache__

View File

@ -0,0 +1,154 @@
# Shaft# class post-processing
shaft_rules = [
{ "conditions": [
lambda x: "NO.1" in x,
lambda x: "SHAFT" in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO.1" in x,
lambda x: "Shaft" in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO.2" in x,
lambda x: "Shaft" in x
],
"action": 'Shaft2'
},
{ "conditions": [
lambda x: "NO.2" in x,
lambda x: "SHAFT" in x
],
"action": 'Shaft2'
},
{ "conditions": [
lambda x: "NO.1" not in x,
lambda x: "NO.2" not in x,
lambda x: "SHAFT" not in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO.1" not in x,
lambda x: "NO.2" not in x,
lambda x: "SHAFT" in x,
lambda x: "(P)" in x
],
"action": 'Shaft2'
},
{ "conditions": [
lambda x: "NO.1" not in x,
lambda x: "NO.2" not in x,
lambda x: "SHAFT" in x,
lambda x: "(S)" in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO.1" not in x,
lambda x: "NO.2" not in x,
lambda x: "SHAFT" in x,
lambda x: "(S)" not in x,
lambda x: "(P)" not in x
],
"action": 'Shaft1'
},
]
# ME# class post-processing
ME_rules = [
{ "conditions": [
lambda x: "ME" in x,
lambda x: "(P)" not in x,
lambda x: "(S)" not in x,
lambda x: "GE" not in x,
lambda x: "FLOW" in x,
],
"action": 'ME1Flow'
},
{ "conditions": [
lambda x: "ME" in x,
lambda x: "(P)" in x,
lambda x: "FLOW" in x,
],
"action": 'ME2Flow'
},
{ "conditions": [
lambda x: "ME" in x,
lambda x: "(S)" in x,
lambda x: "FLOW" in x,
],
"action": 'ME1Flow'
},
{ "conditions": [
lambda x: "ME" not in x,
lambda x: "GE" not in x,
lambda x: "(P)" not in x,
lambda x: "(S)" not in x,
lambda x: "FLOW" in x,
],
"action": 'ME1Flow'
},
{ "conditions": [
lambda x: "ME" in x,
lambda x: "GE" not in x,
lambda x: "(P)" not in x,
lambda x: "(S)" not in x,
lambda x: "FLOW" not in x,
lambda x: "CONSUMPTION" in x,
],
"action": 'ME1Flow'
},
]
# GEFlow rules
GEFlow_rules = [
{ "conditions": [lambda x: "NO." not in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE1Flow'
},
{ "conditions": [ lambda x: "NO.1" in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE1Flow'
},
{ "conditions": [ lambda x: "NO.2" in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE2Flow'
},
{ "conditions": [ lambda x: "NO.3" in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE3Flow'
},
{ "conditions": [ lambda x: "NO." not in x,
lambda x: "GE" in x,
lambda x: "CONSUMPTION" in x,
],
"action": 'GE1Flow'
},
]
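# A rule fires only when every lambda in its "conditions" list is satisfied by
# the (standardized) tag description; the first matching rule wins and its
# "action" replaces the predicted thing. A minimal sketch of that consumer is
# shown below for illustration; the evaluation script defines the actual
# apply_rules used in this commit.
def _apply_first_matching_rule(description, fallback, rules):
    for rule in rules:
        if all(condition(description) for condition in rule["conditions"]):
            return rule["action"]
    return fallback
# e.g. _apply_first_matching_rule("NO.1 SHAFT TORQUE", "Shaft2", shaft_rules)
# would return 'Shaft1' (hypothetical description string).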

View File

@ -0,0 +1,233 @@
# %%
import os
import re
import pandas as pd
from sklearn.metrics import classification_report
from post_processing_rules import shaft_rules, ME_rules, GEFlow_rules
# %%
# Function to print classification metrics
def print_classification_metrics(df):
print("Results before post-processing")
# print(classification_report(df["thing"], df["p_thing"].fillna("")))
report = classification_report(df["thing"], df["p_thing"].fillna(""), output_dict=True)
# Extracting the weighted average values for precision, recall, and F1-score
precision_avg = report['weighted avg']['precision']
recall_avg = report['weighted avg']['recall']
f1_avg = report['weighted avg']['f1-score']
# Print the averages
print("Average Precision:", precision_avg)
print("Average Recall:", recall_avg)
print("Average F1-Score:", f1_avg)
print("**************")
print("Results after post-processing")
# print(classification_report(df["thing"], df["edited_p_thing"].fillna("")))
report = classification_report(df["thing"], df["edited_p_thing"].fillna(""), output_dict=True)
# Extracting the weighted average values for precision, recall, and F1-score
precision_avg = report['weighted avg']['precision']
recall_avg = report['weighted avg']['recall']
f1_avg = report['weighted avg']['f1-score']
# Print the averages
print("Average Precision:", precision_avg)
print("Average Recall:", recall_avg)
print("Average F1-Score:", f1_avg)
def read_data():
BASE_FOLDER = "fold"
BASE_FILE_NAME = "result_group"
NUM_FOLDS = 5
# List for storing all DataFrames
dataframes = []
# Iterate over all fold
for i in range(NUM_FOLDS):
fold_folder = f"{BASE_FOLDER}_{i+1}"
desired_file_name = f"{BASE_FILE_NAME}_{i+1}.csv"
for file_name in os.listdir(fold_folder):
if file_name==desired_file_name:
file_path = os.path.join(fold_folder, file_name)
df = pd.read_csv(file_path, index_col=0)
dataframes.append(df)
# Combine all DataFrames into one
final_dataframe = pd.concat(dataframes)
final_dataframe = final_dataframe[final_dataframe["MDM"]].reset_index(drop=True)
# assign a copy
final_dataframe["edited_p_thing"] = final_dataframe["p_thing"]
return final_dataframe
def update_shipboiler_p_thing(df, tag_column='tag_description', thing_pred_column='edited_p_thing'):
"""
Update the predicted thing column based on rules applied to the tag description column.
Parameters:
- df: DataFrame to apply the function on.
- tag_column: Column name containing descriptions to base the logic on.
- thing_pred_column: Prediction column to be updated with new values based on conditions.
Returns:
- DataFrame with the updated prediction column.
"""
# Fill NaN values in tag column to avoid errors with ~ operator
df[tag_column] = df[tag_column].fillna('')
# Apply rules for "NO.1", "NO.2", and "AUXILIARY" in tag_column
df.loc[df[tag_column].str.contains("NO.1", case=False), thing_pred_column] = "ShipBoiler1"
df.loc[df[tag_column].str.contains("NO.2", case=False), thing_pred_column] = "ShipBoiler2"
df.loc[(df[tag_column].str.contains("AUXILIARY", case=False)) & (~df[tag_column].str.contains("NO.", case=False)), thing_pred_column] = "ShipBoiler1"
# Determine the highest number for "thing" in rows without "COMPOSITE" and assign to "COMPOSITE" rows
max_boiler_number = df.loc[~df[tag_column].str.contains("COMPOSITE", case=False) & df[thing_pred_column].str.contains("ShipBoiler", na=False), thing_pred_column] \
.str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1
composite_boiler = f"ShipBoiler{int(max_boiler_number)}"
# Apply composite_boiler to rows containing "COMPOSITE"
df.loc[df[tag_column].str.contains("COMPOSITE", case=False), thing_pred_column] = composite_boiler
return df
def process_boiler_data(boiler_data):
updated_dataframes = []
for _, group in boiler_data.groupby('ships_idx'):
group_copy = group.copy()
group_copy["tag_description"] = group_copy["tag_description"].fillna('')
contains_no1_cond = group_copy["tag_description"].str.contains("NO.1")
contains_no2_cond = group_copy["tag_description"].str.contains("NO.2")
contains_aux_cond = group_copy["tag_description"].str.contains("AUXILIARY")
doesnt_contain_no_cond = ~group_copy["tag_description"].str.contains("NO.")
contains_comp_cond = group_copy["tag_description"].str.contains("COMPOSITE")
contains_shipboiler_cond = group_copy["edited_p_thing"].str.contains("ShipBoiler")
group_copy.loc[contains_no1_cond, "edited_p_thing"] = "ShipBoiler1"
group_copy.loc[contains_no2_cond, "edited_p_thing"] = "ShipBoiler2"
group_copy.loc[contains_aux_cond&doesnt_contain_no_cond, "edited_p_thing"] = "ShipBoiler1"
if ((~contains_comp_cond) & (contains_shipboiler_cond)).any():
max_boiler_number = group_copy.loc[(~contains_comp_cond)&(contains_shipboiler_cond), "edited_p_thing"].str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1
# cap the boiler index before building the label so the cap actually takes effect
if max_boiler_number > 3:
max_boiler_number = 3
composite_boiler = f"ShipBoiler{int(max_boiler_number)}"
group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = composite_boiler
else:
group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = "ShipBoiler1"
updated_dataframes.append(group_copy) # Collect updated group
# Step 2: Concatenate all updated groups
updated_boiler_data = pd.concat(updated_dataframes)
return updated_boiler_data
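# Worked example (hypothetical ship group): if the non-COMPOSITE rows already
# carry ShipBoiler1 and ShipBoiler2 in edited_p_thing, the trailing-digit
# extraction gives 2, so COMPOSITE rows are assigned ShipBoiler3; a group with
# no ShipBoiler prediction at all falls back to ShipBoiler1 for COMPOSITE rows.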
def check_conditions(value,conditions):
# Check if a value satisfies all conditions
return all(condition(value) for condition in conditions)
def apply_rules(description, thing, rules):
#Processes the description according to the rule table and returns the replacement value if the condition is met, otherwise returns the thing value
for rule in rules:
if check_conditions(description, rule["conditions"]): #Check that all conditions are met
return rule["action"] #Execute the action and return the result
return thing #Returns the value of the thing column if it doesn't match any of the rules
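# Illustrative usage (hypothetical values): apply_rules("NO.1 GE MGO FLOW",
# "GE3Flow", GEFlow_rules) returns 'GE1Flow', while a description matching no
# rule leaves the model prediction ("GE3Flow" here) untouched.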
# %%
# if __name__ == "__main__":
# %%
# Read and preprocess data
final_dataframe = read_data()
# %%
final_dataframe
# %%
# Hwanggawi main function
#Get partial columns
TP_df = final_dataframe.loc[:, ['thing', 'property','p_thing','p_property','tag_description','MDM']].copy()
#Shaft
SF_df = TP_df[TP_df['thing'].str.contains(('Shaft'), case=False, na=False)]
SF_df_in_MDM = SF_df[(SF_df['MDM'])]
#ME
ME_df = TP_df[TP_df['thing'].str.contains(('ME1Flow'), case=False, na=False)|
TP_df['thing'].str.contains(('ME2Flow'), case=False, na=False)|TP_df['thing'].str.contains(('ME3Flow'), case=False, na=False)]
ME_df_in_MDM = ME_df[(ME_df['MDM'])]
#GE
GE_df = TP_df[TP_df['thing'].str.contains(('GE1Flow'), case=False, na=False)|
TP_df['thing'].str.contains(('GE2Flow'), case=False, na=False)|TP_df['thing'].str.contains(('GE3Flow'), case=False, na=False)]
GE_df_in_MDM = GE_df[(GE_df['MDM'])]
SF_df_in_MDM['standardize_desc'] = SF_df_in_MDM['tag_description'].copy()
GE_df_in_MDM['standardize_desc'] = GE_df_in_MDM['tag_description'].copy()
ME_df_in_MDM['standardize_desc'] = ME_df_in_MDM['tag_description'].copy()
# ShipBoiler class post-processing
mdm = final_dataframe[final_dataframe["MDM"]].copy()
boiler_data = mdm[mdm["thing"].str.contains("Boiler")].copy()
blr_cond = boiler_data["tag_description"].str.lower().str.contains("blr")
boiler_cond = boiler_data["tag_description"].str.lower().str.contains("boiler")
boiler_data.shape[0]-(boiler_data[blr_cond].shape[0]+boiler_data[boiler_cond].shape[0])
different_cond = boiler_data[~(blr_cond|boiler_cond)].copy()
unique_ships_idxs = boiler_data["ships_idx"].unique()
boiler_data["edited_p_thing"] = boiler_data["p_thing"]
updated_boiler_data = process_boiler_data(boiler_data)
# Save updated data back to the original DataFrame
final_dataframe.loc[updated_boiler_data.index, "edited_p_thing"] = updated_boiler_data["edited_p_thing"]
result = SF_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], shaft_rules),axis=1)
SF_df_in_MDM['edited_p_thing'] = result
# Save updated data back to the original DataFrame
final_dataframe.loc[SF_df_in_MDM.index, "edited_p_thing"] = SF_df_in_MDM['edited_p_thing']
result = ME_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], ME_rules),axis=1)
ME_df_in_MDM['edited_p_thing'] = result
final_dataframe.loc[ME_df_in_MDM.index, "edited_p_thing"] = ME_df_in_MDM['edited_p_thing']
result = GE_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], GEFlow_rules),axis=1)
GE_df_in_MDM['edited_p_thing'] = result
final_dataframe.loc[GE_df_in_MDM.index, "edited_p_thing"] = GE_df_in_MDM['edited_p_thing']
# output final dataframe
final_dataframe.to_csv("post_processed_df.csv", index=False)
print("Saved output to post_processed_df.csv")
# %%
# print results
print("ShipBoiler post-processing results")
print_classification_metrics(updated_boiler_data)
print("----------------------------")
print("ShipBoiler post-processing results")
print_classification_metrics(ME_df_in_MDM)
print("----------------------------")
print("ShipBoiler post-processing results")
print_classification_metrics(GE_df_in_MDM)
print("----------------------------")
# %%

View File

@ -265,7 +265,7 @@ def run_deduplication(
print('generate train embeddings')
train_embedder = Embedder(input_df=train_df, batch_size=batch_size)
tensor = train_embedder.make_embedding(checkpoint_path)
-torch.save(tensor, file_path, weights_only=True)
+torch.save(tensor, file_path)
print("Tensor saved to file.")
train_embeds = tensor

View File

@ -0,0 +1,158 @@
# Shaft# class post-processing
shaft_rules = [
{ "conditions": [
lambda x: "NO1" in x,
lambda x: "SHAFT" in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO1" in x,
lambda x: "Shaft" in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO2" in x,
lambda x: "Shaft" in x
],
"action": 'Shaft2'
},
{ "conditions": [
lambda x: "NO2" in x,
lambda x: "SHAFT" in x
],
"action": 'Shaft2'
},
{ "conditions": [
lambda x: "NO1" not in x,
lambda x: "NO2" not in x,
lambda x: "SHAFT" not in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO1" not in x,
lambda x: "NO2" not in x,
lambda x: "SHAFT" in x,
lambda x: "(P)" in x
],
"action": 'Shaft2'
},
{ "conditions": [
lambda x: "NO1" not in x,
lambda x: "NO2" not in x,
lambda x: "SHAFT" in x,
lambda x: "(S)" in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO1" not in x,
lambda x: "NO2" not in x,
lambda x: "SHAFT" in x,
lambda x: "(S)" not in x,
lambda x: "(P)" not in x
],
"action": 'Shaft1'
},
]
# ME# class post-processing
ME_rules = [
{ "conditions": [
lambda x: "ME" in x,
lambda x: "(P)" not in x,
lambda x: "(S)" not in x,
lambda x: "GE" not in x,
lambda x: "FLOW" in x,
],
"action": 'ME1Flow'
},
{ "conditions": [
lambda x: "ME" in x,
lambda x: "(P)" in x,
lambda x: "FLOW" in x,
],
"action": 'ME2Flow'
},
{ "conditions": [
lambda x: "ME" in x,
lambda x: "(S)" in x,
lambda x: "FLOW" in x,
],
"action": 'ME1Flow'
},
{ "conditions": [
lambda x: "ME" not in x,
lambda x: "GE" not in x,
lambda x: "(P)" not in x,
lambda x: "(S)" not in x,
lambda x: "FLOW" in x,
],
"action": 'ME1Flow'
},
{ "conditions": [
lambda x: "ME" in x,
lambda x: "GE" not in x,
lambda x: "(P)" not in x,
lambda x: "(S)" not in x,
lambda x: "FLOW" not in x,
lambda x: "CONSUMPTION" in x,
],
"action": 'ME1Flow'
},
]
# GEFlow rules
GEFlow_rules = [
{ "conditions": [
lambda x: "NO" not in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE1Flow'
},
{ "conditions": [
lambda x: "NO1" in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE1Flow'
},
{ "conditions": [
lambda x: "NO2" in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE2Flow'
},
{ "conditions": [
lambda x: "NO3" in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE3Flow'
},
{ "conditions": [
lambda x: "NO" not in x,
lambda x: "GE" in x,
lambda x: "CONSUMPTION" in x,
],
"action": 'GE1Flow'
},
]

View File

@ -1,6 +1,6 @@
# %%
import re
-from replacement_dict import desc_replacement_dict, unit_replacement_dict
+from end_to_end.replacement_dict import desc_replacement_dict, unit_replacement_dict
class Abbreviator:

View File

@ -0,0 +1,122 @@
import os
import re
import pandas as pd
from end_to_end.post_processing_rules import shaft_rules, ME_rules, GEFlow_rules
class Corrector():
def __init__(self, df):
# copy over the existing
df['edited_p_thing'] = df['p_thing'].copy()
self.df = df
def _process_boiler_data(self, boiler_data):
updated_dataframes = []
for _, group in boiler_data.groupby('ships_idx'):
group_copy = group.copy()
group_copy["tag_description"] = group_copy["tag_description"].fillna('')
contains_no1_cond = group_copy["tag_description"].str.contains("NO1")
contains_no2_cond = group_copy["tag_description"].str.contains("NO2")
contains_aux_cond = group_copy["tag_description"].str.contains("AUXILIARY")
doesnt_contain_no_cond = ~group_copy["tag_description"].str.contains("NO")
contains_comp_cond = group_copy["tag_description"].str.contains("COMPOSITE")
contains_shipboiler_cond = group_copy["edited_p_thing"].str.contains("ShipBoiler")
group_copy.loc[contains_no1_cond, "edited_p_thing"] = "ShipBoiler1"
group_copy.loc[contains_no2_cond, "edited_p_thing"] = "ShipBoiler2"
group_copy.loc[contains_aux_cond&doesnt_contain_no_cond, "edited_p_thing"] = "ShipBoiler1"
if ((~contains_comp_cond) & (contains_shipboiler_cond)).any():
max_boiler_number = group_copy.loc[(~contains_comp_cond)&(contains_shipboiler_cond), "edited_p_thing"].str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1
# cap the boiler index before building the label so the cap actually takes effect
if max_boiler_number > 3:
max_boiler_number = 3
composite_boiler = f"ShipBoiler{int(max_boiler_number)}"
group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = composite_boiler
else:
group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = "ShipBoiler1"
updated_dataframes.append(group_copy) # Collect updated group
# Step 2: Concatenate all updated groups
if (len(updated_dataframes) == 0):
return boiler_data
updated_boiler_data = pd.concat(updated_dataframes)
return updated_boiler_data
def _check_conditions(self, value, conditions):
# Check if a value satisfies all conditions
return all(condition(value) for condition in conditions)
def _apply_rules(self, description, thing, rules):
#Processes the description according to the rule table and returns the replacement value if the condition is met, otherwise returns the thing value
for rule in rules:
if self._check_conditions(description, rule["conditions"]): #Check that all conditions are met
return rule["action"] #Execute the action and return the result
return thing #Returns the value of the thing column if it doesn't match any of the rules
def run_correction(self):
final_dataframe = self.df.copy()
# Hwanggawi main function
#Get partial columns
TP_df = final_dataframe.loc[:, ['thing', 'property','p_thing','p_property','tag_description','MDM']].copy()
#Shaft
SF_df = TP_df[TP_df['thing'].str.contains(('Shaft'), case=False, na=False)]
SF_df_in_MDM = SF_df[(SF_df['MDM'])]
#ME
ME_df = TP_df[TP_df['thing'].str.contains(('ME1Flow'), case=False, na=False)|
TP_df['thing'].str.contains(('ME2Flow'), case=False, na=False)|
TP_df['thing'].str.contains(('ME3Flow'), case=False, na=False)]
ME_df_in_MDM = ME_df[(ME_df['MDM'])]
#GE
GE_df = TP_df[TP_df['thing'].str.contains(('GE1Flow'), case=False, na=False)|
TP_df['thing'].str.contains(('GE2Flow'), case=False, na=False)|
TP_df['thing'].str.contains(('GE3Flow'), case=False, na=False)]
GE_df_in_MDM = GE_df[(GE_df['MDM'])]
SF_df_in_MDM['standardize_desc'] = SF_df_in_MDM['tag_description'].copy()
GE_df_in_MDM['standardize_desc'] = GE_df_in_MDM['tag_description'].copy()
ME_df_in_MDM['standardize_desc'] = ME_df_in_MDM['tag_description'].copy()
# ShipBoiler class post-processing
mdm = final_dataframe[final_dataframe["MDM"]].copy()
boiler_data = mdm[mdm["thing"].str.contains("BOILER")].copy()
# blr_cond = boiler_data["tag_description"].str.lower().str.contains("BOILER")
# boiler_cond = boiler_data["tag_description"].str.lower().str.contains("BOILER")
# boiler_data.shape[0]-(boiler_data[blr_cond].shape[0]+boiler_data[boiler_cond].shape[0])
# different_cond = boiler_data[~(blr_cond|boiler_cond)].copy()
# unique_ships_idxs = boiler_data["ships_idx"].unique()
boiler_data["edited_p_thing"] = boiler_data["p_thing"]
updated_boiler_data = self._process_boiler_data(boiler_data)
final_dataframe.loc[updated_boiler_data.index, "edited_p_thing"] = updated_boiler_data["edited_p_thing"]
result = SF_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], shaft_rules),axis=1)
SF_df_in_MDM['edited_p_thing'] = result
final_dataframe.loc[SF_df_in_MDM.index, "edited_p_thing"] = SF_df_in_MDM['edited_p_thing']
result = ME_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], ME_rules),axis=1)
ME_df_in_MDM['edited_p_thing'] = result
final_dataframe.loc[ME_df_in_MDM.index, "edited_p_thing"] = ME_df_in_MDM['edited_p_thing']
result = GE_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], GEFlow_rules),axis=1)
GE_df_in_MDM['edited_p_thing'] = result
final_dataframe.loc[GE_df_in_MDM.index, "edited_p_thing"] = GE_df_in_MDM['edited_p_thing']
# override p_thing with edited_p_thing
final_dataframe['p_thing'] = final_dataframe['edited_p_thing'].copy()
return final_dataframe

View File

@ -2,13 +2,16 @@
import pandas as pd
import os
import glob
-from mapper import Mapper
+from end_to_end.mapper import Mapper
-from preprocess import Abbreviator
+from end_to_end.preprocess import Abbreviator
-from deduplication import run_deduplication
+from end_to_end.deduplication import run_deduplication
from end_to_end.rule_based_correction import Corrector
# global config
BATCH_SIZE = 512
SHIPS_LIST = [1000,1001,1003,1004]
# SHIPS_LIST = [1000]
# %%
# START: we import the raw data csv and extract only a few ships from it to simulate incoming json
@ -47,6 +50,12 @@ df_out = pd.DataFrame({
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
df = pd.concat([df, df_out], axis=1)
# %%
###################################
# run rule-based correction
corrector = Corrector(df)
df = corrector.run_correction()
# %%
####################################
@ -59,7 +68,7 @@ df = run_deduplication(
test_df=df,
train_df=train_df,
batch_size=BATCH_SIZE,
-threshold=0.85,
+threshold=0.9,
diagnostic=True)
# %%
# %% # %%

View File

@ -0,0 +1,2 @@
checkpoint*
tensorboard-log/

View File

@ -0,0 +1,196 @@
# %%
# from datasets import load_from_disk
import os
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from transformers import (
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
element = {
'input' : f"{desc}{unit}",
'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
}
output_list.append(element)
return output_list
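# Illustrative element for a hypothetical row with tag_description "ME1 FO FLOW",
# unit "L/h", thing "ME1Flow" and property "FGMassFlow":
# {'input': '<DESC>ME1 FO FLOW<DESC><UNIT>L/h<UNIT>',
#  'output': '<THING_START>ME1Flow<THING_END><PROPERTY_START>FGMassFlow<PROPERTY_END>'}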
def create_split_dataset():
# train
data_path = "../../data_preprocess/exports/preprocessed_data.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
train_df = train_df[train_df['MDM']].reset_index(drop=True)
# valid
data_path = "../../data_preprocess/exports/dataset/group_1/valid.csv"
validation_df = pd.read_csv(data_path, skipinitialspace=True)
combined_data = DatasetDict({
'train': Dataset.from_list(process_df_to_dict(train_df)),
'validation' : Dataset.from_list(process_df_to_dict(validation_df)),
})
return combined_data
# function to perform training for a given fold
def train():
save_path = 'checkpoint'
split_datasets = create_split_dataset()
# prepare tokenizer
model_checkpoint = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
max_length = 120
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['input']
target = example['output']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
text_target=target,
max_length=max_length,
truncation=True,
padding=True
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns=split_datasets["train"].column_names,
)
# https://github.com/huggingface/transformers/pull/28414
# model_checkpoint = "google/t5-efficient-tiny"
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")
def compute_metrics(eval_preds):
preds, labels = eval_preds
# In case the model returns more than the prediction logits
if isinstance(preds, tuple):
preds = preds[0]
decoded_preds = tokenizer.batch_decode(preds,
skip_special_tokens=False)
# Replace -100s in the labels as we can't decode them
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels,
skip_special_tokens=False)
# Remove <PAD> tokens from decoded predictions and labels
decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
# Some simple post-processing
# decoded_preds = [pred.strip() for pred in decoded_preds]
# decoded_labels = [[label.strip()] for label in decoded_labels]
# print(decoded_preds, decoded_labels)
result = metric.compute(predictions=decoded_preds, references=decoded_labels)
return {"bleu": result["score"]}
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 64
# compile
# model = torch.compile(model, backend="inductor", dynamic=True)
# Trainer
args = Seq2SeqTrainingArguments(
f"{save_path}",
# eval_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log",
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
per_device_train_batch_size=128,
per_device_eval_batch_size=128,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,
generation_config=gen_config,
remove_unused_columns=False,
)
trainer = Seq2SeqTrainer(
model,
args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# uncomment to load training from checkpoint
# checkpoint_path = 'default_40_1/checkpoint-5600'
# trainer.train(resume_from_checkpoint=checkpoint_path)
trainer.train()
# execute training
train()