From c5760d127de6cac0f92713476f9f13d1ab6083d9 Mon Sep 17 00:00:00 2001 From: Richard Wong Date: Wed, 18 Dec 2024 13:43:56 +0900 Subject: [PATCH] Feat: added post_processing based on rules others: - added basic data analysis to get histograms of text differences - added new final delivery model --- .gitignore | 3 +- analysis/categories/label_print.py | 17 ++ .../basic_eda.py | 2 +- analysis/data_properties/ship_counts.py | 71 ++++++ analysis/split_analysis/bgkf_vs_gkf.py | 38 +++ .../between_ship_and_platform.py | 72 ++++++ .../string_levenshtein/within_same_class.py | 88 +++++++ data_preprocess/split_data.py | 6 +- interpretation/fold_analysis_t5.py | 2 +- post_process/rule_based_correction/.gitignore | 3 + .../post_processing_rules.py | 154 ++++++++++++ .../post_processor_gawi.py | 233 ++++++++++++++++++ {end_to_end => production}/.gitignore | 0 end_to_end/.README.md => production/README.md | 0 .../end_to_end}/deduplication.py | 2 +- .../end_to_end}/mapper.py | 0 .../end_to_end/post_processing_rules.py | 158 ++++++++++++ .../end_to_end}/preprocess.py | 2 +- .../end_to_end}/replacement_dict.py | 0 .../end_to_end/rule_based_correction.py | 122 +++++++++ {end_to_end => production}/run.py | 17 +- .../.gitignore | 2 + .../mapping_t5_final_delivery_model/train.py | 196 +++++++++++++++ 23 files changed, 1178 insertions(+), 10 deletions(-) rename analysis/{ship_data_list => data_properties}/basic_eda.py (83%) create mode 100644 analysis/data_properties/ship_counts.py create mode 100644 analysis/split_analysis/bgkf_vs_gkf.py create mode 100644 analysis/string_levenshtein/between_ship_and_platform.py create mode 100644 analysis/string_levenshtein/within_same_class.py create mode 100644 post_process/rule_based_correction/.gitignore create mode 100644 post_process/rule_based_correction/post_processing_rules.py create mode 100644 post_process/rule_based_correction/post_processor_gawi.py rename {end_to_end => production}/.gitignore (100%) rename end_to_end/.README.md => production/README.md (100%) rename {end_to_end => production/end_to_end}/deduplication.py (99%) rename {end_to_end => production/end_to_end}/mapper.py (100%) create mode 100644 production/end_to_end/post_processing_rules.py rename {end_to_end => production/end_to_end}/preprocess.py (97%) rename {end_to_end => production/end_to_end}/replacement_dict.py (100%) create mode 100644 production/end_to_end/rule_based_correction.py rename {end_to_end => production}/run.py (82%) create mode 100644 train/mapping_t5_final_delivery_model/.gitignore create mode 100644 train/mapping_t5_final_delivery_model/train.py diff --git a/.gitignore b/.gitignore index 6f66c74..6282acb 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -*.zip \ No newline at end of file +*.zip +post_processor \ No newline at end of file diff --git a/analysis/categories/label_print.py b/analysis/categories/label_print.py index a69711c..29de0ec 100644 --- a/analysis/categories/label_print.py +++ b/analysis/categories/label_print.py @@ -7,6 +7,14 @@ full_df = pd.read_csv(data_path, skipinitialspace=True) mdm_list = sorted(list((set(full_df['pattern'])))) +# %% +full_df + +# %% +mask1 = full_df['thing'] == 'ME1TurboCharger1' +mask2 = full_df['property'] == 'LOInletPress' +mask = mask1 & mask2 +full_df[mask] # %% len(mdm_list) # %% @@ -16,3 +24,12 @@ tp_list = sorted(list(set(thing_property))) # %% len(tp_list) # %% +data_path = '../../data_import/exports/raw_data.csv' +df = pd.read_csv(data_path, skipinitialspace=True) + +# %% +bad_df = df[~df['MDM']] +# %% 
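+# a quick, illustrative look at how the non-MDM rows are labelled; this assumes the
+# unmapped points carry the literal placeholder '$UNMAPPED' in the 'thing' column
+bad_df['thing'].value_counts().head(20)
+
+# %%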
+bad_df[bad_df['thing'] == '$UNMAPPED'] + +# %% diff --git a/analysis/ship_data_list/basic_eda.py b/analysis/data_properties/basic_eda.py similarity index 83% rename from analysis/ship_data_list/basic_eda.py rename to analysis/data_properties/basic_eda.py index daa4224..3b0bf98 100644 --- a/analysis/ship_data_list/basic_eda.py +++ b/analysis/data_properties/basic_eda.py @@ -9,5 +9,5 @@ df = pd.read_csv(data_path) df # %% -set(df['signal_type']) +len(set(df['ships_idx'])) # %% diff --git a/analysis/data_properties/ship_counts.py b/analysis/data_properties/ship_counts.py new file mode 100644 index 0000000..6f46042 --- /dev/null +++ b/analysis/data_properties/ship_counts.py @@ -0,0 +1,71 @@ +# %% +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import os + +# note: we assume that you will execute from the directory of this code +# check your current directory +print("Current Working Directory:", os.getcwd()) + + +# %% +# plt.rcParams.update({'font.size': 18}) + +df = pd.read_csv('../../data_import/exports/raw_data.csv') +total_counts = df['ships_idx'].value_counts().sort_index() + +mdm_true_counts = df[df['MDM']]['ships_idx'].value_counts().sort_index() + +summary_df = pd.DataFrame({ + 'SD': total_counts, + 'PD': mdm_true_counts +}).fillna(0) + +total_SD = summary_df['SD'].sum() +total_PD = summary_df['PD'].sum() + +print(f"Total SD: {total_SD}") +print(f"Total PD: {total_PD}") + +# %% + +plt.figure(figsize=(8, 6)) +fig, ax = plt.subplots(figsize=(8, 6)) + +summary_df['SD'].plot( + kind='bar', + ax=ax, + color='orange', + alpha=0.5, + label='Ship Domain', + width=0.8) + +summary_df['PD'].plot( + kind='bar', + ax=ax, + color='blue', + alpha=0.7, + label='Platform Domain', + width=0.8) + +x_labels = ax.get_xticks() +ax.set_xticks(np.arange(min(x_labels), max(x_labels) + 1, 10)) +ax.set_xticklabels( + [int(label) for label in np.arange(min(x_labels), max(x_labels) + 1, 10)], + rotation=0, +) + +ax.grid(True) + +# plt.legend(prop={'size': 18}) +plt.legend() +plt.ylabel('Counts') +plt.xlabel('Ships') + +plt.savefig('count_statistics_of_each_ship.png') + +plt.show() + + +# %% diff --git a/analysis/split_analysis/bgkf_vs_gkf.py b/analysis/split_analysis/bgkf_vs_gkf.py new file mode 100644 index 0000000..692e1be --- /dev/null +++ b/analysis/split_analysis/bgkf_vs_gkf.py @@ -0,0 +1,38 @@ +# %% +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import os + +# note: we assume that you will execute from the directory of this code +# check your current directory +print("Current Working Directory:", os.getcwd()) + + + +# %% +data_path = "../../data_preprocess/exports/combined_group_allocation.csv" +df = pd.read_csv(data_path) + +# %% +df +# %% +print('mean', df[df['Allocation'] == 'BGKF']['Comb_count'].mean()) +print('std', df[df['Allocation'] == 'BGKF']['Comb_count'].std()) +max = df[df['Allocation'] == 'BGKF']['Comb_count'].max() +min = df[df['Allocation'] == 'BGKF']['Comb_count'].min() +print('max', max) +print('max', min) +print('max - min', max - min) + +# %% +print('mean', df[df['Allocation'] == 'GKF']['Comb_count'].mean()) +print('std', df[df['Allocation'] == 'GKF']['Comb_count'].std()) +max = df[df['Allocation'] == 'GKF']['Comb_count'].max() +min = df[df['Allocation'] == 'GKF']['Comb_count'].min() +print('max', max) +print('max', min) +print('max - min', max - min) + + +# %% diff --git a/analysis/string_levenshtein/between_ship_and_platform.py b/analysis/string_levenshtein/between_ship_and_platform.py new file mode 100644 index 0000000..02c796f --- 
/dev/null +++ b/analysis/string_levenshtein/between_ship_and_platform.py @@ -0,0 +1,72 @@ +# %% +import pandas as pd +import Levenshtein +import numpy as np +import matplotlib.pyplot as plt +from tqdm import tqdm + + +# %% +data_path = '../../data_import/exports/data_mapping_mdm.csv' +df = pd.read_csv(data_path, skipinitialspace=True) +mdm_list = sorted(list((set(df['pattern'])))) + + +# %% +df['thing_property'] = df['thing'] + df['property'] + +# %% +def compute_norm_leven(string1, string2): + max_distance = max(len(string1), len(string2)) + leven_distance = Levenshtein.distance(string1, string2) + norm_leven = leven_distance / max_distance + return norm_leven + +# %% +n = len(df) +distance_array = np.zeros((n), dtype=float) + +desc_array = df['tag_description'] +thing_property_array = df['thing_property'] + +# %% +# compute normalized levenshtein distance +for i in tqdm(range(n)): + string1 = desc_array[i] + string2 = thing_property_array[i] + distance_array[i] = compute_norm_leven(string1, string2) + +# %% +distance_array + + +# %% +plt.figure(figsize=(8, 6)) +plt.hist(distance_array, bins=30, color='steelblue', edgecolor='black', alpha=0.7) +plt.xlabel("Normalized Levenshtein Distance") +plt.ylabel("Count") +plt.tight_layout() +plt.savefig("histogram.png", dpi=300) +# +# %% +# summary statistics of computed levenshtein distance +def summary_stats(arr): + return { + "Mean": np.mean(arr), + "Median": np.median(arr), + "Standard Deviation": np.std(arr), + "Variance": np.var(arr), + "Min": np.min(arr), + "Max": np.max(arr), + "Range": np.ptp(arr), + "25th Percentile": np.percentile(arr, 25), + "75th Percentile": np.percentile(arr, 75), + "Sum": np.sum(arr), + } + +stats = summary_stats(distance_array) + +for key, value in stats.items(): + print(f"{key}: {value}") + +# %% diff --git a/analysis/string_levenshtein/within_same_class.py b/analysis/string_levenshtein/within_same_class.py new file mode 100644 index 0000000..eaf51d6 --- /dev/null +++ b/analysis/string_levenshtein/within_same_class.py @@ -0,0 +1,88 @@ +# %% +import pandas as pd +import Levenshtein +import numpy as np +import matplotlib.pyplot as plt +from tqdm import tqdm + + +# %% +data_path = '../../data_import/exports/data_mapping_mdm.csv' +df = pd.read_csv(data_path, skipinitialspace=True) +df['thing_property'] = df['thing'] + df['property'] +mdm_list = sorted(list((set(df['thing_property'])))) + + +# %% +def compute_norm_leven(string1, string2): + max_distance = max(len(string1), len(string2)) + leven_distance = Levenshtein.distance(string1, string2) + norm_leven = leven_distance / max_distance + return norm_leven + +def compute_avg_score(strings): + n = len(strings) + + # if group only has 1 string, then it is fully similar to itself + if n == 1: + return 0 + + # Create an empty matrix + distance_matrix = np.zeros((n, n), dtype=float) + + # Fill only the upper triangular part + for i in range(n): + for j in range(i + 1, n): + dist = compute_norm_leven(strings[i], strings[j]) + distance_matrix[i, j] = dist + + upper_triangular_distances = distance_matrix[np.triu_indices(n, k=1)] + mean_distance = np.mean(upper_triangular_distances) + return mean_distance + + +# %% +# we want to subset to each class +n = len(mdm_list) +score_list = np.zeros((n), dtype=float) + +for i in range(n): + df_subset = df[df['thing_property'] == mdm_list[i]] + strings = df_subset['tag_description'].to_numpy() + score_list[i] = compute_avg_score(strings) + + +# %% +score_list + + +# %% +# plt.hist(score_list, bins=50) +plt.figure(figsize=(8, 6)) 
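+# each entry of score_list is the mean pairwise normalized Levenshtein distance of the
+# tag_descriptions inside one thing+property class: 0.0 means every description in the
+# class is identical, values near 1.0 mean they share almost no characters.
+# minimal sanity check of compute_avg_score (illustrative strings only):
+#   compute_avg_score(np.array(['ABC', 'ABC']))  # identical strings    -> 0.0
+#   compute_avg_score(np.array(['ABC', 'XYZ']))  # no shared characters -> 1.0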
+plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7) +plt.xlabel("Normalized Levenshtein Distance") +plt.ylabel("Platform Domain Class Count") +plt.tight_layout() +plt.savefig("histogram.png", dpi=300) +# %% +# summary statistics of computed levenshtein distance +def summary_stats(arr): + return { + "Mean": np.mean(arr), + "Median": np.median(arr), + "Standard Deviation": np.std(arr), + "Variance": np.var(arr), + "Min": np.min(arr), + "Max": np.max(arr), + "Range": np.ptp(arr), + "25th Percentile": np.percentile(arr, 25), + "75th Percentile": np.percentile(arr, 75), + "Sum": np.sum(arr), + } + +stats = summary_stats(score_list) + +for key, value in stats.items(): + print(f"{key}: {value}") + +# %% diff --git a/data_preprocess/split_data.py b/data_preprocess/split_data.py index 50576e7..32d2326 100644 --- a/data_preprocess/split_data.py +++ b/data_preprocess/split_data.py @@ -181,6 +181,7 @@ remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values # Allocate remaining ships to the groups while len(remaining_ships) > 0: + # re-compute the counts for each group group_comb_counts = [] for g in range(num_groups): group_ships = groups[g] @@ -190,18 +191,21 @@ while len(remaining_ships) > 0: group_comb_counts.sort(key=lambda x: x[1]) + # reset the remaining_group list remaining_group = [] + # g is the identifier for the group for g, _ in group_comb_counts: if len(remaining_ships) == 0: break + # compute for each group, the selected ship, and the combined count increase if group_comb_counts.index((g, _)) == 0: selected_ship_idx, comb_increase = find_max_increase_ship(groups, g, remaining_ships, mdm_true) - else: max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true) selected_ship_idx, comb_increase = find_closest_comb_count_ship(groups, g, remaining_ships, mdm_true, max_comb_count) + # if the combined increase is 0, then we process it in a special manner if comb_increase == 0: remaining_group.append(g) else: diff --git a/interpretation/fold_analysis_t5.py b/interpretation/fold_analysis_t5.py index 9144972..8900b8d 100644 --- a/interpretation/fold_analysis_t5.py +++ b/interpretation/fold_analysis_t5.py @@ -2,7 +2,7 @@ # %% import pandas as pd import os -from inference import Inference, Embedder_t5_encoder, Embedder_t5_decoder +from inference import Inference, Embedder_t5_encoder import numpy as np from sklearn.manifold import TSNE import matplotlib.pyplot as plt diff --git a/post_process/rule_based_correction/.gitignore b/post_process/rule_based_correction/.gitignore new file mode 100644 index 0000000..11fdf6f --- /dev/null +++ b/post_process/rule_based_correction/.gitignore @@ -0,0 +1,3 @@ +*.csv +fold_* +__pycache__ \ No newline at end of file diff --git a/post_process/rule_based_correction/post_processing_rules.py b/post_process/rule_based_correction/post_processing_rules.py new file mode 100644 index 0000000..bbf723f --- /dev/null +++ b/post_process/rule_based_correction/post_processing_rules.py @@ -0,0 +1,154 @@ +# Shaft# class post-processing +shaft_rules = [ + { "conditions": [ + lambda x: "NO.1" in x, + lambda x: "SHAFT" in x + ], + "action": 'Shaft1' + }, + { "conditions": [ + lambda x: "NO.1" in x, + lambda x: "Shaft" in x + ], + "action": 'Shaft1' + }, + { "conditions": [ + lambda x: "NO.2" in x, + lambda x: "Shaft" in x + ], + "action": 'Shaft2' + }, + { "conditions": [ + lambda x: "NO.2" in x, + lambda x: "SHAFT" in x + ], + "action": 'Shaft2' + }, + + { "conditions": [ + lambda x: "NO.1" not in x, + lambda x: "NO.2" not 
in x, + lambda x: "SHAFT" not in x + ], + "action": 'Shaft1' + }, + + { "conditions": [ + lambda x: "NO.1" not in x, + lambda x: "NO.2" not in x, + lambda x: "SHAFT" in x, + lambda x: "(P)" in x + ], + "action": 'Shaft2' + }, + + { "conditions": [ + lambda x: "NO.1" not in x, + lambda x: "NO.2" not in x, + lambda x: "SHAFT" in x, + lambda x: "(S)" in x + ], + "action": 'Shaft1' + }, + + { "conditions": [ + lambda x: "NO.1" not in x, + lambda x: "NO.2" not in x, + lambda x: "SHAFT" in x, + lambda x: "(S)" not in x, + lambda x: "(P)" not in x + ], + "action": 'Shaft1' + }, +] + + +# ME# class post-processing +ME_rules = [ + { "conditions": [ + lambda x: "ME" in x, + lambda x: "(P)" not in x, + lambda x: "(S)" not in x, + lambda x: "GE" not in x, + lambda x: "FLOW" in x, + ], + "action": 'ME1Flow' + }, + { "conditions": [ + lambda x: "ME" in x, + lambda x: "(P)" in x, + lambda x: "FLOW" in x, + ], + "action": 'ME2Flow' + }, + + { "conditions": [ + lambda x: "ME" in x, + lambda x: "(S)" in x, + lambda x: "FLOW" in x, + ], + "action": 'ME1Flow' + }, + + + { "conditions": [ + lambda x: "ME" not in x, + lambda x: "GE" not in x, + lambda x: "(P)" not in x, + lambda x: "(S)" not in x, + lambda x: "FLOW" in x, + ], + "action": 'ME1Flow' + }, + { "conditions": [ + lambda x: "ME" in x, + lambda x: "GE" not in x, + lambda x: "(P)" not in x, + lambda x: "(S)" not in x, + lambda x: "FLOW" not in x, + lambda x: "CONSUMPTION" in x, + ], + "action": 'ME1Flow' + }, +] + +# GEFlow rules +GEFlow_rules = [ + { "conditions": [lambda x: "NO." not in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE1Flow' + }, + { "conditions": [ lambda x: "NO.1" in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE1Flow' + }, + + { "conditions": [ lambda x: "NO.2" in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE2Flow' + }, + { "conditions": [ lambda x: "NO.3" in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE3Flow' + }, + + { "conditions": [ lambda x: "NO." 
not in x, + lambda x: "GE" in x, + lambda x: "CONSUMPTION" in x, + + ], + "action": 'GE1Flow' + }, +] \ No newline at end of file diff --git a/post_process/rule_based_correction/post_processor_gawi.py b/post_process/rule_based_correction/post_processor_gawi.py new file mode 100644 index 0000000..6529a52 --- /dev/null +++ b/post_process/rule_based_correction/post_processor_gawi.py @@ -0,0 +1,233 @@ +# %% +import os +import re + +import pandas as pd +from sklearn.metrics import classification_report + +from post_processing_rules import shaft_rules, ME_rules, GEFlow_rules + + +# %% +# Function to print classification metrics +def print_classification_metrics(df): + print("Results before post-processing") + # print(classification_report(df["thing"], df["p_thing"].fillna(""))) + report = classification_report(df["thing"], df["p_thing"].fillna(""), output_dict=True) + # Extracting the weighted average values for precision, recall, and F1-score + precision_avg = report['weighted avg']['precision'] + recall_avg = report['weighted avg']['recall'] + f1_avg = report['weighted avg']['f1-score'] + # Print the averages + print("Average Precision:", precision_avg) + print("Average Recall:", recall_avg) + print("Average F1-Score:", f1_avg) + + print("**************") + + print("Results after post-processing") + # print(classification_report(df["thing"], df["edited_p_thing"].fillna(""))) + report = classification_report(df["thing"], df["edited_p_thing"].fillna(""), output_dict=True) + # Extracting the weighted average values for precision, recall, and F1-score + precision_avg = report['weighted avg']['precision'] + recall_avg = report['weighted avg']['recall'] + f1_avg = report['weighted avg']['f1-score'] + # Print the averages + print("Average Precision:", precision_avg) + print("Average Recall:", recall_avg) + print("Average F1-Score:", f1_avg) + + +def read_data(): + BASE_FOLDER = "fold" + BASE_FILE_NAME = "result_group" + NUM_FOLDS = 5 + + # List for storing all DataFrames + dataframes = [] + + # Iterate over all fold + for i in range(NUM_FOLDS): + fold_folder = f"{BASE_FOLDER}_{i+1}" + desired_file_name = f"{BASE_FILE_NAME}_{i+1}.csv" + for file_name in os.listdir(fold_folder): + if file_name==desired_file_name: + file_path = os.path.join(fold_folder, file_name) + + df = pd.read_csv(file_path, index_col=0) + dataframes.append(df) + + # Combine all DataFrames into one + final_dataframe = pd.concat(dataframes) + final_dataframe = final_dataframe[final_dataframe["MDM"]].reset_index(drop=True) + # assign a copy + final_dataframe["edited_p_thing"] = final_dataframe["p_thing"] + return final_dataframe + + +def update_shipboiler_p_thing(df, tag_column='tag_description', thing_pred_column='edited_p_thing'): + """ + Update the 'thing' column in the DataFrame based on rules applied to 'tag_description' column. + + Parameters: + - df: DataFrame to apply the function on. + - tag_column: Column name containing descriptions to base the logic on. + - thing_pred_column: Column to be updated with new values based on conditions. + + Returns: + - DataFrame with updated 'thing' column. 
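+
+    Notes:
+    - The rules overwrite `thing_pred_column`; NaNs in `tag_column` are first
+      replaced with '' so the string matching below does not fail on NaN.
+    - `str.contains` treats the pattern as a regular expression, so "NO.1"
+      also matches variants such as "NO 1" or "NO-1".
+    - Rows containing "COMPOSITE" are assigned ShipBoiler<max+1>, one above the
+      highest boiler number found in the non-composite rows.
+
+    Example (illustrative call with the default column names):
+    >>> df = update_shipboiler_p_thing(df)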
+ """ + # Fill NaN values in tag column to avoid errors with ~ operator + df[tag_column] = df[tag_column].fillna('') + + # Apply rules for "NO.1", "NO.2", and "AUXILIARY" in tag_column + df.loc[df[tag_column].str.contains("NO.1", case=False), thing_pred_column] = "ShipBoiler1" + df.loc[df[tag_column].str.contains("NO.2", case=False), thing_pred_column] = "ShipBoiler2" + df.loc[(df[tag_column].str.contains("AUXILIARY", case=False)) & (~df[tag_column].str.contains("NO.", case=False)), thing_pred_column] = "ShipBoiler1" + + # Determine the highest number for "thing" in rows without "COMPOSITE" and assign to "COMPOSITE" rows + max_boiler_number = df.loc[~df[tag_column].str.contains("COMPOSITE", case=False) & df[thing_pred_column].str.contains("ShipBoiler", na=False), thing_pred_column] \ + .str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1 + composite_boiler = f"ShipBoiler{int(max_boiler_number)}" + + # Apply composite_boiler to rows containing "COMPOSITE" + df.loc[df[tag_column].str.contains("COMPOSITE", case=False), thing_pred_column] = composite_boiler + + return df + + +def process_boiler_data(boiler_data): + updated_dataframes = [] + for _, group in boiler_data.groupby('ships_idx'): + group_copy = group.copy() + + group_copy["tag_description"] = group_copy["tag_description"].fillna('') + + contains_no1_cond = group_copy["tag_description"].str.contains("NO.1") + contains_no2_cond = group_copy["tag_description"].str.contains("NO.2") + contains_aux_cond = group_copy["tag_description"].str.contains("AUXILIARY") + doesnt_contain_no_cond = ~group_copy["tag_description"].str.contains("NO.") + contains_comp_cond = group_copy["tag_description"].str.contains("COMPOSITE") + contains_shipboiler_cond = group_copy["edited_p_thing"].str.contains("ShipBoiler") + + group_copy.loc[contains_no1_cond, "edited_p_thing"] = "ShipBoiler1" + group_copy.loc[contains_no2_cond, "edited_p_thing"] = "ShipBoiler2" + group_copy.loc[contains_aux_cond&doesnt_contain_no_cond, "edited_p_thing"] = "ShipBoiler1" + + if ((~contains_comp_cond) & (contains_shipboiler_cond)).any(): + max_boiler_number = group_copy.loc[(~contains_comp_cond)&(contains_shipboiler_cond), "edited_p_thing"].str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1 + composite_boiler = f"ShipBoiler{int(max_boiler_number)}" + + if max_boiler_number > 3: + max_boiler_number = 3 + + group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = composite_boiler + else: + group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = "ShipBoiler1" + + updated_dataframes.append(group_copy) # Collect updated group + + # Step 2: Concatenate all updated groups + updated_boiler_data = pd.concat(updated_dataframes) + return updated_boiler_data + + +def check_conditions(value,conditions): +# Check if a value satisfies all conditions + return all(condition(value) for condition in conditions) + + +def apply_rules(description, thing, rules): +#Processes the description according to the rule table and returns the replacement value if the condition is met, otherwise returns the thing value + for rule in rules: + if check_conditions(description, rule["conditions"]): #Check that all conditions are met + return rule["action"] #Execute the action and return the result + return thing #Returns the value of the thing column if it doesn't match any of the rules + + +# %% +# if __name__ == "__main__": + +# %% +# Read and preprocess data +final_dataframe = read_data() + +# %% +final_dataframe +# %% +# 
Hwanggawi main function
+#Get partial columns
+TP_df = final_dataframe.loc[:, ['thing', 'property','p_thing','p_property','tag_description','MDM']].copy()
+
+#Shaft
+SF_df = TP_df[TP_df['thing'].str.contains(('Shaft'), case=False, na=False)]
+SF_df_in_MDM = SF_df[(SF_df['MDM'])]
+
+#ME
+ME_df = TP_df[TP_df['thing'].str.contains(('ME1Flow'), case=False, na=False)|
+    TP_df['thing'].str.contains(('ME2Flow'), case=False, na=False)|TP_df['thing'].str.contains(('ME3Flow'), case=False, na=False)]
+ME_df_in_MDM = ME_df[(ME_df['MDM'])]
+
+#GE
+GE_df = TP_df[TP_df['thing'].str.contains(('GE1Flow'), case=False, na=False)|
+    TP_df['thing'].str.contains(('GE2Flow'), case=False, na=False)|TP_df['thing'].str.contains(('GE3Flow'), case=False, na=False)]
+GE_df_in_MDM = GE_df[(GE_df['MDM'])]
+
+SF_df_in_MDM['standardize_desc'] = SF_df_in_MDM['tag_description'].copy()
+GE_df_in_MDM['standardize_desc'] = GE_df_in_MDM['tag_description'].copy()
+ME_df_in_MDM['standardize_desc'] = ME_df_in_MDM['tag_description'].copy()
+
+# ShipBoiler class post-processing
+mdm = final_dataframe[final_dataframe["MDM"]].copy()
+boiler_data = mdm[mdm["thing"].str.contains("Boiler")].copy()
+
+blr_cond = boiler_data["tag_description"].str.lower().str.contains("blr")
+boiler_cond = boiler_data["tag_description"].str.lower().str.contains("boiler")
+boiler_data.shape[0]-(boiler_data[blr_cond].shape[0]+boiler_data[boiler_cond].shape[0])
+different_cond = boiler_data[~(blr_cond|boiler_cond)].copy()
+
+unique_ships_idxs = boiler_data["ships_idx"].unique()
+
+boiler_data["edited_p_thing"] = boiler_data["p_thing"]
+
+updated_boiler_data = process_boiler_data(boiler_data)
+
+# Save updated data back to the original DataFrame
+final_dataframe.loc[updated_boiler_data.index, "edited_p_thing"] = updated_boiler_data["edited_p_thing"]
+
+
+result = SF_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], shaft_rules),axis=1)
+SF_df_in_MDM['edited_p_thing'] = result
+
+# Save updated data back to the original DataFrame
+final_dataframe.loc[SF_df_in_MDM.index, "edited_p_thing"] = SF_df_in_MDM['edited_p_thing']
+
+
+result = ME_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], ME_rules),axis=1)
+ME_df_in_MDM['edited_p_thing'] = result
+final_dataframe.loc[ME_df_in_MDM.index, "edited_p_thing"] = ME_df_in_MDM['edited_p_thing']
+
+
+result = GE_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], GEFlow_rules),axis=1)
+GE_df_in_MDM['edited_p_thing'] = result
+final_dataframe.loc[GE_df_in_MDM.index, "edited_p_thing"] = GE_df_in_MDM['edited_p_thing']
+
+
+# output final dataframe
+final_dataframe.to_csv("post_processed_df.csv", index=False)
+print("Saved output to post_processed_df.csv")
+# %%
+# print results
+print("ShipBoiler post-processing results")
+print_classification_metrics(updated_boiler_data)
+print("----------------------------")
+
+print("ME post-processing results")
+print_classification_metrics(ME_df_in_MDM)
+print("----------------------------")
+
+print("GE post-processing results")
+print_classification_metrics(GE_df_in_MDM)
+print("----------------------------")
+
+# %%
diff --git a/end_to_end/.gitignore b/production/.gitignore
similarity index 100%
rename from end_to_end/.gitignore
rename to production/.gitignore
diff --git a/end_to_end/.README.md b/production/README.md
similarity index 100%
rename from end_to_end/.README.md
rename to production/README.md
diff --git a/end_to_end/deduplication.py b/production/end_to_end/deduplication.py
similarity index 
99% rename from end_to_end/deduplication.py rename to production/end_to_end/deduplication.py index 37073ad..802e468 100644 --- a/end_to_end/deduplication.py +++ b/production/end_to_end/deduplication.py @@ -265,7 +265,7 @@ def run_deduplication( print('generate train embeddings') train_embedder = Embedder(input_df=train_df, batch_size=batch_size) tensor = train_embedder.make_embedding(checkpoint_path) - torch.save(tensor, file_path, weights_only=True) + torch.save(tensor, file_path) print("Tensor saved to file.") train_embeds = tensor diff --git a/end_to_end/mapper.py b/production/end_to_end/mapper.py similarity index 100% rename from end_to_end/mapper.py rename to production/end_to_end/mapper.py diff --git a/production/end_to_end/post_processing_rules.py b/production/end_to_end/post_processing_rules.py new file mode 100644 index 0000000..c20bfeb --- /dev/null +++ b/production/end_to_end/post_processing_rules.py @@ -0,0 +1,158 @@ +# Shaft# class post-processing +shaft_rules = [ + { "conditions": [ + lambda x: "NO1" in x, + lambda x: "SHAFT" in x + ], + "action": 'Shaft1' + }, + { "conditions": [ + lambda x: "NO1" in x, + lambda x: "Shaft" in x + ], + "action": 'Shaft1' + }, + { "conditions": [ + lambda x: "NO2" in x, + lambda x: "Shaft" in x + ], + "action": 'Shaft2' + }, + { "conditions": [ + lambda x: "NO2" in x, + lambda x: "SHAFT" in x + ], + "action": 'Shaft2' + }, + + { "conditions": [ + lambda x: "NO1" not in x, + lambda x: "NO2" not in x, + lambda x: "SHAFT" not in x + ], + "action": 'Shaft1' + }, + + { "conditions": [ + lambda x: "NO1" not in x, + lambda x: "NO2" not in x, + lambda x: "SHAFT" in x, + lambda x: "(P)" in x + ], + "action": 'Shaft2' + }, + + { "conditions": [ + lambda x: "NO1" not in x, + lambda x: "NO2" not in x, + lambda x: "SHAFT" in x, + lambda x: "(S)" in x + ], + "action": 'Shaft1' + }, + + { "conditions": [ + lambda x: "NO1" not in x, + lambda x: "NO2" not in x, + lambda x: "SHAFT" in x, + lambda x: "(S)" not in x, + lambda x: "(P)" not in x + ], + "action": 'Shaft1' + }, +] + + +# ME# class post-processing +ME_rules = [ + { "conditions": [ + lambda x: "ME" in x, + lambda x: "(P)" not in x, + lambda x: "(S)" not in x, + lambda x: "GE" not in x, + lambda x: "FLOW" in x, + ], + "action": 'ME1Flow' + }, + { "conditions": [ + lambda x: "ME" in x, + lambda x: "(P)" in x, + lambda x: "FLOW" in x, + ], + "action": 'ME2Flow' + }, + + { "conditions": [ + lambda x: "ME" in x, + lambda x: "(S)" in x, + lambda x: "FLOW" in x, + ], + "action": 'ME1Flow' + }, + + + { "conditions": [ + lambda x: "ME" not in x, + lambda x: "GE" not in x, + lambda x: "(P)" not in x, + lambda x: "(S)" not in x, + lambda x: "FLOW" in x, + ], + "action": 'ME1Flow' + }, + { "conditions": [ + lambda x: "ME" in x, + lambda x: "GE" not in x, + lambda x: "(P)" not in x, + lambda x: "(S)" not in x, + lambda x: "FLOW" not in x, + lambda x: "CONSUMPTION" in x, + ], + "action": 'ME1Flow' + }, +] + +# GEFlow rules +GEFlow_rules = [ + { "conditions": [ + lambda x: "NO" not in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + ], + "action": 'GE1Flow' + }, + { "conditions": [ + lambda x: "NO1" in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE1Flow' + }, + + { "conditions": [ + lambda x: "NO2" in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE2Flow' + }, + { "conditions": [ + lambda x: "NO3" in x, + lambda x: "GE" in x, + lambda x: "MGO" in x, + + ], + "action": 'GE3Flow' + }, + + { "conditions": [ + lambda x: "NO" not in x, + lambda x: "GE" in x, + lambda x: 
"CONSUMPTION" in x, + + ], + "action": 'GE1Flow' + }, +] \ No newline at end of file diff --git a/end_to_end/preprocess.py b/production/end_to_end/preprocess.py similarity index 97% rename from end_to_end/preprocess.py rename to production/end_to_end/preprocess.py index a21f61a..d85120c 100644 --- a/end_to_end/preprocess.py +++ b/production/end_to_end/preprocess.py @@ -1,6 +1,6 @@ # %% import re -from replacement_dict import desc_replacement_dict, unit_replacement_dict +from end_to_end.replacement_dict import desc_replacement_dict, unit_replacement_dict class Abbreviator: diff --git a/end_to_end/replacement_dict.py b/production/end_to_end/replacement_dict.py similarity index 100% rename from end_to_end/replacement_dict.py rename to production/end_to_end/replacement_dict.py diff --git a/production/end_to_end/rule_based_correction.py b/production/end_to_end/rule_based_correction.py new file mode 100644 index 0000000..1b74a0f --- /dev/null +++ b/production/end_to_end/rule_based_correction.py @@ -0,0 +1,122 @@ +import os +import re + +import pandas as pd + +from end_to_end.post_processing_rules import shaft_rules, ME_rules, GEFlow_rules + +class Corrector(): + + def __init__(self, df): + # copy over the existing + df['edited_p_thing'] = df['p_thing'].copy() + self.df = df + + + def _process_boiler_data(self, boiler_data): + updated_dataframes = [] + for _, group in boiler_data.groupby('ships_idx'): + group_copy = group.copy() + + group_copy["tag_description"] = group_copy["tag_description"].fillna('') + + contains_no1_cond = group_copy["tag_description"].str.contains("NO1") + contains_no2_cond = group_copy["tag_description"].str.contains("NO2") + contains_aux_cond = group_copy["tag_description"].str.contains("AUXILIARY") + doesnt_contain_no_cond = ~group_copy["tag_description"].str.contains("NO") + contains_comp_cond = group_copy["tag_description"].str.contains("COMPOSITE") + contains_shipboiler_cond = group_copy["edited_p_thing"].str.contains("ShipBoiler") + + group_copy.loc[contains_no1_cond, "edited_p_thing"] = "ShipBoiler1" + group_copy.loc[contains_no2_cond, "edited_p_thing"] = "ShipBoiler2" + group_copy.loc[contains_aux_cond&doesnt_contain_no_cond, "edited_p_thing"] = "ShipBoiler1" + + if ((~contains_comp_cond) & (contains_shipboiler_cond)).any(): + max_boiler_number = group_copy.loc[(~contains_comp_cond)&(contains_shipboiler_cond), "edited_p_thing"].str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1 + composite_boiler = f"ShipBoiler{int(max_boiler_number)}" + + if max_boiler_number > 3: + max_boiler_number = 3 + + group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = composite_boiler + else: + group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = "ShipBoiler1" + + updated_dataframes.append(group_copy) # Collect updated group + + # Step 2: Concatenate all updated groups + if (len(updated_dataframes) == 0): + return boiler_data + updated_boiler_data = pd.concat(updated_dataframes) + return updated_boiler_data + + def _check_conditions(self, value, conditions): + # Check if a value satisfies all conditions + return all(condition(value) for condition in conditions) + + def _apply_rules(self, description, thing, rules): + #Processes the description according to the rule table and returns the replacement value if the condition is met, otherwise returns the thing value + for rule in rules: + if self._check_conditions(description, rule["conditions"]): #Check that all conditions are met + return rule["action"] 
#Execute the action and return the result + return thing #Returns the value of the thing column if it doesn't match any of the rules + + def run_correction(self): + final_dataframe = self.df.copy() + # Hwanggawi main function + #Get partial columns + TP_df = final_dataframe.loc[:, ['thing', 'property','p_thing','p_property','tag_description','MDM']].copy() + + #Shaft + SF_df = TP_df[TP_df['thing'].str.contains(('Shaft'), case=False, na=False)] + SF_df_in_MDM = SF_df[(SF_df['MDM'])] + + #ME + ME_df = TP_df[TP_df['thing'].str.contains(('ME1Flow'), case=False, na=False)| + TP_df['thing'].str.contains(('ME2Flow'), case=False, na=False)| + TP_df['thing'].str.contains(('ME3Flow'), case=False, na=False)] + ME_df_in_MDM = ME_df[(ME_df['MDM'])] + + #GE + GE_df = TP_df[TP_df['thing'].str.contains(('GE1Flow'), case=False, na=False)| + TP_df['thing'].str.contains(('GE2Flow'), case=False, na=False)| + TP_df['thing'].str.contains(('GE3Flow'), case=False, na=False)] + GE_df_in_MDM = GE_df[(GE_df['MDM'])] + + SF_df_in_MDM['standardize_desc'] = SF_df_in_MDM['tag_description'].copy() + GE_df_in_MDM['standardize_desc'] = GE_df_in_MDM['tag_description'].copy() + ME_df_in_MDM['standardize_desc'] = ME_df_in_MDM['tag_description'].copy() + + # ShipBoiler class post-processing + mdm = final_dataframe[final_dataframe["MDM"]].copy() + boiler_data = mdm[mdm["thing"].str.contains("BOILER")].copy() + + # blr_cond = boiler_data["tag_description"].str.lower().str.contains("BOILER") + # boiler_cond = boiler_data["tag_description"].str.lower().str.contains("BOILER") + # boiler_data.shape[0]-(boiler_data[blr_cond].shape[0]+boiler_data[boiler_cond].shape[0]) + + # different_cond = boiler_data[~(blr_cond|boiler_cond)].copy() + + # unique_ships_idxs = boiler_data["ships_idx"].unique() + + boiler_data["edited_p_thing"] = boiler_data["p_thing"] + updated_boiler_data = self._process_boiler_data(boiler_data) + final_dataframe.loc[updated_boiler_data.index, "edited_p_thing"] = updated_boiler_data["edited_p_thing"] + + result = SF_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], shaft_rules),axis=1) + SF_df_in_MDM['edited_p_thing'] = result + final_dataframe.loc[SF_df_in_MDM.index, "edited_p_thing"] = SF_df_in_MDM['edited_p_thing'] + + result = ME_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], ME_rules),axis=1) + ME_df_in_MDM['edited_p_thing'] = result + final_dataframe.loc[ME_df_in_MDM.index, "edited_p_thing"] = ME_df_in_MDM['edited_p_thing'] + + + result = GE_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], GEFlow_rules),axis=1) + GE_df_in_MDM['edited_p_thing'] = result + final_dataframe.loc[GE_df_in_MDM.index, "edited_p_thing"] = GE_df_in_MDM['edited_p_thing'] + + # override p_thing with edited_p_thing + final_dataframe['p_thing'] = final_dataframe['edited_p_thing'].copy() + + return final_dataframe \ No newline at end of file diff --git a/end_to_end/run.py b/production/run.py similarity index 82% rename from end_to_end/run.py rename to production/run.py index df921ac..1474474 100644 --- a/end_to_end/run.py +++ b/production/run.py @@ -2,13 +2,16 @@ import pandas as pd import os import glob -from mapper import Mapper -from preprocess import Abbreviator -from deduplication import run_deduplication +from end_to_end.mapper import Mapper +from end_to_end.preprocess import Abbreviator +from end_to_end.deduplication import run_deduplication +from end_to_end.rule_based_correction import Corrector + # global config BATCH_SIZE = 512 SHIPS_LIST = 
[1000,1001,1003,1004] +# SHIPS_LIST = [1000] # %% # START: we import the raw data csv and extract only a few ships from it to simulate incoming json @@ -47,6 +50,12 @@ df_out = pd.DataFrame({ # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] df = pd.concat([df, df_out], axis=1) +# %% +################################### +# run rule-based correction +corrector = Corrector(df) +df = corrector.run_correction() + # %% #################################### @@ -59,7 +68,7 @@ df = run_deduplication( test_df=df, train_df=train_df, batch_size=BATCH_SIZE, - threshold=0.85, + threshold=0.9, diagnostic=True) # %% diff --git a/train/mapping_t5_final_delivery_model/.gitignore b/train/mapping_t5_final_delivery_model/.gitignore new file mode 100644 index 0000000..2e7f3f7 --- /dev/null +++ b/train/mapping_t5_final_delivery_model/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log/ diff --git a/train/mapping_t5_final_delivery_model/train.py b/train/mapping_t5_final_delivery_model/train.py new file mode 100644 index 0000000..7b6227e --- /dev/null +++ b/train/mapping_t5_final_delivery_model/train.py @@ -0,0 +1,196 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, + DataCollatorForSeq2Seq, + Seq2SeqTrainer, + EarlyStoppingCallback, + Seq2SeqTrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# outputs a list of dictionaries +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + +def create_split_dataset(): + # train + data_path = "../../data_preprocess/exports/preprocessed_data.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + train_df = train_df[train_df['MDM']].reset_index(drop=True) + + # valid + data_path = "../../data_preprocess/exports/dataset/group_1/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(): + save_path = 'checkpoint' + split_datasets = create_split_dataset() + + # prepare tokenizer + + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + 
max_length=max_length, + truncation=True, + padding=True + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns=split_datasets["train"].column_names, + ) + + # https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) + metric = evaluate.load("sacrebleu") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + decoded_preds = tokenizer.batch_decode(preds, + skip_special_tokens=False) + + # Replace -100s in the labels as we can't decode them + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=False) + + # Remove tokens from decoded predictions and labels + decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] + decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] + + # Some simple post-processing + # decoded_preds = [pred.strip() for pred in decoded_preds] + # decoded_labels = [[label.strip()] for label in decoded_labels] + # print(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": result["score"]} + + + # Generation Config + # from transformers import GenerationConfig + gen_config = model.generation_config + gen_config.max_length = 64 + + # compile + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # Trainer + + args = Seq2SeqTrainingArguments( + f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=128, + per_device_eval_batch_size=128, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=80, + predict_with_generate=True, + bf16=True, + push_to_hub=False, + generation_config=gen_config, + remove_unused_columns=False, + ) + + + trainer = Seq2SeqTrainer( + model, + args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +train() +
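+
+# minimal usage sketch for the trained model (the checkpoint path and sample input
+# below are illustrative; the real input string should be built exactly as in
+# process_df_to_dict, i.e. tag_description followed by unit):
+#   from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM
+#   tokenizer = T5TokenizerFast.from_pretrained('checkpoint/<final-checkpoint>')
+#   model = AutoModelForSeq2SeqLM.from_pretrained('checkpoint/<final-checkpoint>')
+#   ids = tokenizer('NO.1 G/E FUEL OIL FLOW' + 'kg/h', return_tensors='pt')
+#   pred = model.generate(**ids, max_length=64)
+#   print(tokenizer.decode(pred[0], skip_special_tokens=True))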