Feat: added post_processing based on rules

others:
- added basic data analysis to get histograms of text differences
- added new final delivery model
This commit is contained in:
Richard Wong 2024-12-18 13:43:56 +09:00
parent 481bcf88b7
commit c5760d127d
23 changed files with 1178 additions and 10 deletions

3
.gitignore vendored
View File

@ -1 +1,2 @@
*.zip
*.zip
post_processor

View File

@ -7,6 +7,14 @@ full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(full_df['pattern']))))
# %%
full_df
# %%
mask1 = full_df['thing'] == 'ME1TurboCharger1'
mask2 = full_df['property'] == 'LOInletPress'
mask = mask1 & mask2
full_df[mask]
# %%
len(mdm_list)
# %%
@ -16,3 +24,12 @@ tp_list = sorted(list(set(thing_property)))
# %%
len(tp_list)
# %%
data_path = '../../data_import/exports/raw_data.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# %%
bad_df = df[~df['MDM']]
# %%
bad_df[bad_df['thing'] == '$UNMAPPED']
# %%

View File

@ -9,5 +9,5 @@ df = pd.read_csv(data_path)
df
# %%
set(df['signal_type'])
len(set(df['ships_idx']))
# %%

View File

@ -0,0 +1,71 @@
# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
# note: we assume that you will execute from the directory of this code
# check your current directory
print("Current Working Directory:", os.getcwd())
# %%
# plt.rcParams.update({'font.size': 18})
df = pd.read_csv('../../data_import/exports/raw_data.csv')
total_counts = df['ships_idx'].value_counts().sort_index()
mdm_true_counts = df[df['MDM']]['ships_idx'].value_counts().sort_index()
summary_df = pd.DataFrame({
'SD': total_counts,
'PD': mdm_true_counts
}).fillna(0)
total_SD = summary_df['SD'].sum()
total_PD = summary_df['PD'].sum()
print(f"Total SD: {total_SD}")
print(f"Total PD: {total_PD}")
# %%
plt.figure(figsize=(8, 6))
fig, ax = plt.subplots(figsize=(8, 6))
summary_df['SD'].plot(
kind='bar',
ax=ax,
color='orange',
alpha=0.5,
label='Ship Domain',
width=0.8)
summary_df['PD'].plot(
kind='bar',
ax=ax,
color='blue',
alpha=0.7,
label='Platform Domain',
width=0.8)
x_labels = ax.get_xticks()
ax.set_xticks(np.arange(min(x_labels), max(x_labels) + 1, 10))
ax.set_xticklabels(
[int(label) for label in np.arange(min(x_labels), max(x_labels) + 1, 10)],
rotation=0,
)
ax.grid(True)
# plt.legend(prop={'size': 18})
plt.legend()
plt.ylabel('Counts')
plt.xlabel('Ships')
plt.savefig('count_statistics_of_each_ship.png')
plt.show()
# %%

View File

@ -0,0 +1,38 @@
# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
# note: we assume that you will execute from the directory of this code
# check your current directory
print("Current Working Directory:", os.getcwd())
# %%
data_path = "../../data_preprocess/exports/combined_group_allocation.csv"
df = pd.read_csv(data_path)
# %%
df
# %%
print('mean', df[df['Allocation'] == 'BGKF']['Comb_count'].mean())
print('std', df[df['Allocation'] == 'BGKF']['Comb_count'].std())
max = df[df['Allocation'] == 'BGKF']['Comb_count'].max()
min = df[df['Allocation'] == 'BGKF']['Comb_count'].min()
print('max', max)
print('max', min)
print('max - min', max - min)
# %%
print('mean', df[df['Allocation'] == 'GKF']['Comb_count'].mean())
print('std', df[df['Allocation'] == 'GKF']['Comb_count'].std())
max = df[df['Allocation'] == 'GKF']['Comb_count'].max()
min = df[df['Allocation'] == 'GKF']['Comb_count'].min()
print('max', max)
print('max', min)
print('max - min', max - min)
# %%

View File

@ -0,0 +1,72 @@
# %%
import pandas as pd
import Levenshtein
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(df['pattern']))))
# %%
df['thing_property'] = df['thing'] + df['property']
# %%
def compute_norm_leven(string1, string2):
max_distance = max(len(string1), len(string2))
leven_distance = Levenshtein.distance(string1, string2)
norm_leven = leven_distance / max_distance
return norm_leven
# %%
n = len(df)
distance_array = np.zeros((n), dtype=float)
desc_array = df['tag_description']
thing_property_array = df['thing_property']
# %%
# compute normalized levenshtein distance
for i in tqdm(range(n)):
string1 = desc_array[i]
string2 = thing_property_array[i]
distance_array[i] = compute_norm_leven(string1, string2)
# %%
distance_array
# %%
plt.figure(figsize=(8, 6))
plt.hist(distance_array, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)
#
# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):
return {
"Mean": np.mean(arr),
"Median": np.median(arr),
"Standard Deviation": np.std(arr),
"Variance": np.var(arr),
"Min": np.min(arr),
"Max": np.max(arr),
"Range": np.ptp(arr),
"25th Percentile": np.percentile(arr, 25),
"75th Percentile": np.percentile(arr, 75),
"Sum": np.sum(arr),
}
stats = summary_stats(distance_array)
for key, value in stats.items():
print(f"{key}: {value}")
# %%

View File

@ -0,0 +1,88 @@
# %%
import pandas as pd
import Levenshtein
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
df['thing_property'] = df['thing'] + df['property']
mdm_list = sorted(list((set(df['thing_property']))))
# %%
def compute_norm_leven(string1, string2):
max_distance = max(len(string1), len(string2))
leven_distance = Levenshtein.distance(string1, string2)
norm_leven = leven_distance / max_distance
return norm_leven
def compute_avg_score(strings):
n = len(strings)
# if group only has 1 string, then it is fully similar to itself
if n == 1:
return 0
# Create an empty matrix
distance_matrix = np.zeros((n, n), dtype=float)
# Fill only the upper triangular part
for i in range(n):
for j in range(i + 1, n):
dist = compute_norm_leven(strings[i], strings[j])
distance_matrix[i, j] = dist
upper_triangular_distances = distance_matrix[np.triu_indices(n, k=1)]
mean_distance = np.mean(upper_triangular_distances)
return mean_distance
# %%
# we want to subset to each class
n = len(mdm_list)
score_list = np.zeros((n), dtype=float)
for i in range(n):
df_subset = df[df['thing_property'] == mdm_list[i]]
strings = df_subset['tag_description'].to_numpy()
score_list[i] = compute_avg_score(strings)
# %%
score_list
# %%
# plt.hist(score_list, bins=50)
plt.figure(figsize=(8, 6))
plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Platform Domain Class Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)
# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):
return {
"Mean": np.mean(arr),
"Median": np.median(arr),
"Standard Deviation": np.std(arr),
"Variance": np.var(arr),
"Min": np.min(arr),
"Max": np.max(arr),
"Range": np.ptp(arr),
"25th Percentile": np.percentile(arr, 25),
"75th Percentile": np.percentile(arr, 75),
"Sum": np.sum(arr),
}
stats = summary_stats(score_list)
for key, value in stats.items():
print(f"{key}: {value}")
# %%

View File

@ -181,6 +181,7 @@ remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values
# Allocate remaining ships to the groups
while len(remaining_ships) > 0:
# re-compute the counts for each group
group_comb_counts = []
for g in range(num_groups):
group_ships = groups[g]
@ -190,18 +191,21 @@ while len(remaining_ships) > 0:
group_comb_counts.sort(key=lambda x: x[1])
# reset the remaining_group list
remaining_group = []
# g is the identifier for the group
for g, _ in group_comb_counts:
if len(remaining_ships) == 0:
break
# compute for each group, the selected ship, and the combined count increase
if group_comb_counts.index((g, _)) == 0:
selected_ship_idx, comb_increase = find_max_increase_ship(groups, g, remaining_ships, mdm_true)
else:
max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true)
selected_ship_idx, comb_increase = find_closest_comb_count_ship(groups, g, remaining_ships, mdm_true, max_comb_count)
# if the combined increase is 0, then we process it in a special manner
if comb_increase == 0:
remaining_group.append(g)
else:

View File

@ -2,7 +2,7 @@
# %%
import pandas as pd
import os
from inference import Inference, Embedder_t5_encoder, Embedder_t5_decoder
from inference import Inference, Embedder_t5_encoder
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

View File

@ -0,0 +1,3 @@
*.csv
fold_*
__pycache__

View File

@ -0,0 +1,154 @@
# Shaft# class post-processing
shaft_rules = [
{ "conditions": [
lambda x: "NO.1" in x,
lambda x: "SHAFT" in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO.1" in x,
lambda x: "Shaft" in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO.2" in x,
lambda x: "Shaft" in x
],
"action": 'Shaft2'
},
{ "conditions": [
lambda x: "NO.2" in x,
lambda x: "SHAFT" in x
],
"action": 'Shaft2'
},
{ "conditions": [
lambda x: "NO.1" not in x,
lambda x: "NO.2" not in x,
lambda x: "SHAFT" not in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO.1" not in x,
lambda x: "NO.2" not in x,
lambda x: "SHAFT" in x,
lambda x: "(P)" in x
],
"action": 'Shaft2'
},
{ "conditions": [
lambda x: "NO.1" not in x,
lambda x: "NO.2" not in x,
lambda x: "SHAFT" in x,
lambda x: "(S)" in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO.1" not in x,
lambda x: "NO.2" not in x,
lambda x: "SHAFT" in x,
lambda x: "(S)" not in x,
lambda x: "(P)" not in x
],
"action": 'Shaft1'
},
]
# ME# class post-processing
ME_rules = [
{ "conditions": [
lambda x: "ME" in x,
lambda x: "(P)" not in x,
lambda x: "(S)" not in x,
lambda x: "GE" not in x,
lambda x: "FLOW" in x,
],
"action": 'ME1Flow'
},
{ "conditions": [
lambda x: "ME" in x,
lambda x: "(P)" in x,
lambda x: "FLOW" in x,
],
"action": 'ME2Flow'
},
{ "conditions": [
lambda x: "ME" in x,
lambda x: "(S)" in x,
lambda x: "FLOW" in x,
],
"action": 'ME1Flow'
},
{ "conditions": [
lambda x: "ME" not in x,
lambda x: "GE" not in x,
lambda x: "(P)" not in x,
lambda x: "(S)" not in x,
lambda x: "FLOW" in x,
],
"action": 'ME1Flow'
},
{ "conditions": [
lambda x: "ME" in x,
lambda x: "GE" not in x,
lambda x: "(P)" not in x,
lambda x: "(S)" not in x,
lambda x: "FLOW" not in x,
lambda x: "CONSUMPTION" in x,
],
"action": 'ME1Flow'
},
]
# GEFlow rules
GEFlow_rules = [
{ "conditions": [lambda x: "NO." not in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE1Flow'
},
{ "conditions": [ lambda x: "NO.1" in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE1Flow'
},
{ "conditions": [ lambda x: "NO.2" in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE2Flow'
},
{ "conditions": [ lambda x: "NO.3" in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE3Flow'
},
{ "conditions": [ lambda x: "NO." not in x,
lambda x: "GE" in x,
lambda x: "CONSUMPTION" in x,
],
"action": 'GE1Flow'
},
]

View File

@ -0,0 +1,233 @@
# %%
import os
import re
import pandas as pd
from sklearn.metrics import classification_report
from post_processing_rules import shaft_rules, ME_rules, GEFlow_rules
# %%
# Function to print classification metrics
def print_classification_metrics(df):
print("Results before post-processing")
# print(classification_report(df["thing"], df["p_thing"].fillna("")))
report = classification_report(df["thing"], df["p_thing"].fillna(""), output_dict=True)
# Extracting the weighted average values for precision, recall, and F1-score
precision_avg = report['weighted avg']['precision']
recall_avg = report['weighted avg']['recall']
f1_avg = report['weighted avg']['f1-score']
# Print the averages
print("Average Precision:", precision_avg)
print("Average Recall:", recall_avg)
print("Average F1-Score:", f1_avg)
print("**************")
print("Results after post-processing")
# print(classification_report(df["thing"], df["edited_p_thing"].fillna("")))
report = classification_report(df["thing"], df["edited_p_thing"].fillna(""), output_dict=True)
# Extracting the weighted average values for precision, recall, and F1-score
precision_avg = report['weighted avg']['precision']
recall_avg = report['weighted avg']['recall']
f1_avg = report['weighted avg']['f1-score']
# Print the averages
print("Average Precision:", precision_avg)
print("Average Recall:", recall_avg)
print("Average F1-Score:", f1_avg)
def read_data():
BASE_FOLDER = "fold"
BASE_FILE_NAME = "result_group"
NUM_FOLDS = 5
# List for storing all DataFrames
dataframes = []
# Iterate over all fold
for i in range(NUM_FOLDS):
fold_folder = f"{BASE_FOLDER}_{i+1}"
desired_file_name = f"{BASE_FILE_NAME}_{i+1}.csv"
for file_name in os.listdir(fold_folder):
if file_name==desired_file_name:
file_path = os.path.join(fold_folder, file_name)
df = pd.read_csv(file_path, index_col=0)
dataframes.append(df)
# Combine all DataFrames into one
final_dataframe = pd.concat(dataframes)
final_dataframe = final_dataframe[final_dataframe["MDM"]].reset_index(drop=True)
# assign a copy
final_dataframe["edited_p_thing"] = final_dataframe["p_thing"]
return final_dataframe
def update_shipboiler_p_thing(df, tag_column='tag_description', thing_pred_column='edited_p_thing'):
"""
Update the 'thing' column in the DataFrame based on rules applied to 'tag_description' column.
Parameters:
- df: DataFrame to apply the function on.
- tag_column: Column name containing descriptions to base the logic on.
- thing_pred_column: Column to be updated with new values based on conditions.
Returns:
- DataFrame with updated 'thing' column.
"""
# Fill NaN values in tag column to avoid errors with ~ operator
df[tag_column] = df[tag_column].fillna('')
# Apply rules for "NO.1", "NO.2", and "AUXILIARY" in tag_column
df.loc[df[tag_column].str.contains("NO.1", case=False), thing_pred_column] = "ShipBoiler1"
df.loc[df[tag_column].str.contains("NO.2", case=False), thing_pred_column] = "ShipBoiler2"
df.loc[(df[tag_column].str.contains("AUXILIARY", case=False)) & (~df[tag_column].str.contains("NO.", case=False)), thing_pred_column] = "ShipBoiler1"
# Determine the highest number for "thing" in rows without "COMPOSITE" and assign to "COMPOSITE" rows
max_boiler_number = df.loc[~df[tag_column].str.contains("COMPOSITE", case=False) & df[thing_pred_column].str.contains("ShipBoiler", na=False), thing_pred_column] \
.str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1
composite_boiler = f"ShipBoiler{int(max_boiler_number)}"
# Apply composite_boiler to rows containing "COMPOSITE"
df.loc[df[tag_column].str.contains("COMPOSITE", case=False), thing_pred_column] = composite_boiler
return df
def process_boiler_data(boiler_data):
updated_dataframes = []
for _, group in boiler_data.groupby('ships_idx'):
group_copy = group.copy()
group_copy["tag_description"] = group_copy["tag_description"].fillna('')
contains_no1_cond = group_copy["tag_description"].str.contains("NO.1")
contains_no2_cond = group_copy["tag_description"].str.contains("NO.2")
contains_aux_cond = group_copy["tag_description"].str.contains("AUXILIARY")
doesnt_contain_no_cond = ~group_copy["tag_description"].str.contains("NO.")
contains_comp_cond = group_copy["tag_description"].str.contains("COMPOSITE")
contains_shipboiler_cond = group_copy["edited_p_thing"].str.contains("ShipBoiler")
group_copy.loc[contains_no1_cond, "edited_p_thing"] = "ShipBoiler1"
group_copy.loc[contains_no2_cond, "edited_p_thing"] = "ShipBoiler2"
group_copy.loc[contains_aux_cond&doesnt_contain_no_cond, "edited_p_thing"] = "ShipBoiler1"
if ((~contains_comp_cond) & (contains_shipboiler_cond)).any():
max_boiler_number = group_copy.loc[(~contains_comp_cond)&(contains_shipboiler_cond), "edited_p_thing"].str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1
composite_boiler = f"ShipBoiler{int(max_boiler_number)}"
if max_boiler_number > 3:
max_boiler_number = 3
group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = composite_boiler
else:
group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = "ShipBoiler1"
updated_dataframes.append(group_copy) # Collect updated group
# Step 2: Concatenate all updated groups
updated_boiler_data = pd.concat(updated_dataframes)
return updated_boiler_data
def check_conditions(value,conditions):
# Check if a value satisfies all conditions
return all(condition(value) for condition in conditions)
def apply_rules(description, thing, rules):
#Processes the description according to the rule table and returns the replacement value if the condition is met, otherwise returns the thing value
for rule in rules:
if check_conditions(description, rule["conditions"]): #Check that all conditions are met
return rule["action"] #Execute the action and return the result
return thing #Returns the value of the thing column if it doesn't match any of the rules
# %%
# if __name__ == "__main__":
# %%
# Read and preprocess data
final_dataframe = read_data()
# %%
final_dataframe
# %%
# Hwanggawi main function
#Get partial columns
TP_df = final_dataframe.loc[:, ['thing', 'property','p_thing','p_property','tag_description','MDM']].copy()
#Shaft
SF_df = TP_df[TP_df['thing'].str.contains(('Shaft'), case=False, na=False)]
SF_df_in_MDM = SF_df[(SF_df['MDM'])]
#ME
ME_df = TP_df[TP_df['thing'].str.contains(('ME1Flow'), case=False, na=False)|
TP_df['thing'].str.contains(('ME2Flow'), case=False, na=False)|TP_df['thing'].str.contains(('ME3Flow'), case=False, na=False)]
ME_df_in_MDM = ME_df[(ME_df['MDM'])]
#GE
GE_df = TP_df[TP_df['thing'].str.contains(('GE1Flow'), case=False, na=False)|
TP_df['thing'].str.contains(('GE2Flow'), case=False, na=False)|TP_df['thing'].str.contains(('GE3Flow'), case=False, na=False)]
GE_df_in_MDM = GE_df[(GE_df['MDM'])]
SF_df_in_MDM['standardize_desc'] = SF_df_in_MDM['tag_description'].copy()
GE_df_in_MDM['standardize_desc'] = GE_df_in_MDM['tag_description'].copy()
ME_df_in_MDM['standardize_desc'] = ME_df_in_MDM['tag_description'].copy()
# ShipBoiler class post-processing
mdm = final_dataframe[final_dataframe["MDM"]].copy()
boiler_data = mdm[mdm["thing"].str.contains("Boiler")].copy()
blr_cond = boiler_data["tag_description"].str.lower().str.contains("blr")
boiler_cond = boiler_data["tag_description"].str.lower().str.contains("boiler")
boiler_data.shape[0]-(boiler_data[blr_cond].shape[0]+boiler_data[boiler_cond].shape[0])
different_cond = boiler_data[~(blr_cond|boiler_cond)].copy()
unique_ships_idxs = boiler_data["ships_idx"].unique()
boiler_data["edited_p_thing"] = boiler_data["p_thing"]
updated_boiler_data = process_boiler_data(boiler_data)
# Save updated data back to the original DataFrame
final_dataframe.loc[updated_boiler_data.index, "edited_p_thing"] = updated_boiler_data["edited_p_thing"]
result = SF_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], shaft_rules),axis=1)
SF_df_in_MDM['edited_p_thing'] = result
# Save updated data back to the original DataFrame
final_dataframe.loc[SF_df_in_MDM.index, "edited_p_thing"] = SF_df_in_MDM['edited_p_thing']
result = ME_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], ME_rules),axis=1)
ME_df_in_MDM['edited_p_thing'] = result
final_dataframe.loc[ME_df_in_MDM.index, "edited_p_thing"] = ME_df_in_MDM['edited_p_thing']
result = GE_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], GEFlow_rules),axis=1)
GE_df_in_MDM['edited_p_thing'] = result
final_dataframe.loc[GE_df_in_MDM.index, "edited_p_thing"] = GE_df_in_MDM['edited_p_thing']
# output final dataframe
final_dataframe.to_csv("post_processed_df.csv", index=False)
print("Saved output to post_processed_df.csv")
# %%
# print results
print("ShipBoiler post-processing results")
print_classification_metrics(updated_boiler_data)
print("----------------------------")
print("ShipBoiler post-processing results")
print_classification_metrics(ME_df_in_MDM)
print("----------------------------")
print("ShipBoiler post-processing results")
print_classification_metrics(GE_df_in_MDM)
print("----------------------------")
# %%

View File

@ -265,7 +265,7 @@ def run_deduplication(
print('generate train embeddings')
train_embedder = Embedder(input_df=train_df, batch_size=batch_size)
tensor = train_embedder.make_embedding(checkpoint_path)
torch.save(tensor, file_path, weights_only=True)
torch.save(tensor, file_path)
print("Tensor saved to file.")
train_embeds = tensor

View File

@ -0,0 +1,158 @@
# Shaft# class post-processing
shaft_rules = [
{ "conditions": [
lambda x: "NO1" in x,
lambda x: "SHAFT" in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO1" in x,
lambda x: "Shaft" in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO2" in x,
lambda x: "Shaft" in x
],
"action": 'Shaft2'
},
{ "conditions": [
lambda x: "NO2" in x,
lambda x: "SHAFT" in x
],
"action": 'Shaft2'
},
{ "conditions": [
lambda x: "NO1" not in x,
lambda x: "NO2" not in x,
lambda x: "SHAFT" not in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO1" not in x,
lambda x: "NO2" not in x,
lambda x: "SHAFT" in x,
lambda x: "(P)" in x
],
"action": 'Shaft2'
},
{ "conditions": [
lambda x: "NO1" not in x,
lambda x: "NO2" not in x,
lambda x: "SHAFT" in x,
lambda x: "(S)" in x
],
"action": 'Shaft1'
},
{ "conditions": [
lambda x: "NO1" not in x,
lambda x: "NO2" not in x,
lambda x: "SHAFT" in x,
lambda x: "(S)" not in x,
lambda x: "(P)" not in x
],
"action": 'Shaft1'
},
]
# ME# class post-processing
ME_rules = [
{ "conditions": [
lambda x: "ME" in x,
lambda x: "(P)" not in x,
lambda x: "(S)" not in x,
lambda x: "GE" not in x,
lambda x: "FLOW" in x,
],
"action": 'ME1Flow'
},
{ "conditions": [
lambda x: "ME" in x,
lambda x: "(P)" in x,
lambda x: "FLOW" in x,
],
"action": 'ME2Flow'
},
{ "conditions": [
lambda x: "ME" in x,
lambda x: "(S)" in x,
lambda x: "FLOW" in x,
],
"action": 'ME1Flow'
},
{ "conditions": [
lambda x: "ME" not in x,
lambda x: "GE" not in x,
lambda x: "(P)" not in x,
lambda x: "(S)" not in x,
lambda x: "FLOW" in x,
],
"action": 'ME1Flow'
},
{ "conditions": [
lambda x: "ME" in x,
lambda x: "GE" not in x,
lambda x: "(P)" not in x,
lambda x: "(S)" not in x,
lambda x: "FLOW" not in x,
lambda x: "CONSUMPTION" in x,
],
"action": 'ME1Flow'
},
]
# GEFlow rules
GEFlow_rules = [
{ "conditions": [
lambda x: "NO" not in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE1Flow'
},
{ "conditions": [
lambda x: "NO1" in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE1Flow'
},
{ "conditions": [
lambda x: "NO2" in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE2Flow'
},
{ "conditions": [
lambda x: "NO3" in x,
lambda x: "GE" in x,
lambda x: "MGO" in x,
],
"action": 'GE3Flow'
},
{ "conditions": [
lambda x: "NO" not in x,
lambda x: "GE" in x,
lambda x: "CONSUMPTION" in x,
],
"action": 'GE1Flow'
},
]

View File

@ -1,6 +1,6 @@
# %%
import re
from replacement_dict import desc_replacement_dict, unit_replacement_dict
from end_to_end.replacement_dict import desc_replacement_dict, unit_replacement_dict
class Abbreviator:

View File

@ -0,0 +1,122 @@
import os
import re
import pandas as pd
from end_to_end.post_processing_rules import shaft_rules, ME_rules, GEFlow_rules
class Corrector():
def __init__(self, df):
# copy over the existing
df['edited_p_thing'] = df['p_thing'].copy()
self.df = df
def _process_boiler_data(self, boiler_data):
updated_dataframes = []
for _, group in boiler_data.groupby('ships_idx'):
group_copy = group.copy()
group_copy["tag_description"] = group_copy["tag_description"].fillna('')
contains_no1_cond = group_copy["tag_description"].str.contains("NO1")
contains_no2_cond = group_copy["tag_description"].str.contains("NO2")
contains_aux_cond = group_copy["tag_description"].str.contains("AUXILIARY")
doesnt_contain_no_cond = ~group_copy["tag_description"].str.contains("NO")
contains_comp_cond = group_copy["tag_description"].str.contains("COMPOSITE")
contains_shipboiler_cond = group_copy["edited_p_thing"].str.contains("ShipBoiler")
group_copy.loc[contains_no1_cond, "edited_p_thing"] = "ShipBoiler1"
group_copy.loc[contains_no2_cond, "edited_p_thing"] = "ShipBoiler2"
group_copy.loc[contains_aux_cond&doesnt_contain_no_cond, "edited_p_thing"] = "ShipBoiler1"
if ((~contains_comp_cond) & (contains_shipboiler_cond)).any():
max_boiler_number = group_copy.loc[(~contains_comp_cond)&(contains_shipboiler_cond), "edited_p_thing"].str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1
composite_boiler = f"ShipBoiler{int(max_boiler_number)}"
if max_boiler_number > 3:
max_boiler_number = 3
group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = composite_boiler
else:
group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = "ShipBoiler1"
updated_dataframes.append(group_copy) # Collect updated group
# Step 2: Concatenate all updated groups
if (len(updated_dataframes) == 0):
return boiler_data
updated_boiler_data = pd.concat(updated_dataframes)
return updated_boiler_data
def _check_conditions(self, value, conditions):
# Check if a value satisfies all conditions
return all(condition(value) for condition in conditions)
def _apply_rules(self, description, thing, rules):
#Processes the description according to the rule table and returns the replacement value if the condition is met, otherwise returns the thing value
for rule in rules:
if self._check_conditions(description, rule["conditions"]): #Check that all conditions are met
return rule["action"] #Execute the action and return the result
return thing #Returns the value of the thing column if it doesn't match any of the rules
def run_correction(self):
final_dataframe = self.df.copy()
# Hwanggawi main function
#Get partial columns
TP_df = final_dataframe.loc[:, ['thing', 'property','p_thing','p_property','tag_description','MDM']].copy()
#Shaft
SF_df = TP_df[TP_df['thing'].str.contains(('Shaft'), case=False, na=False)]
SF_df_in_MDM = SF_df[(SF_df['MDM'])]
#ME
ME_df = TP_df[TP_df['thing'].str.contains(('ME1Flow'), case=False, na=False)|
TP_df['thing'].str.contains(('ME2Flow'), case=False, na=False)|
TP_df['thing'].str.contains(('ME3Flow'), case=False, na=False)]
ME_df_in_MDM = ME_df[(ME_df['MDM'])]
#GE
GE_df = TP_df[TP_df['thing'].str.contains(('GE1Flow'), case=False, na=False)|
TP_df['thing'].str.contains(('GE2Flow'), case=False, na=False)|
TP_df['thing'].str.contains(('GE3Flow'), case=False, na=False)]
GE_df_in_MDM = GE_df[(GE_df['MDM'])]
SF_df_in_MDM['standardize_desc'] = SF_df_in_MDM['tag_description'].copy()
GE_df_in_MDM['standardize_desc'] = GE_df_in_MDM['tag_description'].copy()
ME_df_in_MDM['standardize_desc'] = ME_df_in_MDM['tag_description'].copy()
# ShipBoiler class post-processing
mdm = final_dataframe[final_dataframe["MDM"]].copy()
boiler_data = mdm[mdm["thing"].str.contains("BOILER")].copy()
# blr_cond = boiler_data["tag_description"].str.lower().str.contains("BOILER")
# boiler_cond = boiler_data["tag_description"].str.lower().str.contains("BOILER")
# boiler_data.shape[0]-(boiler_data[blr_cond].shape[0]+boiler_data[boiler_cond].shape[0])
# different_cond = boiler_data[~(blr_cond|boiler_cond)].copy()
# unique_ships_idxs = boiler_data["ships_idx"].unique()
boiler_data["edited_p_thing"] = boiler_data["p_thing"]
updated_boiler_data = self._process_boiler_data(boiler_data)
final_dataframe.loc[updated_boiler_data.index, "edited_p_thing"] = updated_boiler_data["edited_p_thing"]
result = SF_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], shaft_rules),axis=1)
SF_df_in_MDM['edited_p_thing'] = result
final_dataframe.loc[SF_df_in_MDM.index, "edited_p_thing"] = SF_df_in_MDM['edited_p_thing']
result = ME_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], ME_rules),axis=1)
ME_df_in_MDM['edited_p_thing'] = result
final_dataframe.loc[ME_df_in_MDM.index, "edited_p_thing"] = ME_df_in_MDM['edited_p_thing']
result = GE_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], GEFlow_rules),axis=1)
GE_df_in_MDM['edited_p_thing'] = result
final_dataframe.loc[GE_df_in_MDM.index, "edited_p_thing"] = GE_df_in_MDM['edited_p_thing']
# override p_thing with edited_p_thing
final_dataframe['p_thing'] = final_dataframe['edited_p_thing'].copy()
return final_dataframe

View File

@ -2,13 +2,16 @@
import pandas as pd
import os
import glob
from mapper import Mapper
from preprocess import Abbreviator
from deduplication import run_deduplication
from end_to_end.mapper import Mapper
from end_to_end.preprocess import Abbreviator
from end_to_end.deduplication import run_deduplication
from end_to_end.rule_based_correction import Corrector
# global config
BATCH_SIZE = 512
SHIPS_LIST = [1000,1001,1003,1004]
# SHIPS_LIST = [1000]
# %%
# START: we import the raw data csv and extract only a few ships from it to simulate incoming json
@ -47,6 +50,12 @@ df_out = pd.DataFrame({
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
df = pd.concat([df, df_out], axis=1)
# %%
###################################
# run rule-based correction
corrector = Corrector(df)
df = corrector.run_correction()
# %%
####################################
@ -59,7 +68,7 @@ df = run_deduplication(
test_df=df,
train_df=train_df,
batch_size=BATCH_SIZE,
threshold=0.85,
threshold=0.9,
diagnostic=True)
# %%

View File

@ -0,0 +1,2 @@
checkpoint*
tensorboard-log/

View File

@ -0,0 +1,196 @@
# %%
# from datasets import load_from_disk
import os
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from transformers import (
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
element = {
'input' : f"{desc}{unit}",
'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
}
output_list.append(element)
return output_list
def create_split_dataset():
# train
data_path = "../../data_preprocess/exports/preprocessed_data.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
train_df = train_df[train_df['MDM']].reset_index(drop=True)
# valid
data_path = "../../data_preprocess/exports/dataset/group_1/valid.csv"
validation_df = pd.read_csv(data_path, skipinitialspace=True)
combined_data = DatasetDict({
'train': Dataset.from_list(process_df_to_dict(train_df)),
'validation' : Dataset.from_list(process_df_to_dict(validation_df)),
})
return combined_data
# function to perform training for a given fold
def train():
save_path = 'checkpoint'
split_datasets = create_split_dataset()
# prepare tokenizer
model_checkpoint = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
max_length = 120
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['input']
target = example['output']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
text_target=target,
max_length=max_length,
truncation=True,
padding=True
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns=split_datasets["train"].column_names,
)
# https://github.com/huggingface/transformers/pull/28414
# model_checkpoint = "google/t5-efficient-tiny"
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")
def compute_metrics(eval_preds):
preds, labels = eval_preds
# In case the model returns more than the prediction logits
if isinstance(preds, tuple):
preds = preds[0]
decoded_preds = tokenizer.batch_decode(preds,
skip_special_tokens=False)
# Replace -100s in the labels as we can't decode them
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels,
skip_special_tokens=False)
# Remove <PAD> tokens from decoded predictions and labels
decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
# Some simple post-processing
# decoded_preds = [pred.strip() for pred in decoded_preds]
# decoded_labels = [[label.strip()] for label in decoded_labels]
# print(decoded_preds, decoded_labels)
result = metric.compute(predictions=decoded_preds, references=decoded_labels)
return {"bleu": result["score"]}
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 64
# compile
# model = torch.compile(model, backend="inductor", dynamic=True)
# Trainer
args = Seq2SeqTrainingArguments(
f"{save_path}",
# eval_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log",
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
per_device_train_batch_size=128,
per_device_eval_batch_size=128,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,
generation_config=gen_config,
remove_unused_columns=False,
)
trainer = Seq2SeqTrainer(
model,
args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# uncomment to load training from checkpoint
# checkpoint_path = 'default_40_1/checkpoint-5600'
# trainer.train(resume_from_checkpoint=checkpoint_path)
trainer.train()
# execute training
train()