Feat: add rule-based post-processing
Others:
- add basic data analysis to get histograms of text differences
- add new final delivery model
Parent: 481bcf88b7
Commit: c5760d127d
@@ -1 +1,2 @@
 *.zip
+post_processor
@@ -7,6 +7,14 @@ full_df = pd.read_csv(data_path, skipinitialspace=True)
 mdm_list = sorted(list((set(full_df['pattern']))))

+# %%
+full_df
+
+# %%
+mask1 = full_df['thing'] == 'ME1TurboCharger1'
+mask2 = full_df['property'] == 'LOInletPress'
+mask = mask1 & mask2
+full_df[mask]

 # %%
 len(mdm_list)
 # %%
@@ -16,3 +24,12 @@ tp_list = sorted(list(set(thing_property)))
 # %%
 len(tp_list)
 # %%
+data_path = '../../data_import/exports/raw_data.csv'
+df = pd.read_csv(data_path, skipinitialspace=True)
+
+# %%
+bad_df = df[~df['MDM']]
+# %%
+bad_df[bad_df['thing'] == '$UNMAPPED']
+
+# %%
@@ -9,5 +9,5 @@ df = pd.read_csv(data_path)
 df

 # %%
-set(df['signal_type'])
+len(set(df['ships_idx']))
 # %%
@@ -0,0 +1,71 @@
# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# note: we assume that you will execute from the directory of this code
# check your current directory
print("Current Working Directory:", os.getcwd())


# %%
# plt.rcParams.update({'font.size': 18})

df = pd.read_csv('../../data_import/exports/raw_data.csv')
total_counts = df['ships_idx'].value_counts().sort_index()

mdm_true_counts = df[df['MDM']]['ships_idx'].value_counts().sort_index()

summary_df = pd.DataFrame({
    'SD': total_counts,
    'PD': mdm_true_counts
}).fillna(0)

total_SD = summary_df['SD'].sum()
total_PD = summary_df['PD'].sum()

print(f"Total SD: {total_SD}")
print(f"Total PD: {total_PD}")

# %%
plt.figure(figsize=(8, 6))
fig, ax = plt.subplots(figsize=(8, 6))

summary_df['SD'].plot(
    kind='bar',
    ax=ax,
    color='orange',
    alpha=0.5,
    label='Ship Domain',
    width=0.8)

summary_df['PD'].plot(
    kind='bar',
    ax=ax,
    color='blue',
    alpha=0.7,
    label='Platform Domain',
    width=0.8)

x_labels = ax.get_xticks()
ax.set_xticks(np.arange(min(x_labels), max(x_labels) + 1, 10))
ax.set_xticklabels(
    [int(label) for label in np.arange(min(x_labels), max(x_labels) + 1, 10)],
    rotation=0,
)

ax.grid(True)

# plt.legend(prop={'size': 18})
plt.legend()
plt.ylabel('Counts')
plt.xlabel('Ships')

plt.savefig('count_statistics_of_each_ship.png')

plt.show()


# %%
@@ -0,0 +1,38 @@
# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# note: we assume that you will execute from the directory of this code
# check your current directory
print("Current Working Directory:", os.getcwd())


# %%
data_path = "../../data_preprocess/exports/combined_group_allocation.csv"
df = pd.read_csv(data_path)

# %%
df

# %%
print('mean', df[df['Allocation'] == 'BGKF']['Comb_count'].mean())
print('std', df[df['Allocation'] == 'BGKF']['Comb_count'].std())
max_count = df[df['Allocation'] == 'BGKF']['Comb_count'].max()
min_count = df[df['Allocation'] == 'BGKF']['Comb_count'].min()
print('max', max_count)
print('min', min_count)
print('max - min', max_count - min_count)

# %%
print('mean', df[df['Allocation'] == 'GKF']['Comb_count'].mean())
print('std', df[df['Allocation'] == 'GKF']['Comb_count'].std())
max_count = df[df['Allocation'] == 'GKF']['Comb_count'].max()
min_count = df[df['Allocation'] == 'GKF']['Comb_count'].min()
print('max', max_count)
print('min', min_count)
print('max - min', max_count - min_count)


# %%
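# %%
# (Illustrative cell added by the editor, not part of the original script.)
# The per-allocation statistics above could equivalently be computed in one pass
# with pandas groupby/agg; a minimal sketch, assuming the same 'Allocation' and
# 'Comb_count' columns loaded above:
alloc_stats = df.groupby('Allocation')['Comb_count'].agg(['mean', 'std', 'max', 'min'])
alloc_stats['range'] = alloc_stats['max'] - alloc_stats['min']
print(alloc_stats)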
@@ -0,0 +1,72 @@
# %%
import pandas as pd
import Levenshtein
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm


# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(df['pattern']))))


# %%
df['thing_property'] = df['thing'] + df['property']

# %%
def compute_norm_leven(string1, string2):
    max_distance = max(len(string1), len(string2))
    leven_distance = Levenshtein.distance(string1, string2)
    norm_leven = leven_distance / max_distance
    return norm_leven
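# %%
# (Illustrative cell added by the editor, not part of the original script.)
# compute_norm_leven divides the raw edit distance by the longer string's length,
# so identical strings score 0.0 and completely dissimilar strings approach 1.0.
# A minimal sanity check using made-up strings:
print(compute_norm_leven("ME1 LO INLET PRESS", "ME1 LO INLET PRESS"))            # 0.0
print(compute_norm_leven("ME1 LO INLET PRESS", "ME1TurboCharger1LOInletPress"))  # > 0.0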

# %%
n = len(df)
distance_array = np.zeros((n), dtype=float)

desc_array = df['tag_description']
thing_property_array = df['thing_property']

# %%
# compute normalized levenshtein distance
for i in tqdm(range(n)):
    string1 = desc_array[i]
    string2 = thing_property_array[i]
    distance_array[i] = compute_norm_leven(string1, string2)

# %%
distance_array


# %%
plt.figure(figsize=(8, 6))
plt.hist(distance_array, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)

# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):
    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }

stats = summary_stats(distance_array)

for key, value in stats.items():
    print(f"{key}: {value}")

# %%
@@ -0,0 +1,88 @@
# %%
import pandas as pd
import Levenshtein
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm


# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
df['thing_property'] = df['thing'] + df['property']
mdm_list = sorted(list((set(df['thing_property']))))


# %%
def compute_norm_leven(string1, string2):
    max_distance = max(len(string1), len(string2))
    leven_distance = Levenshtein.distance(string1, string2)
    norm_leven = leven_distance / max_distance
    return norm_leven

def compute_avg_score(strings):
    n = len(strings)

    # if the group only has 1 string, then it is fully similar to itself
    if n == 1:
        return 0

    # Create an empty matrix
    distance_matrix = np.zeros((n, n), dtype=float)

    # Fill only the upper triangular part
    for i in range(n):
        for j in range(i + 1, n):
            dist = compute_norm_leven(strings[i], strings[j])
            distance_matrix[i, j] = dist

    upper_triangular_distances = distance_matrix[np.triu_indices(n, k=1)]
    mean_distance = np.mean(upper_triangular_distances)
    return mean_distance
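# %%
# (Illustrative cell added by the editor, not part of the original script.)
# compute_avg_score averages the pairwise normalized Levenshtein distances within
# one class, so a class whose descriptions are all identical scores 0.0.
# A minimal sanity check using made-up strings:
print(compute_avg_score(["MAIN ENGINE", "MAIN ENGINE", "MAIN ENGINE"]))  # 0.0
print(compute_avg_score(["MAIN ENGINE"]))                                # 0 (single-member group)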

# %%
# we want to subset to each class
n = len(mdm_list)
score_list = np.zeros((n), dtype=float)

for i in range(n):
    df_subset = df[df['thing_property'] == mdm_list[i]]
    strings = df_subset['tag_description'].to_numpy()
    score_list[i] = compute_avg_score(strings)


# %%
score_list


# %%
# plt.hist(score_list, bins=50)
plt.figure(figsize=(8, 6))
plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Platform Domain Class Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)

# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):
    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }

stats = summary_stats(score_list)

for key, value in stats.items():
    print(f"{key}: {value}")

# %%
@@ -181,6 +181,7 @@ remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values

 # Allocate remaining ships to the groups
 while len(remaining_ships) > 0:
+    # re-compute the counts for each group
     group_comb_counts = []
     for g in range(num_groups):
         group_ships = groups[g]

@@ -190,18 +191,21 @@ while len(remaining_ships) > 0:

     group_comb_counts.sort(key=lambda x: x[1])

+    # reset the remaining_group list
     remaining_group = []
+    # g is the identifier for the group
     for g, _ in group_comb_counts:
         if len(remaining_ships) == 0:
             break

+        # compute for each group, the selected ship, and the combined count increase
         if group_comb_counts.index((g, _)) == 0:
             selected_ship_idx, comb_increase = find_max_increase_ship(groups, g, remaining_ships, mdm_true)
         else:
             max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true)
             selected_ship_idx, comb_increase = find_closest_comb_count_ship(groups, g, remaining_ships, mdm_true, max_comb_count)

+        # if the combined increase is 0, then we process it in a special manner
         if comb_increase == 0:
             remaining_group.append(g)
         else:
@@ -2,7 +2,7 @@
 # %%
 import pandas as pd
 import os
-from inference import Inference, Embedder_t5_encoder, Embedder_t5_decoder
+from inference import Inference, Embedder_t5_encoder
 import numpy as np
 from sklearn.manifold import TSNE
 import matplotlib.pyplot as plt
@@ -0,0 +1,3 @@
*.csv
fold_*
__pycache__
@@ -0,0 +1,154 @@
# Shaft# class post-processing
shaft_rules = [
    {"conditions": [lambda x: "NO.1" in x, lambda x: "SHAFT" in x],
     "action": 'Shaft1'},
    {"conditions": [lambda x: "NO.1" in x, lambda x: "Shaft" in x],
     "action": 'Shaft1'},
    {"conditions": [lambda x: "NO.2" in x, lambda x: "Shaft" in x],
     "action": 'Shaft2'},
    {"conditions": [lambda x: "NO.2" in x, lambda x: "SHAFT" in x],
     "action": 'Shaft2'},
    {"conditions": [lambda x: "NO.1" not in x, lambda x: "NO.2" not in x, lambda x: "SHAFT" not in x],
     "action": 'Shaft1'},
    {"conditions": [lambda x: "NO.1" not in x, lambda x: "NO.2" not in x, lambda x: "SHAFT" in x, lambda x: "(P)" in x],
     "action": 'Shaft2'},
    {"conditions": [lambda x: "NO.1" not in x, lambda x: "NO.2" not in x, lambda x: "SHAFT" in x, lambda x: "(S)" in x],
     "action": 'Shaft1'},
    {"conditions": [lambda x: "NO.1" not in x, lambda x: "NO.2" not in x, lambda x: "SHAFT" in x, lambda x: "(S)" not in x, lambda x: "(P)" not in x],
     "action": 'Shaft1'},
]
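
# (Illustrative block added by the editor, not part of the original file.)
# Each rule pairs a list of string predicates with the class name to assign when
# all of them hold; the consumer (apply_rules in the post-processing script) walks
# the list in order and returns the first matching "action". A minimal self-check,
# guarded so it does not run when this module is imported:
if __name__ == "__main__":
    x = "NO.1 SHAFT POWER"
    matched = next((rule["action"] for rule in shaft_rules
                    if all(cond(x) for cond in rule["conditions"])), None)
    print(matched)  # expected: 'Shaft1' (first rule: both "NO.1" and "SHAFT" present)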


# ME# class post-processing
ME_rules = [
    {"conditions": [lambda x: "ME" in x, lambda x: "(P)" not in x, lambda x: "(S)" not in x, lambda x: "GE" not in x, lambda x: "FLOW" in x],
     "action": 'ME1Flow'},
    {"conditions": [lambda x: "ME" in x, lambda x: "(P)" in x, lambda x: "FLOW" in x],
     "action": 'ME2Flow'},
    {"conditions": [lambda x: "ME" in x, lambda x: "(S)" in x, lambda x: "FLOW" in x],
     "action": 'ME1Flow'},
    {"conditions": [lambda x: "ME" not in x, lambda x: "GE" not in x, lambda x: "(P)" not in x, lambda x: "(S)" not in x, lambda x: "FLOW" in x],
     "action": 'ME1Flow'},
    {"conditions": [lambda x: "ME" in x, lambda x: "GE" not in x, lambda x: "(P)" not in x, lambda x: "(S)" not in x, lambda x: "FLOW" not in x, lambda x: "CONSUMPTION" in x],
     "action": 'ME1Flow'},
]

# GEFlow rules
GEFlow_rules = [
    {"conditions": [lambda x: "NO." not in x, lambda x: "GE" in x, lambda x: "MGO" in x],
     "action": 'GE1Flow'},
    {"conditions": [lambda x: "NO.1" in x, lambda x: "GE" in x, lambda x: "MGO" in x],
     "action": 'GE1Flow'},
    {"conditions": [lambda x: "NO.2" in x, lambda x: "GE" in x, lambda x: "MGO" in x],
     "action": 'GE2Flow'},
    {"conditions": [lambda x: "NO.3" in x, lambda x: "GE" in x, lambda x: "MGO" in x],
     "action": 'GE3Flow'},
    {"conditions": [lambda x: "NO." not in x, lambda x: "GE" in x, lambda x: "CONSUMPTION" in x],
     "action": 'GE1Flow'},
]
@ -0,0 +1,233 @@
|
||||||
|
# %%
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.metrics import classification_report
|
||||||
|
|
||||||
|
from post_processing_rules import shaft_rules, ME_rules, GEFlow_rules
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# Function to print classification metrics
|
||||||
|
def print_classification_metrics(df):
|
||||||
|
print("Results before post-processing")
|
||||||
|
# print(classification_report(df["thing"], df["p_thing"].fillna("")))
|
||||||
|
report = classification_report(df["thing"], df["p_thing"].fillna(""), output_dict=True)
|
||||||
|
# Extracting the weighted average values for precision, recall, and F1-score
|
||||||
|
precision_avg = report['weighted avg']['precision']
|
||||||
|
recall_avg = report['weighted avg']['recall']
|
||||||
|
f1_avg = report['weighted avg']['f1-score']
|
||||||
|
# Print the averages
|
||||||
|
print("Average Precision:", precision_avg)
|
||||||
|
print("Average Recall:", recall_avg)
|
||||||
|
print("Average F1-Score:", f1_avg)
|
||||||
|
|
||||||
|
print("**************")
|
||||||
|
|
||||||
|
print("Results after post-processing")
|
||||||
|
# print(classification_report(df["thing"], df["edited_p_thing"].fillna("")))
|
||||||
|
report = classification_report(df["thing"], df["edited_p_thing"].fillna(""), output_dict=True)
|
||||||
|
# Extracting the weighted average values for precision, recall, and F1-score
|
||||||
|
precision_avg = report['weighted avg']['precision']
|
||||||
|
recall_avg = report['weighted avg']['recall']
|
||||||
|
f1_avg = report['weighted avg']['f1-score']
|
||||||
|
# Print the averages
|
||||||
|
print("Average Precision:", precision_avg)
|
||||||
|
print("Average Recall:", recall_avg)
|
||||||
|
print("Average F1-Score:", f1_avg)
|
||||||
|
|
||||||
|
|
||||||
|
def read_data():
|
||||||
|
BASE_FOLDER = "fold"
|
||||||
|
BASE_FILE_NAME = "result_group"
|
||||||
|
NUM_FOLDS = 5
|
||||||
|
|
||||||
|
# List for storing all DataFrames
|
||||||
|
dataframes = []
|
||||||
|
|
||||||
|
# Iterate over all fold
|
||||||
|
for i in range(NUM_FOLDS):
|
||||||
|
fold_folder = f"{BASE_FOLDER}_{i+1}"
|
||||||
|
desired_file_name = f"{BASE_FILE_NAME}_{i+1}.csv"
|
||||||
|
for file_name in os.listdir(fold_folder):
|
||||||
|
if file_name==desired_file_name:
|
||||||
|
file_path = os.path.join(fold_folder, file_name)
|
||||||
|
|
||||||
|
df = pd.read_csv(file_path, index_col=0)
|
||||||
|
dataframes.append(df)
|
||||||
|
|
||||||
|
# Combine all DataFrames into one
|
||||||
|
final_dataframe = pd.concat(dataframes)
|
||||||
|
final_dataframe = final_dataframe[final_dataframe["MDM"]].reset_index(drop=True)
|
||||||
|
# assign a copy
|
||||||
|
final_dataframe["edited_p_thing"] = final_dataframe["p_thing"]
|
||||||
|
return final_dataframe
|
||||||
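# (Editor's note, not part of the original script.) read_data assumes the per-fold
# prediction files produced earlier in the pipeline sit next to this script as
# fold_1/result_group_1.csv ... fold_5/result_group_5.csv, and it keeps only the
# rows flagged as MDM before post-processing.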
|
|
||||||
|
|
||||||
|
def update_shipboiler_p_thing(df, tag_column='tag_description', thing_pred_column='edited_p_thing'):
|
||||||
|
"""
|
||||||
|
Update the 'thing' column in the DataFrame based on rules applied to 'tag_description' column.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- df: DataFrame to apply the function on.
|
||||||
|
- tag_column: Column name containing descriptions to base the logic on.
|
||||||
|
- thing_pred_column: Column to be updated with new values based on conditions.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- DataFrame with updated 'thing' column.
|
||||||
|
"""
|
||||||
|
# Fill NaN values in tag column to avoid errors with ~ operator
|
||||||
|
df[tag_column] = df[tag_column].fillna('')
|
||||||
|
|
||||||
|
# Apply rules for "NO.1", "NO.2", and "AUXILIARY" in tag_column
|
||||||
|
df.loc[df[tag_column].str.contains("NO.1", case=False), thing_pred_column] = "ShipBoiler1"
|
||||||
|
df.loc[df[tag_column].str.contains("NO.2", case=False), thing_pred_column] = "ShipBoiler2"
|
||||||
|
df.loc[(df[tag_column].str.contains("AUXILIARY", case=False)) & (~df[tag_column].str.contains("NO.", case=False)), thing_pred_column] = "ShipBoiler1"
|
||||||
|
|
||||||
|
# Determine the highest number for "thing" in rows without "COMPOSITE" and assign to "COMPOSITE" rows
|
||||||
|
max_boiler_number = df.loc[~df[tag_column].str.contains("COMPOSITE", case=False) & df[thing_pred_column].str.contains("ShipBoiler", na=False), thing_pred_column] \
|
||||||
|
.str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1
|
||||||
|
composite_boiler = f"ShipBoiler{int(max_boiler_number)}"
|
||||||
|
|
||||||
|
# Apply composite_boiler to rows containing "COMPOSITE"
|
||||||
|
df.loc[df[tag_column].str.contains("COMPOSITE", case=False), thing_pred_column] = composite_boiler
|
||||||
|
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def process_boiler_data(boiler_data):
|
||||||
|
updated_dataframes = []
|
||||||
|
for _, group in boiler_data.groupby('ships_idx'):
|
||||||
|
group_copy = group.copy()
|
||||||
|
|
||||||
|
group_copy["tag_description"] = group_copy["tag_description"].fillna('')
|
||||||
|
|
||||||
|
contains_no1_cond = group_copy["tag_description"].str.contains("NO.1")
|
||||||
|
contains_no2_cond = group_copy["tag_description"].str.contains("NO.2")
|
||||||
|
contains_aux_cond = group_copy["tag_description"].str.contains("AUXILIARY")
|
||||||
|
doesnt_contain_no_cond = ~group_copy["tag_description"].str.contains("NO.")
|
||||||
|
contains_comp_cond = group_copy["tag_description"].str.contains("COMPOSITE")
|
||||||
|
contains_shipboiler_cond = group_copy["edited_p_thing"].str.contains("ShipBoiler")
|
||||||
|
|
||||||
|
group_copy.loc[contains_no1_cond, "edited_p_thing"] = "ShipBoiler1"
|
||||||
|
group_copy.loc[contains_no2_cond, "edited_p_thing"] = "ShipBoiler2"
|
||||||
|
group_copy.loc[contains_aux_cond&doesnt_contain_no_cond, "edited_p_thing"] = "ShipBoiler1"
|
||||||
|
|
||||||
|
if ((~contains_comp_cond) & (contains_shipboiler_cond)).any():
|
||||||
|
max_boiler_number = group_copy.loc[(~contains_comp_cond)&(contains_shipboiler_cond), "edited_p_thing"].str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1
|
||||||
|
composite_boiler = f"ShipBoiler{int(max_boiler_number)}"
|
||||||
|
|
||||||
|
if max_boiler_number > 3:
|
||||||
|
max_boiler_number = 3
|
||||||
|
|
||||||
|
group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = composite_boiler
|
||||||
|
else:
|
||||||
|
group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = "ShipBoiler1"
|
||||||
|
|
||||||
|
updated_dataframes.append(group_copy) # Collect updated group
|
||||||
|
|
||||||
|
# Step 2: Concatenate all updated groups
|
||||||
|
updated_boiler_data = pd.concat(updated_dataframes)
|
||||||
|
return updated_boiler_data
|
||||||
|
|
||||||
|
|
||||||
|
def check_conditions(value,conditions):
|
||||||
|
# Check if a value satisfies all conditions
|
||||||
|
return all(condition(value) for condition in conditions)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_rules(description, thing, rules):
|
||||||
|
#Processes the description according to the rule table and returns the replacement value if the condition is met, otherwise returns the thing value
|
||||||
|
for rule in rules:
|
||||||
|
if check_conditions(description, rule["conditions"]): #Check that all conditions are met
|
||||||
|
return rule["action"] #Execute the action and return the result
|
||||||
|
return thing #Returns the value of the thing column if it doesn't match any of the rules
|
||||||
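# %%
# (Illustrative cell added by the editor, not part of the original script.)
# apply_rules falls back to the model prediction when no rule matches, so it only
# overrides p_thing for descriptions that the rule tables recognise.
# A minimal sanity check using made-up inputs:
print(apply_rules("NO.2 SHAFT TORQUE", "ShaftX", shaft_rules))   # -> 'Shaft2' (rule fires)
print(apply_rules("UNRELATED SENSOR", "ME1Flow", GEFlow_rules))  # -> 'ME1Flow' (no rule fires)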
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# if __name__ == "__main__":
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# Read and preprocess data
|
||||||
|
final_dataframe = read_data()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
final_dataframe
|
||||||
|
# %%
|
||||||
|
# Hwanggawi main function
|
||||||
|
#Get partial columns
|
||||||
|
TP_df = final_dataframe.loc[:, ['thing', 'property','p_thing','p_property','tag_description','MDM']].copy()
|
||||||
|
|
||||||
|
#Shaft
|
||||||
|
SF_df = TP_df[TP_df['thing'].str.contains(('Shaft'), case=False, na=False)]
|
||||||
|
SF_df_in_MDM = SF_df[(SF_df['MDM'])]
|
||||||
|
|
||||||
|
#ME
|
||||||
|
ME_df = TP_df[TP_df['thing'].str.contains(('ME1Flow'), case=False, na=False)|
|
||||||
|
TP_df['thing'].str.contains(('ME2Flow'), case=False, na=False)|TP_df['thing'].str.contains(('ME3Flow'), case=False, na=False)]
|
||||||
|
ME_df_in_MDM = ME_df[(ME_df['MDM'])]
|
||||||
|
|
||||||
|
#GE
|
||||||
|
GE_df = TP_df[TP_df['thing'].str.contains(('GE1Flow'), case=False, na=False)|
|
||||||
|
TP_df['thing'].str.contains(('GE2Flow'), case=False, na=False)|TP_df['thing'].str.contains(('GE3Flow'), case=False, na=False)]
|
||||||
|
GE_df_in_MDM = GE_df[(GE_df['MDM'])]
|
||||||
|
|
||||||
|
SF_df_in_MDM['standardize_desc'] = SF_df_in_MDM['tag_description'].copy()
|
||||||
|
GE_df_in_MDM['standardize_desc'] = GE_df_in_MDM['tag_description'].copy()
|
||||||
|
ME_df_in_MDM['standardize_desc'] = ME_df_in_MDM['tag_description'].copy()
|
||||||
|
|
||||||
|
# ShipBoiler class post-processing
|
||||||
|
mdm = final_dataframe[final_dataframe["MDM"]].copy()
|
||||||
|
boiler_data = mdm[mdm["thing"].str.contains("Boiler")].copy()
|
||||||
|
|
||||||
|
blr_cond = boiler_data["tag_description"].str.lower().str.contains("blr")
|
||||||
|
boiler_cond = boiler_data["tag_description"].str.lower().str.contains("boiler")
|
||||||
|
boiler_data.shape[0]-(boiler_data[blr_cond].shape[0]+boiler_data[boiler_cond].shape[0])
|
||||||
|
different_cond = boiler_data[~(blr_cond|boiler_cond)].copy()
|
||||||
|
|
||||||
|
unique_ships_idxs = boiler_data["ships_idx"].unique()
|
||||||
|
|
||||||
|
boiler_data["edited_p_thing"] = boiler_data["p_thing"]
|
||||||
|
|
||||||
|
updated_boiler_data = process_boiler_data(boiler_data)
|
||||||
|
|
||||||
|
# Save updated data back to the original DataFrame
|
||||||
|
final_dataframe.loc[updated_boiler_data.index, "edited_p_thing"] = updated_boiler_data["edited_p_thing"]
|
||||||
|
|
||||||
|
|
||||||
|
result = SF_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], shaft_rules),axis=1)
|
||||||
|
SF_df_in_MDM['edited_p_thing'] = result
|
||||||
|
|
||||||
|
# Save updated data back to the original DataFrame
|
||||||
|
final_dataframe.loc[SF_df_in_MDM.index, "edited_p_thing"] = SF_df_in_MDM['edited_p_thing']
|
||||||
|
|
||||||
|
|
||||||
|
result = ME_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], ME_rules),axis=1)
|
||||||
|
ME_df_in_MDM['edited_p_thing'] = result
|
||||||
|
final_dataframe.loc[ME_df_in_MDM.index, "edited_p_thing"] = ME_df_in_MDM['edited_p_thing']
|
||||||
|
|
||||||
|
|
||||||
|
result = GE_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'],x['p_thing'], GEFlow_rules),axis=1)
|
||||||
|
GE_df_in_MDM['edited_p_thing'] = result
|
||||||
|
final_dataframe.loc[GE_df_in_MDM.index, "edited_p_thing"] = GE_df_in_MDM['edited_p_thing']
|
||||||
|
|
||||||
|
|
||||||
|
# output final dataframe
|
||||||
|
final_dataframe.to_csv("post_processed_df.csv", index=False)
|
||||||
|
print("Saved output to post_processed_df.csv")
|
||||||
|
# %%
|
||||||
|
# print results
|
||||||
|
print("ShipBoiler post-processing results")
|
||||||
|
print_classification_metrics(updated_boiler_data)
|
||||||
|
print("----------------------------")
|
||||||
|
|
||||||
|
print("ShipBoiler post-processing results")
|
||||||
|
print_classification_metrics(ME_df_in_MDM)
|
||||||
|
print("----------------------------")
|
||||||
|
|
||||||
|
print("ShipBoiler post-processing results")
|
||||||
|
print_classification_metrics(GE_df_in_MDM)
|
||||||
|
print("----------------------------")
|
||||||
|
|
||||||
|
# %%
|
|
@@ -265,7 +265,7 @@ def run_deduplication(
     print('generate train embeddings')
     train_embedder = Embedder(input_df=train_df, batch_size=batch_size)
     tensor = train_embedder.make_embedding(checkpoint_path)
-    torch.save(tensor, file_path, weights_only=True)
+    torch.save(tensor, file_path)
     print("Tensor saved to file.")

     train_embeds = tensor
@@ -0,0 +1,158 @@
# Shaft# class post-processing
shaft_rules = [
    {"conditions": [lambda x: "NO1" in x, lambda x: "SHAFT" in x],
     "action": 'Shaft1'},
    {"conditions": [lambda x: "NO1" in x, lambda x: "Shaft" in x],
     "action": 'Shaft1'},
    {"conditions": [lambda x: "NO2" in x, lambda x: "Shaft" in x],
     "action": 'Shaft2'},
    {"conditions": [lambda x: "NO2" in x, lambda x: "SHAFT" in x],
     "action": 'Shaft2'},
    {"conditions": [lambda x: "NO1" not in x, lambda x: "NO2" not in x, lambda x: "SHAFT" not in x],
     "action": 'Shaft1'},
    {"conditions": [lambda x: "NO1" not in x, lambda x: "NO2" not in x, lambda x: "SHAFT" in x, lambda x: "(P)" in x],
     "action": 'Shaft2'},
    {"conditions": [lambda x: "NO1" not in x, lambda x: "NO2" not in x, lambda x: "SHAFT" in x, lambda x: "(S)" in x],
     "action": 'Shaft1'},
    {"conditions": [lambda x: "NO1" not in x, lambda x: "NO2" not in x, lambda x: "SHAFT" in x, lambda x: "(S)" not in x, lambda x: "(P)" not in x],
     "action": 'Shaft1'},
]


# ME# class post-processing
ME_rules = [
    {"conditions": [lambda x: "ME" in x, lambda x: "(P)" not in x, lambda x: "(S)" not in x, lambda x: "GE" not in x, lambda x: "FLOW" in x],
     "action": 'ME1Flow'},
    {"conditions": [lambda x: "ME" in x, lambda x: "(P)" in x, lambda x: "FLOW" in x],
     "action": 'ME2Flow'},
    {"conditions": [lambda x: "ME" in x, lambda x: "(S)" in x, lambda x: "FLOW" in x],
     "action": 'ME1Flow'},
    {"conditions": [lambda x: "ME" not in x, lambda x: "GE" not in x, lambda x: "(P)" not in x, lambda x: "(S)" not in x, lambda x: "FLOW" in x],
     "action": 'ME1Flow'},
    {"conditions": [lambda x: "ME" in x, lambda x: "GE" not in x, lambda x: "(P)" not in x, lambda x: "(S)" not in x, lambda x: "FLOW" not in x, lambda x: "CONSUMPTION" in x],
     "action": 'ME1Flow'},
]

# GEFlow rules
GEFlow_rules = [
    {"conditions": [lambda x: "NO" not in x, lambda x: "GE" in x, lambda x: "MGO" in x],
     "action": 'GE1Flow'},
    {"conditions": [lambda x: "NO1" in x, lambda x: "GE" in x, lambda x: "MGO" in x],
     "action": 'GE1Flow'},
    {"conditions": [lambda x: "NO2" in x, lambda x: "GE" in x, lambda x: "MGO" in x],
     "action": 'GE2Flow'},
    {"conditions": [lambda x: "NO3" in x, lambda x: "GE" in x, lambda x: "MGO" in x],
     "action": 'GE3Flow'},
    {"conditions": [lambda x: "NO" not in x, lambda x: "GE" in x, lambda x: "CONSUMPTION" in x],
     "action": 'GE1Flow'},
]
@@ -1,6 +1,6 @@
 # %%
 import re
-from replacement_dict import desc_replacement_dict, unit_replacement_dict
+from end_to_end.replacement_dict import desc_replacement_dict, unit_replacement_dict

 class Abbreviator:
@ -0,0 +1,122 @@
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from end_to_end.post_processing_rules import shaft_rules, ME_rules, GEFlow_rules
|
||||||
|
|
||||||
|
class Corrector():
|
||||||
|
|
||||||
|
def __init__(self, df):
|
||||||
|
# copy over the existing
|
||||||
|
df['edited_p_thing'] = df['p_thing'].copy()
|
||||||
|
self.df = df
|
||||||
|
|
||||||
|
|
||||||
|
def _process_boiler_data(self, boiler_data):
|
||||||
|
updated_dataframes = []
|
||||||
|
for _, group in boiler_data.groupby('ships_idx'):
|
||||||
|
group_copy = group.copy()
|
||||||
|
|
||||||
|
group_copy["tag_description"] = group_copy["tag_description"].fillna('')
|
||||||
|
|
||||||
|
contains_no1_cond = group_copy["tag_description"].str.contains("NO1")
|
||||||
|
contains_no2_cond = group_copy["tag_description"].str.contains("NO2")
|
||||||
|
contains_aux_cond = group_copy["tag_description"].str.contains("AUXILIARY")
|
||||||
|
doesnt_contain_no_cond = ~group_copy["tag_description"].str.contains("NO")
|
||||||
|
contains_comp_cond = group_copy["tag_description"].str.contains("COMPOSITE")
|
||||||
|
contains_shipboiler_cond = group_copy["edited_p_thing"].str.contains("ShipBoiler")
|
||||||
|
|
||||||
|
group_copy.loc[contains_no1_cond, "edited_p_thing"] = "ShipBoiler1"
|
||||||
|
group_copy.loc[contains_no2_cond, "edited_p_thing"] = "ShipBoiler2"
|
||||||
|
group_copy.loc[contains_aux_cond&doesnt_contain_no_cond, "edited_p_thing"] = "ShipBoiler1"
|
||||||
|
|
||||||
|
if ((~contains_comp_cond) & (contains_shipboiler_cond)).any():
|
||||||
|
max_boiler_number = group_copy.loc[(~contains_comp_cond)&(contains_shipboiler_cond), "edited_p_thing"].str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1
|
||||||
|
composite_boiler = f"ShipBoiler{int(max_boiler_number)}"
|
||||||
|
|
||||||
|
if max_boiler_number > 3:
|
||||||
|
max_boiler_number = 3
|
||||||
|
|
||||||
|
group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = composite_boiler
|
||||||
|
else:
|
||||||
|
group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = "ShipBoiler1"
|
||||||
|
|
||||||
|
updated_dataframes.append(group_copy) # Collect updated group
|
||||||
|
|
||||||
|
# Step 2: Concatenate all updated groups
|
||||||
|
if (len(updated_dataframes) == 0):
|
||||||
|
return boiler_data
|
||||||
|
updated_boiler_data = pd.concat(updated_dataframes)
|
||||||
|
return updated_boiler_data
|
||||||
|
|
||||||
|
def _check_conditions(self, value, conditions):
|
||||||
|
# Check if a value satisfies all conditions
|
||||||
|
return all(condition(value) for condition in conditions)
|
||||||
|
|
||||||
|
def _apply_rules(self, description, thing, rules):
|
||||||
|
#Processes the description according to the rule table and returns the replacement value if the condition is met, otherwise returns the thing value
|
||||||
|
for rule in rules:
|
||||||
|
if self._check_conditions(description, rule["conditions"]): #Check that all conditions are met
|
||||||
|
return rule["action"] #Execute the action and return the result
|
||||||
|
return thing #Returns the value of the thing column if it doesn't match any of the rules
|
||||||
|
|
||||||
|
def run_correction(self):
|
||||||
|
final_dataframe = self.df.copy()
|
||||||
|
# Hwanggawi main function
|
||||||
|
#Get partial columns
|
||||||
|
TP_df = final_dataframe.loc[:, ['thing', 'property','p_thing','p_property','tag_description','MDM']].copy()
|
||||||
|
|
||||||
|
#Shaft
|
||||||
|
SF_df = TP_df[TP_df['thing'].str.contains(('Shaft'), case=False, na=False)]
|
||||||
|
SF_df_in_MDM = SF_df[(SF_df['MDM'])]
|
||||||
|
|
||||||
|
#ME
|
||||||
|
ME_df = TP_df[TP_df['thing'].str.contains(('ME1Flow'), case=False, na=False)|
|
||||||
|
TP_df['thing'].str.contains(('ME2Flow'), case=False, na=False)|
|
||||||
|
TP_df['thing'].str.contains(('ME3Flow'), case=False, na=False)]
|
||||||
|
ME_df_in_MDM = ME_df[(ME_df['MDM'])]
|
||||||
|
|
||||||
|
#GE
|
||||||
|
GE_df = TP_df[TP_df['thing'].str.contains(('GE1Flow'), case=False, na=False)|
|
||||||
|
TP_df['thing'].str.contains(('GE2Flow'), case=False, na=False)|
|
||||||
|
TP_df['thing'].str.contains(('GE3Flow'), case=False, na=False)]
|
||||||
|
GE_df_in_MDM = GE_df[(GE_df['MDM'])]
|
||||||
|
|
||||||
|
SF_df_in_MDM['standardize_desc'] = SF_df_in_MDM['tag_description'].copy()
|
||||||
|
GE_df_in_MDM['standardize_desc'] = GE_df_in_MDM['tag_description'].copy()
|
||||||
|
ME_df_in_MDM['standardize_desc'] = ME_df_in_MDM['tag_description'].copy()
|
||||||
|
|
||||||
|
# ShipBoiler class post-processing
|
||||||
|
mdm = final_dataframe[final_dataframe["MDM"]].copy()
|
||||||
|
boiler_data = mdm[mdm["thing"].str.contains("BOILER")].copy()
|
||||||
|
|
||||||
|
# blr_cond = boiler_data["tag_description"].str.lower().str.contains("BOILER")
|
||||||
|
# boiler_cond = boiler_data["tag_description"].str.lower().str.contains("BOILER")
|
||||||
|
# boiler_data.shape[0]-(boiler_data[blr_cond].shape[0]+boiler_data[boiler_cond].shape[0])
|
||||||
|
|
||||||
|
# different_cond = boiler_data[~(blr_cond|boiler_cond)].copy()
|
||||||
|
|
||||||
|
# unique_ships_idxs = boiler_data["ships_idx"].unique()
|
||||||
|
|
||||||
|
boiler_data["edited_p_thing"] = boiler_data["p_thing"]
|
||||||
|
updated_boiler_data = self._process_boiler_data(boiler_data)
|
||||||
|
final_dataframe.loc[updated_boiler_data.index, "edited_p_thing"] = updated_boiler_data["edited_p_thing"]
|
||||||
|
|
||||||
|
result = SF_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], shaft_rules),axis=1)
|
||||||
|
SF_df_in_MDM['edited_p_thing'] = result
|
||||||
|
final_dataframe.loc[SF_df_in_MDM.index, "edited_p_thing"] = SF_df_in_MDM['edited_p_thing']
|
||||||
|
|
||||||
|
result = ME_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], ME_rules),axis=1)
|
||||||
|
ME_df_in_MDM['edited_p_thing'] = result
|
||||||
|
final_dataframe.loc[ME_df_in_MDM.index, "edited_p_thing"] = ME_df_in_MDM['edited_p_thing']
|
||||||
|
|
||||||
|
|
||||||
|
result = GE_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'],x['p_thing'], GEFlow_rules),axis=1)
|
||||||
|
GE_df_in_MDM['edited_p_thing'] = result
|
||||||
|
final_dataframe.loc[GE_df_in_MDM.index, "edited_p_thing"] = GE_df_in_MDM['edited_p_thing']
|
||||||
|
|
||||||
|
# override p_thing with edited_p_thing
|
||||||
|
final_dataframe['p_thing'] = final_dataframe['edited_p_thing'].copy()
|
||||||
|
|
||||||
|
return final_dataframe
|
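# (Illustrative usage added by the editor, not part of the original file.)
# The end-to-end pipeline (run.py) drives this class roughly as follows:
#     corrector = Corrector(df)         # df carries p_thing/p_property predictions
#     df = corrector.run_correction()   # returns a copy with rule-corrected p_thing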
|
@@ -2,13 +2,16 @@
 import pandas as pd
 import os
 import glob
-from mapper import Mapper
+from end_to_end.mapper import Mapper
-from preprocess import Abbreviator
+from end_to_end.preprocess import Abbreviator
-from deduplication import run_deduplication
+from end_to_end.deduplication import run_deduplication
+from end_to_end.rule_based_correction import Corrector


 # global config
 BATCH_SIZE = 512
 SHIPS_LIST = [1000,1001,1003,1004]
+# SHIPS_LIST = [1000]

 # %%
 # START: we import the raw data csv and extract only a few ships from it to simulate incoming json

@@ -47,6 +50,12 @@ df_out = pd.DataFrame({
 # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
 df = pd.concat([df, df_out], axis=1)

+# %%
+###################################
+# run rule-based correction
+corrector = Corrector(df)
+df = corrector.run_correction()


 # %%
 ####################################

@@ -59,7 +68,7 @@ df = run_deduplication(
     test_df=df,
     train_df=train_df,
     batch_size=BATCH_SIZE,
-    threshold=0.85,
+    threshold=0.9,
     diagnostic=True)

 # %%
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log/
@ -0,0 +1,196 @@
|
||||||
|
# %%
|
||||||
|
|
||||||
|
# from datasets import load_from_disk
|
||||||
|
import os
|
||||||
|
|
||||||
|
os.environ['NCCL_P2P_DISABLE'] = '1'
|
||||||
|
os.environ['NCCL_IB_DISABLE'] = '1'
|
||||||
|
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||||
|
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from transformers import (
|
||||||
|
T5TokenizerFast,
|
||||||
|
AutoModelForSeq2SeqLM,
|
||||||
|
DataCollatorForSeq2Seq,
|
||||||
|
Seq2SeqTrainer,
|
||||||
|
EarlyStoppingCallback,
|
||||||
|
Seq2SeqTrainingArguments
|
||||||
|
)
|
||||||
|
import evaluate
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
# import matplotlib.pyplot as plt
|
||||||
|
from datasets import Dataset, DatasetDict
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
torch.set_float32_matmul_precision('high')
|
||||||
|
|
||||||
|
# outputs a list of dictionaries
|
||||||
|
def process_df_to_dict(df):
|
||||||
|
output_list = []
|
||||||
|
for _, row in df.iterrows():
|
||||||
|
desc = f"<DESC>{row['tag_description']}<DESC>"
|
||||||
|
unit = f"<UNIT>{row['unit']}<UNIT>"
|
||||||
|
element = {
|
||||||
|
'input' : f"{desc}{unit}",
|
||||||
|
'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
|
||||||
|
}
|
||||||
|
output_list.append(element)
|
||||||
|
|
||||||
|
return output_list
|
||||||
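# (Editor's note, not part of the original script.) process_df_to_dict turns each
# row into a seq2seq pair; for an illustrative row with tag_description='ME1 LO
# INLET PRESS', unit='bar', thing='ME1TurboCharger1', property='LOInletPress' it
# would produce:
#   input : '<DESC>ME1 LO INLET PRESS<DESC><UNIT>bar<UNIT>'
#   output: '<THING_START>ME1TurboCharger1<THING_END><PROPERTY_START>LOInletPress<PROPERTY_END>'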
|
|
||||||
|
|
||||||
|
def create_split_dataset():
|
||||||
|
# train
|
||||||
|
data_path = "../../data_preprocess/exports/preprocessed_data.csv"
|
||||||
|
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
train_df = train_df[train_df['MDM']].reset_index(drop=True)
|
||||||
|
|
||||||
|
# valid
|
||||||
|
data_path = "../../data_preprocess/exports/dataset/group_1/valid.csv"
|
||||||
|
validation_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
|
combined_data = DatasetDict({
|
||||||
|
'train': Dataset.from_list(process_df_to_dict(train_df)),
|
||||||
|
'validation' : Dataset.from_list(process_df_to_dict(validation_df)),
|
||||||
|
})
|
||||||
|
return combined_data
|
||||||
|
|
||||||
|
|
||||||
|
# function to perform training for a given fold
|
||||||
|
def train():
|
||||||
|
save_path = 'checkpoint'
|
||||||
|
split_datasets = create_split_dataset()
|
||||||
|
|
||||||
|
# prepare tokenizer
|
||||||
|
|
||||||
|
model_checkpoint = "t5-small"
|
||||||
|
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||||
|
# Define additional special tokens
|
||||||
|
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
|
||||||
|
# Add the additional special tokens to the tokenizer
|
||||||
|
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||||
|
|
||||||
|
max_length = 120
|
||||||
|
|
||||||
|
# given a dataset entry, run it through the tokenizer
|
||||||
|
def preprocess_function(example):
|
||||||
|
input = example['input']
|
||||||
|
target = example['output']
|
||||||
|
# text_target sets the corresponding label to inputs
|
||||||
|
# there is no need to create a separate 'labels'
|
||||||
|
model_inputs = tokenizer(
|
||||||
|
input,
|
||||||
|
text_target=target,
|
||||||
|
max_length=max_length,
|
||||||
|
truncation=True,
|
||||||
|
padding=True
|
||||||
|
)
|
||||||
|
return model_inputs
|
||||||
|
|
||||||
|
# map maps function to each "row" in the dataset
|
||||||
|
# aka the data in the immediate nesting
|
||||||
|
tokenized_datasets = split_datasets.map(
|
||||||
|
preprocess_function,
|
||||||
|
batched=True,
|
||||||
|
num_proc=8,
|
||||||
|
remove_columns=split_datasets["train"].column_names,
|
||||||
|
)
|
||||||
|
|
||||||
|
# https://github.com/huggingface/transformers/pull/28414
|
||||||
|
# model_checkpoint = "google/t5-efficient-tiny"
|
||||||
|
# device_map set to auto to force it to load contiguous weights
|
||||||
|
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
|
||||||
|
|
||||||
|
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
|
||||||
|
# important! after extending tokens vocab
|
||||||
|
model.resize_token_embeddings(len(tokenizer))
|
||||||
|
|
||||||
|
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
|
||||||
|
metric = evaluate.load("sacrebleu")
|
||||||
|
|
||||||
|
|
||||||
|
def compute_metrics(eval_preds):
|
||||||
|
preds, labels = eval_preds
|
||||||
|
# In case the model returns more than the prediction logits
|
||||||
|
if isinstance(preds, tuple):
|
||||||
|
preds = preds[0]
|
||||||
|
|
||||||
|
decoded_preds = tokenizer.batch_decode(preds,
|
||||||
|
skip_special_tokens=False)
|
||||||
|
|
||||||
|
# Replace -100s in the labels as we can't decode them
|
||||||
|
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
||||||
|
decoded_labels = tokenizer.batch_decode(labels,
|
||||||
|
skip_special_tokens=False)
|
||||||
|
|
||||||
|
# Remove <PAD> tokens from decoded predictions and labels
|
||||||
|
decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
|
||||||
|
decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
|
||||||
|
|
||||||
|
# Some simple post-processing
|
||||||
|
# decoded_preds = [pred.strip() for pred in decoded_preds]
|
||||||
|
# decoded_labels = [[label.strip()] for label in decoded_labels]
|
||||||
|
# print(decoded_preds, decoded_labels)
|
||||||
|
|
||||||
|
result = metric.compute(predictions=decoded_preds, references=decoded_labels)
|
||||||
|
return {"bleu": result["score"]}
|
||||||
|
|
||||||
|
|
||||||
|
# Generation Config
|
||||||
|
# from transformers import GenerationConfig
|
||||||
|
gen_config = model.generation_config
|
||||||
|
gen_config.max_length = 64
|
||||||
|
|
||||||
|
# compile
|
||||||
|
# model = torch.compile(model, backend="inductor", dynamic=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Trainer
|
||||||
|
|
||||||
|
args = Seq2SeqTrainingArguments(
|
||||||
|
f"{save_path}",
|
||||||
|
# eval_strategy="epoch",
|
||||||
|
eval_strategy="no",
|
||||||
|
logging_dir="tensorboard-log",
|
||||||
|
logging_strategy="epoch",
|
||||||
|
# save_strategy="epoch",
|
||||||
|
load_best_model_at_end=False,
|
||||||
|
learning_rate=1e-3,
|
||||||
|
per_device_train_batch_size=128,
|
||||||
|
per_device_eval_batch_size=128,
|
||||||
|
auto_find_batch_size=False,
|
||||||
|
ddp_find_unused_parameters=False,
|
||||||
|
weight_decay=0.01,
|
||||||
|
save_total_limit=1,
|
||||||
|
num_train_epochs=80,
|
||||||
|
predict_with_generate=True,
|
||||||
|
bf16=True,
|
||||||
|
push_to_hub=False,
|
||||||
|
generation_config=gen_config,
|
||||||
|
remove_unused_columns=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
trainer = Seq2SeqTrainer(
|
||||||
|
model,
|
||||||
|
args,
|
||||||
|
train_dataset=tokenized_datasets["train"],
|
||||||
|
eval_dataset=tokenized_datasets["validation"],
|
||||||
|
data_collator=data_collator,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
compute_metrics=compute_metrics,
|
||||||
|
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
|
||||||
|
)
|
||||||
|
|
||||||
|
# uncomment to load training from checkpoint
|
||||||
|
# checkpoint_path = 'default_40_1/checkpoint-5600'
|
||||||
|
# trainer.train(resume_from_checkpoint=checkpoint_path)
|
||||||
|
|
||||||
|
trainer.train()
|
||||||
|
|
||||||
|
# execute training
|
||||||
|
train()
|
||||||
|
|