Feat: added rule-based post_processing
Others:
- added basic data analysis to produce histograms of text differences
- added a new final delivery model
This commit is contained in:
parent 481bcf88b7
commit c5760d127d
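For context, the rule tables added in this commit are plain lists of dicts: each "conditions" entry is a predicate over the tag description, and "action" is the class label that overrides the model prediction when every predicate holds. A minimal, self-contained sketch (the example rule mirrors post_processing_rules.py in the diff below; the description string and the fallback prediction are made up for illustration):

# minimal sketch of the rule format used by post_processing.py / rule_based_correction.py
example_rules = [
    { "conditions": [
        lambda x: "NO.1" in x,
        lambda x: "SHAFT" in x
        ],
        "action": 'Shaft1'
    },
]

def apply_rules(description, thing, rules):
    # return the action of the first rule whose conditions all hold, else keep the prediction
    for rule in rules:
        if all(condition(description) for condition in rule["conditions"]):
            return rule["action"]
    return thing

print(apply_rules("NO.1 SHAFT BEARING TEMP", "Shaft2", example_rules))  # prints 'Shaft1'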
@@ -1 +1,2 @@
*.zip
post_processor
@@ -7,6 +7,14 @@ full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(full_df['pattern']))))


# %%
full_df

# %%
mask1 = full_df['thing'] == 'ME1TurboCharger1'
mask2 = full_df['property'] == 'LOInletPress'
mask = mask1 & mask2
full_df[mask]
# %%
len(mdm_list)
# %%
@@ -16,3 +24,12 @@ tp_list = sorted(list(set(thing_property)))
# %%
len(tp_list)
# %%
data_path = '../../data_import/exports/raw_data.csv'
df = pd.read_csv(data_path, skipinitialspace=True)

# %%
bad_df = df[~df['MDM']]
# %%
bad_df[bad_df['thing'] == '$UNMAPPED']

# %%
@@ -9,5 +9,5 @@ df = pd.read_csv(data_path)
df

# %%
set(df['signal_type'])
len(set(df['ships_idx']))
# %%
@@ -0,0 +1,71 @@
# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# note: we assume that you will execute from the directory of this code
# check your current directory
print("Current Working Directory:", os.getcwd())


# %%
# plt.rcParams.update({'font.size': 18})

df = pd.read_csv('../../data_import/exports/raw_data.csv')
total_counts = df['ships_idx'].value_counts().sort_index()

mdm_true_counts = df[df['MDM']]['ships_idx'].value_counts().sort_index()

summary_df = pd.DataFrame({
    'SD': total_counts,
    'PD': mdm_true_counts
}).fillna(0)

total_SD = summary_df['SD'].sum()
total_PD = summary_df['PD'].sum()

print(f"Total SD: {total_SD}")
print(f"Total PD: {total_PD}")

# %%

plt.figure(figsize=(8, 6))
fig, ax = plt.subplots(figsize=(8, 6))

summary_df['SD'].plot(
    kind='bar',
    ax=ax,
    color='orange',
    alpha=0.5,
    label='Ship Domain',
    width=0.8)

summary_df['PD'].plot(
    kind='bar',
    ax=ax,
    color='blue',
    alpha=0.7,
    label='Platform Domain',
    width=0.8)

x_labels = ax.get_xticks()
ax.set_xticks(np.arange(min(x_labels), max(x_labels) + 1, 10))
ax.set_xticklabels(
    [int(label) for label in np.arange(min(x_labels), max(x_labels) + 1, 10)],
    rotation=0,
)

ax.grid(True)

# plt.legend(prop={'size': 18})
plt.legend()
plt.ylabel('Counts')
plt.xlabel('Ships')

plt.savefig('count_statistics_of_each_ship.png')

plt.show()


# %%
@@ -0,0 +1,38 @@
# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# note: we assume that you will execute from the directory of this code
# check your current directory
print("Current Working Directory:", os.getcwd())



# %%
data_path = "../../data_preprocess/exports/combined_group_allocation.csv"
df = pd.read_csv(data_path)

# %%
df
# %%
print('mean', df[df['Allocation'] == 'BGKF']['Comb_count'].mean())
print('std', df[df['Allocation'] == 'BGKF']['Comb_count'].std())
max = df[df['Allocation'] == 'BGKF']['Comb_count'].max()
min = df[df['Allocation'] == 'BGKF']['Comb_count'].min()
print('max', max)
print('min', min)
print('max - min', max - min)

# %%
print('mean', df[df['Allocation'] == 'GKF']['Comb_count'].mean())
print('std', df[df['Allocation'] == 'GKF']['Comb_count'].std())
max = df[df['Allocation'] == 'GKF']['Comb_count'].max()
min = df[df['Allocation'] == 'GKF']['Comb_count'].min()
print('max', max)
print('min', min)
print('max - min', max - min)


# %%
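As a side note, the BGKF/GKF summary above can be collapsed into a single pandas aggregation; a sketch using the column names from this file (Allocation, Comb_count):

summary = df.groupby('Allocation')['Comb_count'].agg(['mean', 'std', 'max', 'min'])
summary['range'] = summary['max'] - summary['min']
print(summary.loc[['BGKF', 'GKF']])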
@@ -0,0 +1,72 @@
# %%
import pandas as pd
import Levenshtein
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm


# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(df['pattern']))))


# %%
df['thing_property'] = df['thing'] + df['property']

# %%
def compute_norm_leven(string1, string2):
    max_distance = max(len(string1), len(string2))
    leven_distance = Levenshtein.distance(string1, string2)
    norm_leven = leven_distance / max_distance
    return norm_leven

# %%
n = len(df)
distance_array = np.zeros((n), dtype=float)

desc_array = df['tag_description']
thing_property_array = df['thing_property']

# %%
# compute normalized levenshtein distance
for i in tqdm(range(n)):
    string1 = desc_array[i]
    string2 = thing_property_array[i]
    distance_array[i] = compute_norm_leven(string1, string2)

# %%
distance_array


# %%
plt.figure(figsize=(8, 6))
plt.hist(distance_array, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)
#
# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):
    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }

stats = summary_stats(distance_array)

for key, value in stats.items():
    print(f"{key}: {value}")

# %%
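A quick worked illustration of the normalized distance defined above (the two strings are invented; Levenshtein.distance is the same call used in the script):

import Levenshtein

s1 = "ME1 T/C LO INLET PRESS"        # hypothetical tag_description
s2 = "ME1TurboChargerLOInletPress"   # hypothetical thing + property
norm = Levenshtein.distance(s1, s2) / max(len(s1), len(s2))
print(round(norm, 3))  # 0.0 means identical, 1.0 means completely different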
@@ -0,0 +1,88 @@
# %%
import pandas as pd
import Levenshtein
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm


# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
df['thing_property'] = df['thing'] + df['property']
mdm_list = sorted(list((set(df['thing_property']))))


# %%
def compute_norm_leven(string1, string2):
    max_distance = max(len(string1), len(string2))
    leven_distance = Levenshtein.distance(string1, string2)
    norm_leven = leven_distance / max_distance
    return norm_leven

def compute_avg_score(strings):
    n = len(strings)

    # if group only has 1 string, then it is fully similar to itself
    if n == 1:
        return 0

    # Create an empty matrix
    distance_matrix = np.zeros((n, n), dtype=float)

    # Fill only the upper triangular part
    for i in range(n):
        for j in range(i + 1, n):
            dist = compute_norm_leven(strings[i], strings[j])
            distance_matrix[i, j] = dist

    upper_triangular_distances = distance_matrix[np.triu_indices(n, k=1)]
    mean_distance = np.mean(upper_triangular_distances)
    return mean_distance


# %%
# we want to subset to each class
n = len(mdm_list)
score_list = np.zeros((n), dtype=float)

for i in range(n):
    df_subset = df[df['thing_property'] == mdm_list[i]]
    strings = df_subset['tag_description'].to_numpy()
    score_list[i] = compute_avg_score(strings)


# %%
score_list


# %%
# plt.hist(score_list, bins=50)
plt.figure(figsize=(8, 6))
plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Platform Domain Class Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)
# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):
    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }

stats = summary_stats(score_list)

for key, value in stats.items():
    print(f"{key}: {value}")

# %%
@@ -181,6 +181,7 @@ remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values

# Allocate remaining ships to the groups
while len(remaining_ships) > 0:
    # re-compute the counts for each group
    group_comb_counts = []
    for g in range(num_groups):
        group_ships = groups[g]
@@ -190,18 +191,21 @@ while len(remaining_ships) > 0:

    group_comb_counts.sort(key=lambda x: x[1])

    # reset the remaining_group list
    remaining_group = []
    # g is the identifier for the group
    for g, _ in group_comb_counts:
        if len(remaining_ships) == 0:
            break

        # compute for each group, the selected ship, and the combined count increase
        if group_comb_counts.index((g, _)) == 0:
            selected_ship_idx, comb_increase = find_max_increase_ship(groups, g, remaining_ships, mdm_true)

        else:
            max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true)
            selected_ship_idx, comb_increase = find_closest_comb_count_ship(groups, g, remaining_ships, mdm_true, max_comb_count)

        # if the combined increase is 0, then we process it in a special manner
        if comb_increase == 0:
            remaining_group.append(g)
        else:
@@ -2,7 +2,7 @@
# %%
import pandas as pd
import os
from inference import Inference, Embedder_t5_encoder, Embedder_t5_decoder
from inference import Inference, Embedder_t5_encoder
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
@@ -0,0 +1,3 @@
*.csv
fold_*
__pycache__
@@ -0,0 +1,154 @@
# Shaft# class post-processing
shaft_rules = [
    { "conditions": [
        lambda x: "NO.1" in x,
        lambda x: "SHAFT" in x
        ],
        "action": 'Shaft1'
    },
    { "conditions": [
        lambda x: "NO.1" in x,
        lambda x: "Shaft" in x
        ],
        "action": 'Shaft1'
    },
    { "conditions": [
        lambda x: "NO.2" in x,
        lambda x: "Shaft" in x
        ],
        "action": 'Shaft2'
    },
    { "conditions": [
        lambda x: "NO.2" in x,
        lambda x: "SHAFT" in x
        ],
        "action": 'Shaft2'
    },

    { "conditions": [
        lambda x: "NO.1" not in x,
        lambda x: "NO.2" not in x,
        lambda x: "SHAFT" not in x
        ],
        "action": 'Shaft1'
    },

    { "conditions": [
        lambda x: "NO.1" not in x,
        lambda x: "NO.2" not in x,
        lambda x: "SHAFT" in x,
        lambda x: "(P)" in x
        ],
        "action": 'Shaft2'
    },

    { "conditions": [
        lambda x: "NO.1" not in x,
        lambda x: "NO.2" not in x,
        lambda x: "SHAFT" in x,
        lambda x: "(S)" in x
        ],
        "action": 'Shaft1'
    },

    { "conditions": [
        lambda x: "NO.1" not in x,
        lambda x: "NO.2" not in x,
        lambda x: "SHAFT" in x,
        lambda x: "(S)" not in x,
        lambda x: "(P)" not in x
        ],
        "action": 'Shaft1'
    },
]


# ME# class post-processing
ME_rules = [
    { "conditions": [
        lambda x: "ME" in x,
        lambda x: "(P)" not in x,
        lambda x: "(S)" not in x,
        lambda x: "GE" not in x,
        lambda x: "FLOW" in x,
        ],
        "action": 'ME1Flow'
    },
    { "conditions": [
        lambda x: "ME" in x,
        lambda x: "(P)" in x,
        lambda x: "FLOW" in x,
        ],
        "action": 'ME2Flow'
    },

    { "conditions": [
        lambda x: "ME" in x,
        lambda x: "(S)" in x,
        lambda x: "FLOW" in x,
        ],
        "action": 'ME1Flow'
    },


    { "conditions": [
        lambda x: "ME" not in x,
        lambda x: "GE" not in x,
        lambda x: "(P)" not in x,
        lambda x: "(S)" not in x,
        lambda x: "FLOW" in x,
        ],
        "action": 'ME1Flow'
    },
    { "conditions": [
        lambda x: "ME" in x,
        lambda x: "GE" not in x,
        lambda x: "(P)" not in x,
        lambda x: "(S)" not in x,
        lambda x: "FLOW" not in x,
        lambda x: "CONSUMPTION" in x,
        ],
        "action": 'ME1Flow'
    },
]

# GEFlow rules
GEFlow_rules = [
    { "conditions": [lambda x: "NO." not in x,
        lambda x: "GE" in x,
        lambda x: "MGO" in x,

        ],
        "action": 'GE1Flow'
    },
    { "conditions": [ lambda x: "NO.1" in x,
        lambda x: "GE" in x,
        lambda x: "MGO" in x,

        ],
        "action": 'GE1Flow'
    },

    { "conditions": [ lambda x: "NO.2" in x,
        lambda x: "GE" in x,
        lambda x: "MGO" in x,

        ],
        "action": 'GE2Flow'
    },
    { "conditions": [ lambda x: "NO.3" in x,
        lambda x: "GE" in x,
        lambda x: "MGO" in x,

        ],
        "action": 'GE3Flow'
    },

    { "conditions": [ lambda x: "NO." not in x,
        lambda x: "GE" in x,
        lambda x: "CONSUMPTION" in x,

        ],
        "action": 'GE1Flow'
    },
]
@@ -0,0 +1,233 @@
# %%
import os
import re

import pandas as pd
from sklearn.metrics import classification_report

from post_processing_rules import shaft_rules, ME_rules, GEFlow_rules


# %%
# Function to print classification metrics
def print_classification_metrics(df):
    print("Results before post-processing")
    # print(classification_report(df["thing"], df["p_thing"].fillna("")))
    report = classification_report(df["thing"], df["p_thing"].fillna(""), output_dict=True)
    # Extract the weighted average values for precision, recall, and F1-score
    precision_avg = report['weighted avg']['precision']
    recall_avg = report['weighted avg']['recall']
    f1_avg = report['weighted avg']['f1-score']
    # Print the averages
    print("Average Precision:", precision_avg)
    print("Average Recall:", recall_avg)
    print("Average F1-Score:", f1_avg)

    print("**************")

    print("Results after post-processing")
    # print(classification_report(df["thing"], df["edited_p_thing"].fillna("")))
    report = classification_report(df["thing"], df["edited_p_thing"].fillna(""), output_dict=True)
    # Extract the weighted average values for precision, recall, and F1-score
    precision_avg = report['weighted avg']['precision']
    recall_avg = report['weighted avg']['recall']
    f1_avg = report['weighted avg']['f1-score']
    # Print the averages
    print("Average Precision:", precision_avg)
    print("Average Recall:", recall_avg)
    print("Average F1-Score:", f1_avg)


def read_data():
    BASE_FOLDER = "fold"
    BASE_FILE_NAME = "result_group"
    NUM_FOLDS = 5

    # List for storing all DataFrames
    dataframes = []

    # Iterate over all folds
    for i in range(NUM_FOLDS):
        fold_folder = f"{BASE_FOLDER}_{i+1}"
        desired_file_name = f"{BASE_FILE_NAME}_{i+1}.csv"
        for file_name in os.listdir(fold_folder):
            if file_name == desired_file_name:
                file_path = os.path.join(fold_folder, file_name)

                df = pd.read_csv(file_path, index_col=0)
                dataframes.append(df)

    # Combine all DataFrames into one
    final_dataframe = pd.concat(dataframes)
    final_dataframe = final_dataframe[final_dataframe["MDM"]].reset_index(drop=True)
    # assign a copy
    final_dataframe["edited_p_thing"] = final_dataframe["p_thing"]
    return final_dataframe


def update_shipboiler_p_thing(df, tag_column='tag_description', thing_pred_column='edited_p_thing'):
    """
    Update the predicted 'thing' column in the DataFrame based on rules applied to the 'tag_description' column.

    Parameters:
    - df: DataFrame to apply the function on.
    - tag_column: Column name containing descriptions to base the logic on.
    - thing_pred_column: Column to be updated with new values based on conditions.

    Returns:
    - DataFrame with the updated prediction column.
    """
    # Fill NaN values in tag column to avoid errors with ~ operator
    df[tag_column] = df[tag_column].fillna('')

    # Apply rules for "NO.1", "NO.2", and "AUXILIARY" in tag_column
    df.loc[df[tag_column].str.contains("NO.1", case=False), thing_pred_column] = "ShipBoiler1"
    df.loc[df[tag_column].str.contains("NO.2", case=False), thing_pred_column] = "ShipBoiler2"
    df.loc[(df[tag_column].str.contains("AUXILIARY", case=False)) & (~df[tag_column].str.contains("NO.", case=False)), thing_pred_column] = "ShipBoiler1"

    # Determine the highest number for "thing" in rows without "COMPOSITE" and assign to "COMPOSITE" rows
    max_boiler_number = df.loc[~df[tag_column].str.contains("COMPOSITE", case=False) & df[thing_pred_column].str.contains("ShipBoiler", na=False), thing_pred_column] \
        .str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1
    composite_boiler = f"ShipBoiler{int(max_boiler_number)}"

    # Apply composite_boiler to rows containing "COMPOSITE"
    df.loc[df[tag_column].str.contains("COMPOSITE", case=False), thing_pred_column] = composite_boiler

    return df


def process_boiler_data(boiler_data):
    updated_dataframes = []
    for _, group in boiler_data.groupby('ships_idx'):
        group_copy = group.copy()

        group_copy["tag_description"] = group_copy["tag_description"].fillna('')

        contains_no1_cond = group_copy["tag_description"].str.contains("NO.1")
        contains_no2_cond = group_copy["tag_description"].str.contains("NO.2")
        contains_aux_cond = group_copy["tag_description"].str.contains("AUXILIARY")
        doesnt_contain_no_cond = ~group_copy["tag_description"].str.contains("NO.")
        contains_comp_cond = group_copy["tag_description"].str.contains("COMPOSITE")
        contains_shipboiler_cond = group_copy["edited_p_thing"].str.contains("ShipBoiler")

        group_copy.loc[contains_no1_cond, "edited_p_thing"] = "ShipBoiler1"
        group_copy.loc[contains_no2_cond, "edited_p_thing"] = "ShipBoiler2"
        group_copy.loc[contains_aux_cond & doesnt_contain_no_cond, "edited_p_thing"] = "ShipBoiler1"

        if ((~contains_comp_cond) & (contains_shipboiler_cond)).any():
            max_boiler_number = group_copy.loc[(~contains_comp_cond)&(contains_shipboiler_cond), "edited_p_thing"].str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1
            composite_boiler = f"ShipBoiler{int(max_boiler_number)}"

            if max_boiler_number > 3:
                max_boiler_number = 3

            group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = composite_boiler
        else:
            group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = "ShipBoiler1"

        updated_dataframes.append(group_copy)  # Collect updated group

    # Step 2: Concatenate all updated groups
    updated_boiler_data = pd.concat(updated_dataframes)
    return updated_boiler_data


def check_conditions(value, conditions):
    # Check if a value satisfies all conditions
    return all(condition(value) for condition in conditions)


def apply_rules(description, thing, rules):
    # Process the description according to the rule table and return the replacement value if a rule matches, otherwise return the thing value
    for rule in rules:
        if check_conditions(description, rule["conditions"]):  # Check that all conditions are met
            return rule["action"]  # Execute the action and return the result
    return thing  # Return the value of the thing column if it doesn't match any of the rules


# %%
# if __name__ == "__main__":

# %%
# Read and preprocess data
final_dataframe = read_data()

# %%
final_dataframe
# %%
# Hwanggawi main function
# Get partial columns
TP_df = final_dataframe.loc[:, ['thing', 'property','p_thing','p_property','tag_description','MDM']].copy()

# Shaft
SF_df = TP_df[TP_df['thing'].str.contains(('Shaft'), case=False, na=False)]
SF_df_in_MDM = SF_df[(SF_df['MDM'])]

# ME
ME_df = TP_df[TP_df['thing'].str.contains(('ME1Flow'), case=False, na=False)|
              TP_df['thing'].str.contains(('ME2Flow'), case=False, na=False)|TP_df['thing'].str.contains(('ME3Flow'), case=False, na=False)]
ME_df_in_MDM = ME_df[(ME_df['MDM'])]

# GE
GE_df = TP_df[TP_df['thing'].str.contains(('GE1Flow'), case=False, na=False)|
              TP_df['thing'].str.contains(('GE2Flow'), case=False, na=False)|TP_df['thing'].str.contains(('GE3Flow'), case=False, na=False)]
GE_df_in_MDM = GE_df[(GE_df['MDM'])]

SF_df_in_MDM['standardize_desc'] = SF_df_in_MDM['tag_description'].copy()
GE_df_in_MDM['standardize_desc'] = GE_df_in_MDM['tag_description'].copy()
ME_df_in_MDM['standardize_desc'] = ME_df_in_MDM['tag_description'].copy()

# ShipBoiler class post-processing
mdm = final_dataframe[final_dataframe["MDM"]].copy()
boiler_data = mdm[mdm["thing"].str.contains("Boiler")].copy()

blr_cond = boiler_data["tag_description"].str.lower().str.contains("blr")
boiler_cond = boiler_data["tag_description"].str.lower().str.contains("boiler")
boiler_data.shape[0]-(boiler_data[blr_cond].shape[0]+boiler_data[boiler_cond].shape[0])
different_cond = boiler_data[~(blr_cond|boiler_cond)].copy()

unique_ships_idxs = boiler_data["ships_idx"].unique()

boiler_data["edited_p_thing"] = boiler_data["p_thing"]

updated_boiler_data = process_boiler_data(boiler_data)

# Save updated data back to the original DataFrame
final_dataframe.loc[updated_boiler_data.index, "edited_p_thing"] = updated_boiler_data["edited_p_thing"]


result = SF_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'], x['p_thing'], shaft_rules), axis=1)
SF_df_in_MDM['edited_p_thing'] = result

# Save updated data back to the original DataFrame
final_dataframe.loc[SF_df_in_MDM.index, "edited_p_thing"] = SF_df_in_MDM['edited_p_thing']


result = ME_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'], x['p_thing'], ME_rules), axis=1)
ME_df_in_MDM['edited_p_thing'] = result
final_dataframe.loc[ME_df_in_MDM.index, "edited_p_thing"] = ME_df_in_MDM['edited_p_thing']


result = GE_df_in_MDM.apply(lambda x: apply_rules(x['standardize_desc'], x['p_thing'], GEFlow_rules), axis=1)
GE_df_in_MDM['edited_p_thing'] = result
final_dataframe.loc[GE_df_in_MDM.index, "edited_p_thing"] = GE_df_in_MDM['edited_p_thing']


# output final dataframe
final_dataframe.to_csv("post_processed_df.csv", index=False)
print("Saved output to post_processed_df.csv")
# %%
# print results
print("ShipBoiler post-processing results")
print_classification_metrics(updated_boiler_data)
print("----------------------------")

print("ME post-processing results")
print_classification_metrics(ME_df_in_MDM)
print("----------------------------")

print("GE post-processing results")
print_classification_metrics(GE_df_in_MDM)
print("----------------------------")

# %%
@@ -265,7 +265,7 @@ def run_deduplication(
    print('generate train embeddings')
    train_embedder = Embedder(input_df=train_df, batch_size=batch_size)
    tensor = train_embedder.make_embedding(checkpoint_path)
    torch.save(tensor, file_path, weights_only=True)
    torch.save(tensor, file_path)
    print("Tensor saved to file.")

    train_embeds = tensor
@@ -0,0 +1,158 @@
# Shaft# class post-processing
shaft_rules = [
    { "conditions": [
        lambda x: "NO1" in x,
        lambda x: "SHAFT" in x
        ],
        "action": 'Shaft1'
    },
    { "conditions": [
        lambda x: "NO1" in x,
        lambda x: "Shaft" in x
        ],
        "action": 'Shaft1'
    },
    { "conditions": [
        lambda x: "NO2" in x,
        lambda x: "Shaft" in x
        ],
        "action": 'Shaft2'
    },
    { "conditions": [
        lambda x: "NO2" in x,
        lambda x: "SHAFT" in x
        ],
        "action": 'Shaft2'
    },

    { "conditions": [
        lambda x: "NO1" not in x,
        lambda x: "NO2" not in x,
        lambda x: "SHAFT" not in x
        ],
        "action": 'Shaft1'
    },

    { "conditions": [
        lambda x: "NO1" not in x,
        lambda x: "NO2" not in x,
        lambda x: "SHAFT" in x,
        lambda x: "(P)" in x
        ],
        "action": 'Shaft2'
    },

    { "conditions": [
        lambda x: "NO1" not in x,
        lambda x: "NO2" not in x,
        lambda x: "SHAFT" in x,
        lambda x: "(S)" in x
        ],
        "action": 'Shaft1'
    },

    { "conditions": [
        lambda x: "NO1" not in x,
        lambda x: "NO2" not in x,
        lambda x: "SHAFT" in x,
        lambda x: "(S)" not in x,
        lambda x: "(P)" not in x
        ],
        "action": 'Shaft1'
    },
]


# ME# class post-processing
ME_rules = [
    { "conditions": [
        lambda x: "ME" in x,
        lambda x: "(P)" not in x,
        lambda x: "(S)" not in x,
        lambda x: "GE" not in x,
        lambda x: "FLOW" in x,
        ],
        "action": 'ME1Flow'
    },
    { "conditions": [
        lambda x: "ME" in x,
        lambda x: "(P)" in x,
        lambda x: "FLOW" in x,
        ],
        "action": 'ME2Flow'
    },

    { "conditions": [
        lambda x: "ME" in x,
        lambda x: "(S)" in x,
        lambda x: "FLOW" in x,
        ],
        "action": 'ME1Flow'
    },


    { "conditions": [
        lambda x: "ME" not in x,
        lambda x: "GE" not in x,
        lambda x: "(P)" not in x,
        lambda x: "(S)" not in x,
        lambda x: "FLOW" in x,
        ],
        "action": 'ME1Flow'
    },
    { "conditions": [
        lambda x: "ME" in x,
        lambda x: "GE" not in x,
        lambda x: "(P)" not in x,
        lambda x: "(S)" not in x,
        lambda x: "FLOW" not in x,
        lambda x: "CONSUMPTION" in x,
        ],
        "action": 'ME1Flow'
    },
]

# GEFlow rules
GEFlow_rules = [
    { "conditions": [
        lambda x: "NO" not in x,
        lambda x: "GE" in x,
        lambda x: "MGO" in x,
        ],
        "action": 'GE1Flow'
    },
    { "conditions": [
        lambda x: "NO1" in x,
        lambda x: "GE" in x,
        lambda x: "MGO" in x,

        ],
        "action": 'GE1Flow'
    },

    { "conditions": [
        lambda x: "NO2" in x,
        lambda x: "GE" in x,
        lambda x: "MGO" in x,

        ],
        "action": 'GE2Flow'
    },
    { "conditions": [
        lambda x: "NO3" in x,
        lambda x: "GE" in x,
        lambda x: "MGO" in x,

        ],
        "action": 'GE3Flow'
    },

    { "conditions": [
        lambda x: "NO" not in x,
        lambda x: "GE" in x,
        lambda x: "CONSUMPTION" in x,

        ],
        "action": 'GE1Flow'
    },
]
@@ -1,6 +1,6 @@
# %%
import re
from replacement_dict import desc_replacement_dict, unit_replacement_dict
from end_to_end.replacement_dict import desc_replacement_dict, unit_replacement_dict

class Abbreviator:

@@ -0,0 +1,122 @@
import os
import re

import pandas as pd

from end_to_end.post_processing_rules import shaft_rules, ME_rules, GEFlow_rules

class Corrector():

    def __init__(self, df):
        # copy over the existing prediction
        df['edited_p_thing'] = df['p_thing'].copy()
        self.df = df


    def _process_boiler_data(self, boiler_data):
        updated_dataframes = []
        for _, group in boiler_data.groupby('ships_idx'):
            group_copy = group.copy()

            group_copy["tag_description"] = group_copy["tag_description"].fillna('')

            contains_no1_cond = group_copy["tag_description"].str.contains("NO1")
            contains_no2_cond = group_copy["tag_description"].str.contains("NO2")
            contains_aux_cond = group_copy["tag_description"].str.contains("AUXILIARY")
            doesnt_contain_no_cond = ~group_copy["tag_description"].str.contains("NO")
            contains_comp_cond = group_copy["tag_description"].str.contains("COMPOSITE")
            contains_shipboiler_cond = group_copy["edited_p_thing"].str.contains("ShipBoiler")

            group_copy.loc[contains_no1_cond, "edited_p_thing"] = "ShipBoiler1"
            group_copy.loc[contains_no2_cond, "edited_p_thing"] = "ShipBoiler2"
            group_copy.loc[contains_aux_cond & doesnt_contain_no_cond, "edited_p_thing"] = "ShipBoiler1"

            if ((~contains_comp_cond) & (contains_shipboiler_cond)).any():
                max_boiler_number = group_copy.loc[(~contains_comp_cond)&(contains_shipboiler_cond), "edited_p_thing"].str.extract(r'(\d+)$').astype(float).max().fillna(0)[0] + 1
                composite_boiler = f"ShipBoiler{int(max_boiler_number)}"

                if max_boiler_number > 3:
                    max_boiler_number = 3

                group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = composite_boiler
            else:
                group_copy.loc[group_copy["tag_description"].str.contains("COMPOSITE"), "edited_p_thing"] = "ShipBoiler1"

            updated_dataframes.append(group_copy)  # Collect updated group

        # Step 2: Concatenate all updated groups
        if (len(updated_dataframes) == 0):
            return boiler_data
        updated_boiler_data = pd.concat(updated_dataframes)
        return updated_boiler_data

    def _check_conditions(self, value, conditions):
        # Check if a value satisfies all conditions
        return all(condition(value) for condition in conditions)

    def _apply_rules(self, description, thing, rules):
        # Process the description according to the rule table and return the replacement value if a rule matches, otherwise return the thing value
        for rule in rules:
            if self._check_conditions(description, rule["conditions"]):  # Check that all conditions are met
                return rule["action"]  # Execute the action and return the result
        return thing  # Return the value of the thing column if it doesn't match any of the rules

    def run_correction(self):
        final_dataframe = self.df.copy()
        # Hwanggawi main function
        # Get partial columns
        TP_df = final_dataframe.loc[:, ['thing', 'property','p_thing','p_property','tag_description','MDM']].copy()

        # Shaft
        SF_df = TP_df[TP_df['thing'].str.contains(('Shaft'), case=False, na=False)]
        SF_df_in_MDM = SF_df[(SF_df['MDM'])]

        # ME
        ME_df = TP_df[TP_df['thing'].str.contains(('ME1Flow'), case=False, na=False)|
                      TP_df['thing'].str.contains(('ME2Flow'), case=False, na=False)|
                      TP_df['thing'].str.contains(('ME3Flow'), case=False, na=False)]
        ME_df_in_MDM = ME_df[(ME_df['MDM'])]

        # GE
        GE_df = TP_df[TP_df['thing'].str.contains(('GE1Flow'), case=False, na=False)|
                      TP_df['thing'].str.contains(('GE2Flow'), case=False, na=False)|
                      TP_df['thing'].str.contains(('GE3Flow'), case=False, na=False)]
        GE_df_in_MDM = GE_df[(GE_df['MDM'])]

        SF_df_in_MDM['standardize_desc'] = SF_df_in_MDM['tag_description'].copy()
        GE_df_in_MDM['standardize_desc'] = GE_df_in_MDM['tag_description'].copy()
        ME_df_in_MDM['standardize_desc'] = ME_df_in_MDM['tag_description'].copy()

        # ShipBoiler class post-processing
        mdm = final_dataframe[final_dataframe["MDM"]].copy()
        boiler_data = mdm[mdm["thing"].str.contains("BOILER")].copy()

        # blr_cond = boiler_data["tag_description"].str.lower().str.contains("BOILER")
        # boiler_cond = boiler_data["tag_description"].str.lower().str.contains("BOILER")
        # boiler_data.shape[0]-(boiler_data[blr_cond].shape[0]+boiler_data[boiler_cond].shape[0])

        # different_cond = boiler_data[~(blr_cond|boiler_cond)].copy()

        # unique_ships_idxs = boiler_data["ships_idx"].unique()

        boiler_data["edited_p_thing"] = boiler_data["p_thing"]
        updated_boiler_data = self._process_boiler_data(boiler_data)
        final_dataframe.loc[updated_boiler_data.index, "edited_p_thing"] = updated_boiler_data["edited_p_thing"]

        result = SF_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'], x['p_thing'], shaft_rules), axis=1)
        SF_df_in_MDM['edited_p_thing'] = result
        final_dataframe.loc[SF_df_in_MDM.index, "edited_p_thing"] = SF_df_in_MDM['edited_p_thing']

        result = ME_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'], x['p_thing'], ME_rules), axis=1)
        ME_df_in_MDM['edited_p_thing'] = result
        final_dataframe.loc[ME_df_in_MDM.index, "edited_p_thing"] = ME_df_in_MDM['edited_p_thing']


        result = GE_df_in_MDM.apply(lambda x: self._apply_rules(x['standardize_desc'], x['p_thing'], GEFlow_rules), axis=1)
        GE_df_in_MDM['edited_p_thing'] = result
        final_dataframe.loc[GE_df_in_MDM.index, "edited_p_thing"] = GE_df_in_MDM['edited_p_thing']

        # override p_thing with edited_p_thing
        final_dataframe['p_thing'] = final_dataframe['edited_p_thing'].copy()

        return final_dataframe
@@ -2,13 +2,16 @@
import pandas as pd
import os
import glob
from mapper import Mapper
from preprocess import Abbreviator
from deduplication import run_deduplication
from end_to_end.mapper import Mapper
from end_to_end.preprocess import Abbreviator
from end_to_end.deduplication import run_deduplication
from end_to_end.rule_based_correction import Corrector


# global config
BATCH_SIZE = 512
SHIPS_LIST = [1000,1001,1003,1004]
# SHIPS_LIST = [1000]

# %%
# START: we import the raw data csv and extract only a few ships from it to simulate incoming json
@@ -47,6 +50,12 @@ df_out = pd.DataFrame({
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
df = pd.concat([df, df_out], axis=1)

# %%
###################################
# run rule-based correction
corrector = Corrector(df)
df = corrector.run_correction()


# %%
####################################
@@ -59,7 +68,7 @@ df = run_deduplication(
    test_df=df,
    train_df=train_df,
    batch_size=BATCH_SIZE,
    threshold=0.85,
    threshold=0.9,
    diagnostic=True)

# %%
@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log/
@@ -0,0 +1,196 @@
# %%

# from datasets import load_from_disk
import os

os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import torch
from transformers import (
    T5TokenizerFast,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    Seq2SeqTrainingArguments
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict



torch.set_float32_matmul_precision('high')

# outputs a list of dictionaries
def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        desc = f"<DESC>{row['tag_description']}<DESC>"
        unit = f"<UNIT>{row['unit']}<UNIT>"
        element = {
            'input' : f"{desc}{unit}",
            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
        }
        output_list.append(element)

    return output_list


def create_split_dataset():
    # train
    data_path = "../../data_preprocess/exports/preprocessed_data.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)
    train_df = train_df[train_df['MDM']].reset_index(drop=True)

    # valid
    data_path = "../../data_preprocess/exports/dataset/group_1/valid.csv"
    validation_df = pd.read_csv(data_path, skipinitialspace=True)

    combined_data = DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df)),
        'validation' : Dataset.from_list(process_df_to_dict(validation_df)),
    })
    return combined_data


# function to perform training for a given fold
def train():
    save_path = 'checkpoint'
    split_datasets = create_split_dataset()

    # prepare tokenizer

    model_checkpoint = "t5-small"
    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['input']
        target = example['output']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            text_target=target,
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs

    # map maps function to each "row" in the dataset
    # aka the data in the immediate nesting
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns=split_datasets["train"].column_names,
    )

    # https://github.com/huggingface/transformers/pull/28414
    # model_checkpoint = "google/t5-efficient-tiny"
    # device_map set to auto to force it to load contiguous weights
    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')

    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    metric = evaluate.load("sacrebleu")


    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # In case the model returns more than the prediction logits
        if isinstance(preds, tuple):
            preds = preds[0]

        decoded_preds = tokenizer.batch_decode(preds,
                                               skip_special_tokens=False)

        # Replace -100s in the labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels,
                                                skip_special_tokens=False)

        # Remove <PAD> tokens from decoded predictions and labels
        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]

        # Some simple post-processing
        # decoded_preds = [pred.strip() for pred in decoded_preds]
        # decoded_labels = [[label.strip()] for label in decoded_labels]
        # print(decoded_preds, decoded_labels)

        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}


    # Generation Config
    # from transformers import GenerationConfig
    gen_config = model.generation_config
    gen_config.max_length = 64

    # compile
    # model = torch.compile(model, backend="inductor", dynamic=True)


    # Trainer

    args = Seq2SeqTrainingArguments(
        f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-3,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=80,
        predict_with_generate=True,
        bf16=True,
        push_to_hub=False,
        generation_config=gen_config,
        remove_unused_columns=False,
    )


    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)

    trainer.train()

# execute training
train()
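Not part of the commit: a rough sketch of how the checkpoint trained by this script might be queried, assuming the Trainer saved both the model and the tokenizer under checkpoint/ (the checkpoint sub-directory and the sample input are assumptions; the tagged output format comes from process_df_to_dict above):

import re
from transformers import AutoModelForSeq2SeqLM, T5TokenizerFast

ckpt = "checkpoint/checkpoint-5600"  # assumed path; point this at an actual saved checkpoint
tokenizer = T5TokenizerFast.from_pretrained(ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt)

text = "<DESC>ME1 T/C LO INLET PRESS<DESC><UNIT>bar<UNIT>"  # assumed sample input
inputs = tokenizer(text, return_tensors="pt")
output_ids = model.generate(**inputs, max_length=64)
decoded = tokenizer.decode(output_ids[0], skip_special_tokens=False)

# pull the thing/property pair out of the tagged output
thing = re.search(r"<THING_START>(.*?)<THING_END>", decoded)
prop = re.search(r"<PROPERTY_START>(.*?)<PROPERTY_END>", decoded)
print(thing.group(1) if thing else None, prop.group(1) if prop else None)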