Feat: added abbreviation expansion rules

This commit is contained in:
Richard Wong 2024-11-10 20:28:47 +09:00
parent 59bbf1f403
commit 2b5994cb52
21 changed files with 152503 additions and 370 deletions

1
analysis/bert/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
__pycache__

View File

@ -0,0 +1,285 @@
# %%
import pandas as pd
from utils import Retriever, cosine_similarity_chunked
import os
import glob
import numpy as np
# %%
# Cross-validation fold whose exported predictions are analysed in this script.
fold = 5
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# %%
# subset to mdm
# the 'MDM' column is used directly as a boolean row mask
df = df[df['MDM']]
# True where the predicted thing equals the labelled thing
thing_condition = df['p_thing'] == df['thing_pattern']
# rows whose thing prediction is wrong
error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]
# True where the predicted property equals the labelled property
property_condition = df['p_property'] == df['property_pattern']
# rows whose property prediction is wrong
error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]
# rows where both thing and property are predicted correctly
correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]
# the MDM subset doubles as the "test" frame for the embedding analysis below
test_df = df
# %%
# number of thing errors, then number of property errors
print(len(error_thing_df))
print(len(error_property_df))
# %%
# optional HTML dumps of the two error frames (disabled)
# error_thing_df.to_html('thing_errors.html')
# error_property_df.to_html('property_errors.html')
##########################################
# what we need now is understand why the model is making these mispredictions
# import train data and test data
# %%
class Embedder():
    """Embed the description/unit text of every row of a dataframe.

    Attributes:
        input_df: frame that provides the 'tag_description' and 'unit' columns.
        fold: declared for annotation purposes only; never assigned in this class.
    """
    input_df: pd.DataFrame
    fold: int

    def __init__(self, input_df):
        self.input_df = input_df

    def make_embedding(self, checkpoint_path):
        """Return one embedding per row of ``input_df``, moved to the CPU.

        Args:
            checkpoint_path: model checkpoint location handed to ``Retriever``.

        Returns:
            The ``embeddings`` attribute of the ``Retriever`` after
            ``make_embedding(batch_size=64)`` has run, transferred to 'cpu'.
        """
        # each row becomes "<DESC>…<DESC><UNIT>…<UNIT>"
        texts = [
            f"<DESC>{record['tag_description']}<DESC>" + f"<UNIT>{record['unit']}<UNIT>"
            for _, record in self.input_df.iterrows()
        ]
        encoder = Retriever(texts, checkpoint_path)
        encoder.make_embedding(batch_size=64)
        return encoder.embeddings.to('cpu')
# %%
# Training data of the same fold: the reference set the test rows are compared against.
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
checkpoint_directory = "../../train/classification_bert"
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# training is expected to leave exactly one checkpoint; the first match is used
pattern = 'checkpoint-*'
checkpoint_matches = glob.glob(os.path.join(directory, pattern))
if not checkpoint_matches:
    # fail with a message naming the directory instead of a bare IndexError
    raise FileNotFoundError(f"no '{pattern}' entry found under {directory}")
checkpoint_path = checkpoint_matches[0]
# embed train and test rows with the same checkpoint so the vectors are comparable
train_embedder = Embedder(input_df=train_df)
train_embeds = train_embedder.make_embedding(checkpoint_path)
test_embedder = Embedder(input_df=test_df)
test_embeds = test_embedder.make_embedding(checkpoint_path)
# %%
# test embeds are inputs since we are looking back at train data
# resulting matrix: one row per test row, one column per train row
cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy()
# %%
# the following function takes in a full cos_sim_matrix
# condition_source: boolean selectors of the source embedding
# condition_target: boolean selectors of the target embedding
def find_closest(cos_sim_matrix, condition_source, condition_target, top_k=3):
    """Return the ``top_k`` most similar targets for every selected source row.

    Args:
        cos_sim_matrix: 2D similarity array, rows = sources, columns = targets.
        condition_source: boolean selector over the rows of ``cos_sim_matrix``.
        condition_target: boolean selector over the columns of ``cos_sim_matrix``.
        top_k: number of neighbours kept per row (default 3, the value that
            used to be hard-coded).

    Returns:
        ``(top_k_indices, top_k_values)``: two arrays with one row per selected
        source. Every row is ordered from most to least similar. Indices are
        positions inside the selected target subset. When fewer than ``top_k``
        targets are selected, all of them are returned.

    Raises:
        ValueError: if ``top_k`` is not positive.
    """
    if top_k <= 0:
        raise ValueError("top_k must be a positive integer")
    # select rows and columns at the same time (2D boolean sub-matrix)
    subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
    # argsort is ascending: keep the last top_k columns, then reverse EVERY row
    # (previously only row 0 was reversed, leaving the other rows ascending)
    top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:][:, ::-1]
    # similarity scores that belong to those indices
    top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
    return top_k_indices, top_k_values
####################################################
# special find-back code
# %%
def find_back_element_with_print(select_idx):
    """Report the nearest training rows for a test row and check its label.

    Looks up the test row with index label ``select_idx``, retrieves the
    closest training rows from ``cos_sim_matrix`` via ``find_closest``, prints
    a comparison, and returns True when the test row's 'pattern' occurs inside
    at least one retrieved training 'pattern' (``in`` test), otherwise False.
    """
    selected_rows = test_df[test_df.index == select_idx]
    query_desc = selected_rows['tag_description'].tolist()[0]
    # every test row that shares this description is selected as a source
    condition_source = test_df['tag_description'] == query_desc
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
    top_k_indices, top_k_values = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)
    # NOTE(review): row 0 is the first test row with this description, which
    # may not be select_idx itself when descriptions repeat - confirm intent
    neighbours = train_df.iloc[top_k_indices[0]]
    training_data_pattern_list = neighbours['pattern'].to_list()
    training_desc_list = neighbours['tag_description'].to_list()
    test_data_pattern_list = selected_rows['pattern'].to_list()
    test_desc_list = selected_rows['tag_description'].to_list()
    test_ship_id = selected_rows['ships_idx'].to_list()[0]
    predicted_test_data = (selected_rows['p_thing'] + ' ' + selected_rows['p_property']).to_list()[0]

    print("*" * 80)
    print("idx:", select_idx)
    print("train desc", training_desc_list)
    print("train thing+property", training_data_pattern_list)
    print("test desc", test_desc_list)
    print("test thing+property", test_data_pattern_list)
    print("predicted thing+property", predicted_test_data)
    print("ships idx", test_ship_id)
    print("score:", top_k_values[0])

    test_pattern = test_data_pattern_list[0]
    return any([test_pattern in candidate for candidate in training_data_pattern_list])
# %%
def find_back_element(select_idx):
    """Silent variant of the find-back check.

    Returns True when the 'pattern' of the test row labelled ``select_idx``
    occurs inside the 'pattern' of at least one of its closest training rows
    (as returned by ``find_closest``), otherwise False.
    """
    selected_rows = test_df[test_df.index == select_idx]
    query_desc = selected_rows['tag_description'].tolist()[0]
    # sources: all test rows with the same description; targets: every train row
    same_desc_mask = test_df['tag_description'] == query_desc
    every_train_row = np.ones(train_embeds.shape[0], dtype=bool)
    top_k_indices, top_k_values = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=same_desc_mask,
        condition_target=every_train_row)
    retrieved_patterns = train_df.iloc[top_k_indices[0]]['pattern'].to_list()
    test_pattern = selected_rows['pattern'].to_list()[0]
    return any([test_pattern in candidate for candidate in retrieved_patterns])
# %%
# for error thing
# Fraction of thing errors whose true pattern shows up among the retrieved
# training neighbours.
pattern_in_train = []
for select_idx in error_thing_df.index:
    result = find_back_element_with_print(select_idx)
    print("status:", result)
    pattern_in_train.append(result)
# a fold without errors yields nan instead of a ZeroDivisionError
sum(pattern_in_train)/len(pattern_in_train) if pattern_in_train else float('nan')
###
# for error property
# %%
pattern_in_train = []
for select_idx in error_property_df.index:
    result = find_back_element_with_print(select_idx)
    print("status:", result)
    pattern_in_train.append(result)
# same ratio for the property errors, with the same empty-list guard
sum(pattern_in_train)/len(pattern_in_train) if pattern_in_train else float('nan')
####################################################
# %%
# make function to compute similarity of closest retrieved result
def compute_similarity(select_idx):
    """Mean similarity between a test row and its closest training rows.

    The test row is identified by its index label ``select_idx``. The scores
    come from ``find_closest`` on the module-level ``cos_sim_matrix``, using
    the first test row that shares the same description.
    """
    query_desc = test_df[test_df.index == select_idx]['tag_description'].tolist()[0]
    same_desc_mask = test_df['tag_description'] == query_desc
    every_train_row = np.ones(train_embeds.shape[0], dtype=bool)
    _, neighbour_scores = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=same_desc_mask,
        condition_target=every_train_row)
    return np.mean(neighbour_scores[0])
# %%
def print_summary(similarity_scores):
    """Print the mean and the 25th/50th/75th percentiles of the given scores."""
    scores = np.array(similarity_scores)
    # compute both statistics before printing anything
    average = np.mean(scores)
    quartiles = np.percentile(scores, [25, 50, 75])
    print("Mean:", average)
    print("25th, 50th, 75th Percentiles:", quartiles)
# %%
##########################################
# Analyze the degree of similarity differences between correct and incorrect results
# %%
# compute similarity scores for all values in error_thing_df
# mean neighbour similarity for every row with a wrong thing prediction
similarity_thing_scores = [compute_similarity(idx) for idx in error_thing_df.index]
print_summary(similarity_thing_scores)
# %%
# same for rows with a wrong property prediction
similarity_property_scores = [compute_similarity(idx) for idx in error_property_df.index]
print_summary(similarity_property_scores)
# %%
# and for rows where both predictions are correct
similarity_correct_scores = [compute_similarity(idx) for idx in correct_df.index]
print_summary(similarity_correct_scores)
# %%
import matplotlib.pyplot as plt
# Overlay the three score distributions; each series is labelled by what it
# holds instead of the placeholder "List N" names.
score_groups = [
    (similarity_thing_scores, 'thing errors'),
    (similarity_property_scores, 'property errors'),
    (similarity_correct_scores, 'correct predictions'),
]
# Plot histograms
bins = 50
for scores, label in score_groups:
    # density=True normalises each histogram, so differently sized groups are comparable
    plt.hist(scores, bins=bins, alpha=0.5, label=label, density=True)
# Labels and legend
plt.xlabel('Mean cosine similarity to closest training rows')
# the y axis shows a density, not a raw count
plt.ylabel('Density')
plt.legend(loc='upper right')
plt.title('Similarity to training data: errors vs correct predictions')
# Show plot
plt.show()
# %%

71
analysis/bert/utils.py Normal file
View File

@ -0,0 +1,71 @@
import torch
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
)
import torch.nn.functional as F
class Retriever:
    """Embed a list of strings with a sequence-classification checkpoint.

    After ``make_embedding`` has run, ``embeddings`` holds one vector per input
    string: the last hidden layer taken at token position 0.
    """

    def __init__(self, input_texts, model_checkpoint):
        """Load tokenizer and model from ``model_checkpoint``.

        Args:
            input_texts: list of strings to embed.
            model_checkpoint: checkpoint identifier/path passed to
                ``from_pretrained`` for both tokenizer and model.
        """
        # we need to generate the embedding from list of input strings
        self.embeddings = []
        self.inputs = input_texts
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
        model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
        # run on the GPU when one is available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(self.device)
        # eval() switches the model to inference mode
        self.model = model.eval()

    def make_embedding(self, batch_size=64):
        """Compute embeddings for ``self.inputs`` and store them in ``self.embeddings``.

        Uses a fresh local accumulator on every call, so the method can be
        called again (previously the second call tried to ``append`` to the
        tensor left in ``self.embeddings`` by the first call).

        Args:
            batch_size: number of strings tokenized and encoded per forward pass.
        """
        input_texts = self.inputs
        batch_embeddings = []
        for i in range(0, len(input_texts), batch_size):
            batch_texts = input_texts[i:i+batch_size]
            # Tokenize the input text; pad within the batch, cut at 64 tokens
            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64)
            input_ids = inputs.input_ids.to(self.device)
            attention_mask = inputs.attention_mask.to(self.device)
            # Pass the input through the encoder and retrieve the embeddings
            with torch.no_grad():
                encoder_outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
            # get last layer
            embeddings = encoder_outputs.hidden_states[-1]
            # position 0 of every sequence (the CLS slot for BERT-style tokenizers)
            cls_embeddings = embeddings[:, 0, :]  # Shape: (batch_size, hidden_size)
            batch_embeddings.append(cls_embeddings)
        # join the per-batch tensors row-wise into a single tensor
        self.embeddings = torch.cat(batch_embeddings, dim=0)
def cosine_similarity_chunked(batch1, batch2, chunk_size=16):
    """Pairwise cosine similarity between the rows of two 2D tensors.

    ``batch1`` is processed ``chunk_size`` rows at a time so the broadcasted
    intermediate stays small.

    Args:
        batch1: tensor whose rows index the output rows.
        batch2: tensor whose rows index the output columns.
        chunk_size: number of ``batch1`` rows compared per step.

    Returns:
        Tensor of shape (batch1.size(0), batch2.size(0)) allocated on
        ``batch1``'s device.
    """
    n_rows, n_cols = batch1.size(0), batch2.size(0)
    result = torch.empty(n_rows, n_cols, device=batch1.device)
    # (1, n_cols, dim): broadcast against every chunk of batch1
    targets = batch2.unsqueeze(0)
    for start in range(0, n_rows, chunk_size):
        stop = start + chunk_size
        # (chunk, 1, dim)
        queries = batch1[start:stop].unsqueeze(1)
        result[start:stop] = F.cosine_similarity(queries, targets, dim=-1)
    return result

View File

@ -0,0 +1,63 @@
# we want to compare the labels between the train data and test data
# %%
import pandas as pd
#########################
# experiment 1
#############
# 1 import test data
# %%
# fold whose prediction export and training split are compared
fold = 1
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# %%
# subset to mdm
# the 'MDM' column is used directly as a boolean row mask
df = df[df['MDM']]
# True where the predicted thing equals the labelled thing
thing_condition = df['p_thing'] == df['thing_pattern']
error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]
# True where the predicted property equals the labelled property
property_condition = df['p_property'] == df['property_pattern']
error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]
# rows where both predictions are correct
correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]
test_df = df
# %%
# full test label: "<thing> <property>", space separated
test_pattern = df['thing_pattern'] + ' ' + df['property_pattern']
##########################
# 2 import train data
# %%
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path)
# the training file already carries the combined label in 'pattern'
train_pattern = train_df['pattern']
# %%
# distinct labels on each side
test_pattern_set = set(test_pattern)
train_pattern_set = set(train_pattern)
# %%
# use this to get labels in test not found in training data
test_pattern_set - train_pattern_set
# verdict: we see that FOMassFlowTotal is not found in the training set
# hence it is not possible for this to be classified correctly
###################################
# experiment 2
# %%
# we want to check Load and LoadPercent
test_df[test_df['property_pattern'] == 'Load']
# %%
test_df[test_df['property_pattern'] == 'LoadPercent']
# verdict: we see that the unit column determines which of the two it should be
# in order not to disturb the model, this should be handled in post-processing

View File

@ -0,0 +1,77 @@
# we want to compare the labels between the train data and test data
# %%
import pandas as pd
# %%
file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location
df = pd.read_csv(file_path)
# keep only the rows flagged by the boolean 'MDM' column
df = df[df['MDM']]
# %%
unit_list = df['unit']
# non-string entries (e.g. missing values) are shown as ''
unit_list = [elem if (isinstance(elem, str)) else '' for elem in unit_list]
# distinct unit strings, sorted, for manual inspection
print(sorted(list(set(unit_list))))
# %%
# unit value to look up; edit and re-run the cell
test = ''
# df[df['unit'] == test]['property_pattern'].to_list()
# NOTE(review): this compares against the raw column, so missing units are not matched by '' - confirm
df[df['unit'] == test]
#############
# 1 import test data
# %%
fold = 1
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
# df is rebound here: from now on it holds the prediction export, not the raw data
df = pd.read_csv(data_path, skipinitialspace=True)
# %%
# subset to mdm
df = df[df['MDM']]
# True where the predicted thing equals the labelled thing
thing_condition = df['p_thing'] == df['thing_pattern']
error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]
# True where the predicted property equals the labelled property
property_condition = df['p_property'] == df['property_pattern']
error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]
# rows where both predictions are correct
correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]
test_df = df
# %%
# full test label: "<thing> <property>", space separated
test_pattern = df['thing_pattern'] + ' ' + df['property_pattern']
##########################
# 2 import train data
# %%
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path)
train_pattern = train_df['pattern']
# %%
# distinct labels on each side
test_pattern_set = set(test_pattern)
train_pattern_set = set(train_pattern)
# %%
# use this to get labels in test not found in training data
test_pattern_set - train_pattern_set
# verdict: we see that FOMassFlowTotal is not found in the training set
# hence it is not possible for this to be classified correctly
###################################
# experiment 2
# %%
# we want to check Load and LoadPercent
test_df[test_df['property_pattern'] == 'Load']
# %%
test_df[test_df['property_pattern'] == 'LoadPercent']
# distinct units present in the prediction export
set(df['unit'])
# %%

View File

@ -2,6 +2,8 @@
Perform substitutions on common terms to standardize abbreviations. Perform substitutions on common terms to standardize abbreviations.
- abbreviations_replacer.py: replaces abbreviations with full terms
## Instructions: ## Instructions:
- `python abbreviations_replacer.py` - `python abbreviations_replacer.py`

View File

@ -5,7 +5,7 @@ Modified by: Richard Wong
# %% # %%
import re import re
import pandas as pd import pandas as pd
from replacement_dict import replacement_dict from replacement_dict import desc_replacement_dict, unit_replacement_dict
# %% # %%
def count_abbreviation_occurrences(tag_descriptions, abbreviation): def count_abbreviation_occurrences(tag_descriptions, abbreviation):
@ -24,6 +24,22 @@ def replace_abbreviations(tag_descriptions, abbreviations):
replaced_descriptions.append(description) replaced_descriptions.append(description)
return replaced_descriptions return replaced_descriptions
def cleanup_spaces(tag_descriptions):
    """Return a new list where every whitespace run is collapsed to one space.

    Leading and trailing whitespace is collapsed too, not removed.
    """
    return [re.sub(r'\s+', ' ', text) for text in tag_descriptions]
# remove all dots
def cleanup_dots(tag_descriptions):
    """Return a new list with every '.' removed from each description."""
    return [re.sub(r'\.', '', text) for text in tag_descriptions]
# %% # %%
file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location
@ -31,16 +47,32 @@ df = pd.read_csv(file_path)
# %% # %%
# Replace abbreviations # Replace abbreviations
print("running substitution") print("running substitution for descriptions")
df['tag_description']= df['tag_description'].fillna("NOVALUE") df['tag_description']= df['tag_description'].fillna("NOVALUE")
# Replace whitespace-only entries with "NOVALUE" # Replace whitespace-only entries with "NOVALUE"
# note that "N/A" can be read as nan # note that "N/A" can be read as nan
# replace whitespace only values as NOVALUE
df['tag_description'] = df['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True) df['tag_description'] = df['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True)
tag_descriptions = df['tag_description'] tag_descriptions = df['tag_description']
replaced_descriptions = replace_abbreviations(tag_descriptions, replacement_dict) replaced_descriptions = replace_abbreviations(tag_descriptions, desc_replacement_dict)
replaced_descriptions = cleanup_spaces(replaced_descriptions)
replaced_descriptions = cleanup_dots(replaced_descriptions)
df["tag_description"] = replaced_descriptions
# print("Descriptions after replacement:", replaced_descriptions) # print("Descriptions after replacement:", replaced_descriptions)
# strip trailing whitespace
df['tag_description'] = df['tag_description'].str.rstrip()
df['tag_description'] = df['tag_description'].str.upper()
# %% # %%
df["tag_description"] = replaced_descriptions print("running substitutions for units")
df['unit'] = df['unit'].fillna("NOVALUE")
df['unit'] = df['unit'].replace(r'^\s*$', 'NOVALUE', regex=True)
unit_list = df['unit']
new_unit = replace_abbreviations(unit_list, unit_replacement_dict)
new_unit = cleanup_spaces(new_unit)
df['unit'] = new_unit
# save
df.to_csv("../exports/preprocessed_data.csv", index=False) df.to_csv("../exports/preprocessed_data.csv", index=False)
print("file saved") print("file saved")

View File

@ -1,113 +1,210 @@
""" # substitution mapping for descriptions
Author: Daniel Kim
"""
# Abbreviations and their replacements # Abbreviations and their replacements
replacement_dict = { desc_replacement_dict = {
r'\bLIST\b': 'LIST', r'\bLIST\b\b': 'LIST',
r'\bList\b': 'LIST', r'\bList\b\b': 'LIST',
r'\bEXH\.': 'EXHAUST', r'\bEXH\.\b': 'EXHAUST',
r'\bEXH\b': 'EXHAUST', r'\bEXH\b\b': 'EXHAUST',
r'\bEXHAUST\.': 'EXHAUST', r'\bEXHAUST\.\b': 'EXHAUST',
r'\bExhaust\b': 'EXHAUST', r'\bExhaust\b\b': 'EXHAUST',
r'\bEXHAUST\b': 'EXHAUST', r'\bEXHAUST\b\b': 'EXHAUST',
r'\bTEMP\.': 'TEMPERATURE', r'\bTEMP\.\b': 'TEMPERATURE',
r'\bTEMP\b': 'TEMPERATURE', r'\bTEMP\b\b': 'TEMPERATURE',
r'\bTEMPERATURE\.': 'TEMPERATURE', r'\bTEMPERATURE\.\b': 'TEMPERATURE',
r'\bTEMPERATURE\b': 'TEMPERATURE', r'\bTEMPERATURE\b\b': 'TEMPERATURE',
r'\bW\.': 'WATER', r'\bW\.\b': 'WATER',
r'\bWATER\b': 'WATER', r'\bWATER\b\b': 'WATER',
r'\bCYL\.': 'CYLINDER', r'\bCW\b\b': 'COOLING WATER',
r'\bcyl\.': 'CYLINDER', r'\bCYL\.\b': 'CYLINDER',
r'\bCYL\b': 'CYLINDER', r'\bCyl\b\b': 'CYLINDER',
r'\bcylinder\b': 'CYLINDER', r'\bcyl\.\b': 'CYLINDER',
r'\bCYLINDER\b': 'CYLINDER', r'\bCYL\b\b': 'CYLINDER',
r'\bCOOL\.': 'COOLING', r'\bCYL(?=\d|\W|$)\b': 'CYLINDER',
r'\bcool\.': 'COOLING', r'\bcylinder\b\b': 'CYLINDER',
r'\bcooling\b': 'COOLING', r'\bCYLINDER\b\b': 'CYLINDER',
r'\bCOOLING\b': 'COOLING', r'\bCOOL\.\b': 'COOLING',
r'\bcooler\b': 'COOLER', r'\bcool\.\b': 'COOLING',
r'\bCOOLER\b': 'COOLER', r'\bcooling\b\b': 'COOLING',
r'\bScav\.': 'SCAVENGE', r'\bCOOLING\b\b': 'COOLING',
r'\bSCAV\.': 'SCAVENGE', r'\bcooler\b\b': 'COOLER',
r'\bINL\.': 'INLET', r'\bCOOLER\b\b': 'COOLER',
r'\binlet\b': 'INLET', r'\bScav\.\b': 'SCAVENGE',
r'\bINLET\b': 'INLET', r'\bSCAV\.\b': 'SCAVENGE',
r'\bOUT\.': 'OUTLET', r'\bINL\.\b': 'INLET',
r'\bOUTL\.': 'OUTLET', r'\binlet\b\b': 'INLET',
r'\boutlet\b': 'OUTLET', r'\bINLET\b\b': 'INLET',
r'\bOUTLET\b': 'OUTLET', r'\bOUT\.\b': 'OUTLET',
r'\bPRESS\.': 'PRESSURE', r'\bOUTL\.\b': 'OUTLET',
r'\bPress\.': 'PRESSURE', r'\boutlet\b\b': 'OUTLET',
r'\bpressure\b': 'PRESSURE', r'\bOUTLET\b\b': 'OUTLET',
r'\bPRESSURE\b': 'PRESSURE', # pressure
r'\bCLR\b': 'CLEAR', r'\bPRESS\b\b': 'PRESSURE',
r'\bENG\.': 'ENGINE', r'\bPRESS\.\b': 'PRESSURE',
r'\bENG\b': 'ENGINE', r'\bPress\.\b': 'PRESSURE',
r'\bENGINE\b': 'ENGINE', r'\bpressure\b\b': 'PRESSURE',
r'\bEngine speed\b': 'ENGINE SPEED', r'\bPRESSURE\b\b': 'PRESSURE',
r'\bEngine running\b': 'ENGINE RUNNING', # this is a special replacement - it is safe to replace PRS w/o checks
r'\bEngine RPM pickup\b': 'ENGINE RPM PICKUP', r'PRS\b': 'PRESSURE',
r'\bEngine room\b': 'ENGINE ROOM', r'\bCLR\b\b': 'CLEAR',
r'\bENG\.\b': 'ENGINE',
r'\bENG\b\b': 'ENGINE',
r'\bENGINE\b\b': 'ENGINE',
r'\bEngine speed\b\b': 'ENGINE SPEED',
r'\bEngine running\b\b': 'ENGINE RUNNING',
r'\bEngine RPM pickup\b\b': 'ENGINE RPM PICKUP',
r'\bEngine room\b\b': 'ENGINE ROOM',
# main engine
r'\bM/E\b': 'MAIN_ENGINE', r'\bM/E\b': 'MAIN_ENGINE',
r'\bME\b': 'MAIN_ENGINE', r'\bM_E\b': 'MAIN_ENGINE',
r'\bMAIN ENGINE\b': 'MAIN_ENGINE', r'\bME(?=\d|\W|$)\b': 'MAIN_ENGINE',
r'\bGen\b': 'GENERATOR_ENGINE', r'\bMAIN ENGINE\b\b': 'MAIN_ENGINE',
r'\bGE\b': 'GENERATOR_ENGINE', r'\bGen\b\b': 'GENERATOR_ENGINE',
# ensure that we substitute only for terms where following GE is num or special
r'\bGE(?=\d|\W|$)\b': 'GENERATOR_ENGINE',
r'\bG/E\b': 'GENERATOR_ENGINE', r'\bG/E\b': 'GENERATOR_ENGINE',
r'\bDG': 'GENERATOR_ENGINE', r'\bG_E\b': 'GENERATOR_ENGINE',
r'\bD/G\b': 'GENERATOR_ENGINE', r'\bDG\b': 'GENERATOR_ENGINE',
r'\bGEN\.': 'GENERATOR_ENGINE', r'\bD/G\b\b': 'GENERATOR_ENGINE',
r'\bGENERATOR ENGINE\B': 'GENERATOR_ENGINE', r'\bGEN\.\b': 'GENERATOR_ENGINE',
r'\bGEN\.WIND\.TEMP\b': 'GENERATOR WINDING TEMPERATURE', r'\bGENERATOR ENGINE\B\b': 'GENERATOR_ENGINE',
r'\bENGINE ROOM\b': 'ENGINE ROOM', r'\b(\d+)MGE\b\b': r'NO\1 GENERATOR_ENGINE',
r'\bE/R\b': 'ENGINE ROOM', r'\bGEN\.WIND\.TEMP\b\b': 'GENERATOR WINDING TEMPERATURE',
r'\bNO1\b': 'NO.1', r'\bENGINE ROOM\b\b': 'ENGINE ROOM',
r'\bNO\.1\b': 'NO.1', r'\bE/R\b\b': 'ENGINE ROOM',
r'\bNo\.1\b': 'NO.1', r'\bFLTR\b\b': 'FILTER',
r'\bNO2\b': 'NO.2', # marine gas oil
r'\bNO\.2\b': 'NO.2', r'\bM\.G\.O\b\b': 'MARINE GAS OIL',
r'\bNo\.2\b': 'NO.2', r'\bMGO\b\b': 'MARINE GAS OIL',
r'\bNO3\b': 'NO.3', r'\bMDO\b\b': 'MARINE DIESEL OIL',
r'\bNO\.3\b': 'NO.3', # light fuel oil
r'\bNo\.3\b': 'NO.3', r'\bL\.F\.O\b\b': 'LIGHT FUEL OIL',
r'\bNO4\b': 'NO.4', r'\bLFO\b\b': 'LIGHT FUEL OIL',
r'\bNO\.4\b': 'NO.4', # heavy fuel oil
r'\bNo\.4\b': 'NO.4', r'\bHFO\b\b': 'HEAVY FUEL OIL',
r'\bNO5\b': 'NO.5', r'\bH\.F\.O\b\b': 'HEAVY FUEL OIL',
r'\bNO\.5\b': 'NO.5', # for remaining fuel oil that couldn't be substituted
r'\bNo\.5\b': 'NO.5', r'\bF\.O\b\b': 'FUEL OIL',
r'\bFLTR\b': 'FILTER', r'\bFO\b\b': 'FUEL OIL',
r'\bLUB\.': 'LUBRICANT', # lubricant
r'\bM\.G\.O\b': 'MGO', r'\bLUB\.\b': 'LUBRICANT',
r'\bMGO\b': 'MGO', # lubricating oil
r'\bF\.O\b': 'FUEL OIL', r'\bL\.O\b\b': 'LUBRICATING OIL',
r'\bFO\b': 'FUEL OIL', r'\bLO\b\b': 'LUBRICATING OIL',
r'\bL\.T\b': 'LOW TEMPERATURE', # lubricating oil pressure
r'\bLT\b': 'LOW TEMPERATURE', r'\bLO_PRESS\b\b': 'LUBRICATING OIL PRESSURE',
r'\bH\.T\b': 'HIGH TEMPERATURE', r'\bLO_PRESSURE\b\b': 'LUBRICATING OIL PRESSURE',
r'\bHT\b': 'HIGH TEMPERATURE', # temperature
r'\bAUX\.': 'AUXILIARY', r'\bL\.T\b\b': 'LOW TEMPERATURE',
r'\bNO\.2A\b': 'NO.2A', r'\bLT\b\b': 'LOW TEMPERATURE',
r'\bNO\.2B\b': 'NO.2B', r'\bH\.T\b\b': 'HIGH TEMPERATURE',
r'\bAUX\.BOILER\b': 'AUXILIARY BOILER', r'\bHT\b\b': 'HIGH TEMPERATURE',
r'\bAUX\. BOILER\b': 'AUXILIARY BOILER', # auxiliary boiler
r'\bWIND\.': 'WINDING', # replace these first before replacing AUXILIARY only
r'\bWINDING\b': 'WINDING', r'\bAUX\.BOILER\b\b': 'AUXILIARY BOILER',
r'\bC\.S\.W\b': 'CSW', r'\bAUX\. BOILER\b\b': 'AUXILIARY BOILER',
r'\bCSW\b': 'CSW', r'\bAUX BLR\b\b': 'AUXILIARY BOILER',
r'\bVLOT\.': 'VOLTAGE', r'\bAUX\.\b': 'AUXILIARY',
r'\bVOLTAGE\b': 'VOLTAGE', # composite boiler
r'\bVOLT\.': 'VOLTAGE', r'\bCOMP\. BOILER\b\b': 'COMPOSITE BOILER',
r'\bFREQ\.': 'FREQUENCY', r'\bCOMP BOILER\b\b': 'COMPOSITE BOILER',
r'\bFREQUENCY\b': 'FREQUENCY', r'\bWIND\.\b': 'WINDING',
r'\bCURR\.': 'CURRENT', r'\bWINDING\b\b': 'WINDING',
r'\bCURRENT\b': 'CURRENT', r'\bC\.S\.W\b\b': 'CSW',
r'\bH\.F\.O\.': 'HFO', r'\bCSW\b\b': 'CSW',
r'\bTCA\b': 'TURBOCHARGER', r'\bVLOT\.\b': 'VOLTAGE',
r'\bTCB\b': 'TURBOCHARGER', r'\bVOLTAGE\b\b': 'VOLTAGE',
r'\bVOLT\.\b': 'VOLTAGE',
r'\bFREQ\.\b': 'FREQUENCY',
r'\bFREQUENCY\b\b': 'FREQUENCY',
r'\bCURR\.\b': 'CURRENT',
r'\bCURRENT\b\b': 'CURRENT',
r'\bTCA\b\b': 'TURBOCHARGER',
r'\bTCB\b\b': 'TURBOCHARGER',
r'\bT/C\b': 'TURBOCHARGER', r'\bT/C\b': 'TURBOCHARGER',
r'\bTC\b': 'TURBOCHARGER', r'\bT_C\b': 'TURBOCHARGER',
r'\bTURBOCHAGER\b': 'TURBOCHARGER', r'\bTC(?=\d|\W|$)\b': 'TURBOCHARGER',
r'\bTURBOCHARGER\b': 'TURBOCHARGER' r'\bTURBOCHAGER\b\b': 'TURBOCHARGER',
r'\bTURBOCHARGER\b\b': 'TURBOCHARGER',
# misc spelling errors
r'\bOPERATOIN\b': 'OPERATION',
# additional standardizing replacement
# replace # followed by a number with NO
r'#(?=\d)\b': 'NO',
r'\bNO\.(?=\d)\b': 'NO',
# yes, there was one with two dots - what the hell?
r'\bNO\.\.(?=\d)\b': 'NO',
r'\bNo\.(?=\d)\b': 'NO',
}
# substitution mapping for units
# Abbreviations and their replacements
# Regex -> replacement table for the 'unit' column.
# Patterns are meant to be applied one after another in insertion order, so
# more specific rules must not be destroyed by an earlier, broader one.
# Reminder: \b only matches between a word character and a non-word character
# (or the string edge), so a pattern that starts or ends with a symbol such as
# '%', '°' or '℃' must not be wrapped in \b or it cannot match a bare unit.
unit_replacement_dict = {
    # '%' is a non-word character: r'\b%\b' never matched a standalone '%'
    r'%': 'PERCENT',
    # a unit that is only a dash (optionally padded) means "no unit".
    # Anchored to the whole string so hyphens inside other units ('min-¹')
    # are left alone for their own rules further down.
    r'^\s*-\s*$': '',
    # ensure no character after A
    r'\bA(?!\w|/)': 'CURRENT',
    r'\bAmp(?!\w|/)': 'CURRENT',
    r'\bHz\b': 'HERTZ',
    r'\bKG/CM2\b': 'PRESSURE',
    r'\bKG/H\b': 'KILOGRAM PER HOUR',
    # NOTE(review): 'KNm' maps to RPM while 'kNm' maps to TORQUE - confirm against the data
    r'\bKNm\b': 'RPM',
    r'\bKW\b': 'POWER',
    r'\bKg(?!\w|/)': 'MASS',
    r'\bKw\b': 'POWER',
    r'\bL(?!\w|/)': 'VOLUME',
    r'\bMT/h\b': 'METRIC TONNES PER HOUR',
    r'\bMpa\b': 'PRESSURE',
    r'\bPF\b': 'POWER FACTOR',
    r'\bRPM\b': 'RPM',
    r'\bV(?!\w|/)': 'VOLTAGE',
    r'\bbar(?!\w|/)': 'PRESSURE',
    r'\bbarA\b': 'SCAVENGE PRESSURE',
    r'\bcST\b': 'VISCOSITY',
    r'\bcSt\b': 'VISCOSITY',
    r'\bcst\b': 'VISCOSITY',
    r'\bdeg(?!\w|/|\.)': 'DEGREE',
    r'\bdeg.C\b': 'TEMPERATURE',
    r'\bdegC\b': 'TEMPERATURE',
    r'\bdegree\b': 'DEGREE',
    r'\bdegreeC\b': 'TEMPERATURE',
    r'\bhPa\b': 'PRESSURE',
    r'\bhours\b': 'HOURS',
    r'\bkN\b': 'THRUST',
    r'\bkNm\b': 'TORQUE',
    r'\bkW\b': 'POWER',
    # ensure that kg is not followed by anything
    r'\bkg(?!\w|/)': 'FLOW',  # somehow in the data its flow
    r'\bkg/P\b': 'MASS FLOW',
    r'\bkg/cm2\b': 'PRESSURE',
    r'\bkg/cm²\b': 'PRESSURE',
    r'\bkg/h\b': 'MASS FLOW',
    r'\bkg/hr\b': 'MASS FLOW',
    r'\bkg/pulse\b': '',
    r'\bkgf/cm2\b': 'PRESSURE',
    r'\bkgf/cm²\b': 'PRESSURE',
    # '㎠' is a symbol (non-word): a trailing \b cannot match after it
    r'\bkgf/㎠': 'PRESSURE',
    r'\bknots\b': 'SPEED',
    r'\bkw\b': 'POWER',
    r'\bl/Hr\b': 'VOLUME FLOW',
    r'\bl/h\b': 'VOLUME FLOW',
    r'\bl_Hr\b': 'VOLUME FLOW',
    r'\bl_hr\b': 'VOLUME FLOW',
    r'\bM\b': 'DRAFT',  # for wind draft
    r'm': 'm',  # wind draft and trim - not useful
    r'\bm/s\b': 'SPEED',
    r'\bm3\b': 'VOLUME',
    r'\bmH2O\b': 'DRAFT',
    r'\bmWC\b': 'DRAFT',
    r'\bmbar\b': 'PRESSURE',
    r'\bmg\b': 'ACCELERATION',
    r'\bmin-¹\b': '',  # data too varied
    r'\bmm\b': '',  # data too varied
    r'\bmmH2O\b': 'WATER DRUM LEVEL',
    r'\brev\b': 'RPM',
    r'\brpm\b': 'RPM',
    r'\bx1000min-¹\b': '',
    # degree signs are non-word characters: no leading \b, or a bare '°C' never matches
    r'°C\b': 'TEMPERATURE',
    r'ºC\b': 'TEMPERATURE',
    r'℃': 'TEMPERATURE',
}

View File

@ -0,0 +1,58 @@
# %%
import pandas as pd
import re
import os
# Get the current working directory
# (all paths below are relative to it)
current_path = os.getcwd()
print(current_path)
# %%
file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location
old_df = pd.read_csv(file_path)
new_df = pd.read_csv('../exports/preprocessed_data.csv')
# %%
# compare changed rows: descriptions before (column1) and after (column2) preprocessing
cond = old_df['tag_description'] != new_df['tag_description']
val1 = old_df[cond]['tag_description']
val2 = new_df[cond]['tag_description']
df = pd.DataFrame({
    'column1': val1,
    'column2': val2
})
df.to_csv('desc.csv')
# %%
# compare changed rows: units before (column1) and after (column2) preprocessing
cond = old_df['unit'] != new_df['unit']
val1 = old_df[cond]['unit']
val2 = new_df[cond]['unit']
df = pd.DataFrame({
    'column1': val1,
    'column2': val2
})
df.to_csv('unit.csv')
# %%
# distinct unit values after preprocessing (changed rows only)
set(val2)
# %%
# `df` is the two-column comparison frame at this point and has no 'MDM'
# column, so the original lookup raised KeyError; the distinct descriptions
# are taken from the preprocessed data instead.
# NOTE(review): new_df assumed to be the intended source - confirm
desc_set = list(set(new_df[new_df['MDM']]['tag_description']))
# explicit encoding: the data contains non-ASCII unit/description characters
with open('output.txt', 'w', encoding='utf-8') as file:
    print(desc_set, file=file)
# %%
# quick check of the lookahead used by the unit rules: 'kg/' must not be rewritten
test = 'kg/cm3'
print(re.sub(r'kg(?!\w|/)', 'flow', test))

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@ -16,9 +16,6 @@ from transformers import (
AutoTokenizer, AutoTokenizer,
AutoModelForSequenceClassification, AutoModelForSequenceClassification,
DataCollatorWithPadding, DataCollatorWithPadding,
Trainer,
EarlyStoppingCallback,
TrainingArguments
) )
import evaluate import evaluate
import numpy as np import numpy as np
@ -56,13 +53,15 @@ def process_df_to_dict(df, mdm_list):
output_list = [] output_list = []
for _, row in df.iterrows(): for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>" desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
pattern = row['pattern'] pattern = row['pattern']
try: try:
index = mdm_list.index(pattern) index = mdm_list.index(pattern)
except ValueError: except ValueError:
index = -1 index = -1
element = { element = {
'text' : f"{desc}", 'text' : f"{desc}{unit}",
'label': index, 'label': index,
} }
output_list.append(element) output_list.append(element)
@ -84,8 +83,7 @@ def create_dataset(fold, mdm_list):
# %% # %%
# function to perform training for a given fold # function to perform training for a given fold
# def train(fold): def test(fold):
fold = 1
test_dataset = create_dataset(fold, mdm_list) test_dataset = create_dataset(fold, mdm_list)
@ -100,9 +98,9 @@ model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens # Define additional special tokens
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer # Add the additional special tokens to the tokenizer
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
# %% # %%
# compute max token length # compute max token length
@ -219,10 +217,12 @@ precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro') recall = recall_score(y_true, y_pred, average='macro')
# Print the results # Print the results
print(f'Accuracy: {accuracy:.2f}') print(f'Accuracy: {accuracy:.5f}')
print(f'F1 Score: {f1:.2f}') print(f'F1 Score: {f1:.5f}')
print(f'Precision: {precision:.2f}') print(f'Precision: {precision:.5f}')
print(f'Recall: {recall:.2f}') print(f'Recall: {recall:.5f}')
# %% # %%
for fold in [1,2,3,4,5]:
test(fold)

View File

@ -53,13 +53,15 @@ def process_df_to_dict(df, mdm_list):
output_list = [] output_list = []
for _, row in df.iterrows(): for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>" desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
pattern = row['pattern'] pattern = row['pattern']
try: try:
index = mdm_list.index(pattern) index = mdm_list.index(pattern)
except ValueError: except ValueError:
index = -1 index = -1
element = { element = {
'text' : f"{desc}", 'text' : f"{desc}{unit}",
'label': index, 'label': index,
} }
output_list.append(element) output_list.append(element)
@ -69,7 +71,7 @@ def process_df_to_dict(df, mdm_list):
def create_split_dataset(fold, mdm_list): def create_split_dataset(fold, mdm_list):
# train # train
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv" data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True) train_df = pd.read_csv(data_path, skipinitialspace=True)
# valid # valid
@ -86,8 +88,7 @@ def create_split_dataset(fold, mdm_list):
# %% # %%
# function to perform training for a given fold # function to perform training for a given fold
# def train(fold): def train(fold):
fold = 1
save_path = f'checkpoint_fold_{fold}' save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold, mdm_list) split_datasets = create_split_dataset(fold, mdm_list)
@ -97,9 +98,9 @@ split_datasets = create_split_dataset(fold, mdm_list)
model_checkpoint = "distilbert/distilbert-base-uncased" model_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens # Define additional special tokens
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"] additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer # Add the additional special tokens to the tokenizer
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
max_length = 120 max_length = 120
@ -127,8 +128,6 @@ tokenized_datasets = split_datasets.map(
# %% temp # %% temp
# tokenized_datasets['train'].rename_columns() # tokenized_datasets['train'].rename_columns()
# %% temp
tokenized_datasets['train']['input_ids']
# %% # %%
# create data collator # create data collator
@ -166,19 +165,20 @@ model.resize_token_embeddings(len(tokenizer))
training_args = TrainingArguments( training_args = TrainingArguments(
output_dir=f"{save_path}", output_dir=f"{save_path}",
eval_strategy="epoch", # eval_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log", logging_dir="tensorboard-log",
logging_strategy="epoch", logging_strategy="epoch",
save_strategy="epoch", # save_strategy="epoch",
load_best_model_at_end=True, load_best_model_at_end=False,
learning_rate=2e-5, learning_rate=1e-5,
per_device_train_batch_size=64, per_device_train_batch_size=64,
per_device_eval_batch_size=64, per_device_eval_batch_size=64,
auto_find_batch_size=False, auto_find_batch_size=False,
ddp_find_unused_parameters=False, ddp_find_unused_parameters=False,
weight_decay=0.01, weight_decay=0.01,
save_total_limit=1, save_total_limit=1,
num_train_epochs=40, num_train_epochs=80,
bf16=True, bf16=True,
push_to_hub=False, push_to_hub=False,
remove_unused_columns=False, remove_unused_columns=False,
@ -202,10 +202,10 @@ trainer = Trainer(
trainer.train() trainer.train()
# # execute training # execute training
# for fold in [1,2,3,4,5]: for fold in [1,2,3,4,5]:
# print(fold) print(fold)
# train(fold) train(fold)
# %% # %%

View File

@ -52,11 +52,17 @@ class Inference():
print("preparing dataloader") print("preparing dataloader")
# convert each dataframe row into a dictionary # convert each dataframe row into a dictionary
# outputs a list of dictionaries # outputs a list of dictionaries
def _process_df(df): def _process_df(df):
output_list = [{ output_list = []
'input': f"<DESC>{row['tag_description']}<DESC>", for _, row in df.iterrows():
'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", desc = f"<DESC>{row['tag_description']}<DESC>"
} for _, row in df.iterrows()] unit = f"<UNIT>{row['unit']}<UNIT>"
element = {
'input' : f"{desc}{unit}",
'output': f"<THING_START>{row['thing_pattern']}<THING_END><PROPERTY_START>{row['property_pattern']}<PROPERTY_END>",
}
output_list.append(element)
return output_list return output_list

View File

@ -1,6 +1,6 @@
Accuracy for fold 1: 0.943208707998107 Accuracy for fold 1: 0.9687647893989588
Accuracy for fold 2: 0.9214953271028037 Accuracy for fold 2: 0.9565420560747664
Accuracy for fold 3: 0.9728915662650602 Accuracy for fold 3: 0.9708835341365462
Accuracy for fold 4: 0.967174119885823 Accuracy for fold 4: 0.9881065651760228
Accuracy for fold 5: 0.9097572148419606 Accuracy for fold 5: 0.9225836005497022

View File

@ -32,8 +32,9 @@ def process_df_to_dict(df):
output_list = [] output_list = []
for _, row in df.iterrows(): for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>" desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
element = { element = {
'input' : f"{desc}", 'input' : f"{desc}{unit}",
'output': f"<THING_START>{row['thing_pattern']}<THING_END><PROPERTY_START>{row['property_pattern']}<PROPERTY_END>", 'output': f"<THING_START>{row['thing_pattern']}<THING_END><PROPERTY_START>{row['property_pattern']}<PROPERTY_END>",
} }
output_list.append(element) output_list.append(element)
@ -43,7 +44,7 @@ def process_df_to_dict(df):
def create_split_dataset(fold): def create_split_dataset(fold):
# train # train
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv" data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True) train_df = pd.read_csv(data_path, skipinitialspace=True)
# valid # valid
@ -150,11 +151,12 @@ def train(fold):
args = Seq2SeqTrainingArguments( args = Seq2SeqTrainingArguments(
f"{save_path}", f"{save_path}",
eval_strategy="epoch", # eval_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log", logging_dir="tensorboard-log",
logging_strategy="epoch", logging_strategy="epoch",
save_strategy="epoch", # save_strategy="epoch",
load_best_model_at_end=True, load_best_model_at_end=False,
learning_rate=1e-3, learning_rate=1e-3,
per_device_train_batch_size=64, per_device_train_batch_size=64,
per_device_eval_batch_size=64, per_device_eval_batch_size=64,
@ -162,7 +164,7 @@ def train(fold):
ddp_find_unused_parameters=False, ddp_find_unused_parameters=False,
weight_decay=0.01, weight_decay=0.01,
save_total_limit=1, save_total_limit=1,
num_train_epochs=20, num_train_epochs=40,
predict_with_generate=True, predict_with_generate=True,
bf16=True, bf16=True,
push_to_hub=False, push_to_hub=False,