Feat: added abbreviation expansion rules

This commit is contained in:
Richard Wong 2024-11-10 20:28:47 +09:00
parent 59bbf1f403
commit 2b5994cb52
21 changed files with 152503 additions and 370 deletions

analysis/bert/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
__pycache__

View File

@ -0,0 +1,285 @@
# %%
import pandas as pd
from utils import Retriever, cosine_similarity_chunked
import os
import glob
import numpy as np
# %%
fold = 5
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# %%
# subset to mdm
df = df[df['MDM']]
thing_condition = df['p_thing'] == df['thing_pattern']
error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]
property_condition = df['p_property'] == df['property_pattern']
error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]
correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]
test_df = df
# %%
print(len(error_thing_df))
print(len(error_property_df))
# %%
# thing_df.to_html('thing_errors.html')
# property_df.to_html('property_errors.html')
##########################################
# what we need now is to understand why the model is making these mispredictions
# import train data and test data
# %%
class Embedder():
input_df: pd.DataFrame
fold: int
def __init__(self, input_df):
self.input_df = input_df
def make_embedding(self, checkpoint_path):
def generate_input_list(df):
input_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
element = f"{desc}{unit}"
input_list.append(element)
return input_list
# prepare reference embed
train_data = list(generate_input_list(self.input_df))
# Define the directory and the pattern
retriever_train = Retriever(train_data, checkpoint_path)
retriever_train.make_embedding(batch_size=64)
return retriever_train.embeddings.to('cpu')
# %%
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
checkpoint_directory = "../../train/classification_bert"
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
train_embedder = Embedder(input_df=train_df)
train_embeds = train_embedder.make_embedding(checkpoint_path)
test_embedder = Embedder(input_df=test_df)
test_embeds = test_embedder.make_embedding(checkpoint_path)
# %%
# the test embeddings are the queries, since we retrieve back into the train data
cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy()
# %%
# the following function takes in the full cos_sim_matrix
# condition_source: boolean mask selecting the source embeddings (rows)
# condition_target: boolean mask selecting the target embeddings (columns)
def find_closest(cos_sim_matrix, condition_source, condition_target):
# subset_matrix = cos_sim_matrix[condition_source]
# but since we are subsetting a 2D matrix on both rows and columns, we need np.ix_
subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
# we select top k here
# Get the indices of the top k maximum values along axis 1
top_k = 3
top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values
# note that top_k_indices is a nested list because of the 2d nature of the matrix
# argsort returns ascending order, so flip to put the best match first
top_k_indices[0] = top_k_indices[0][::-1]
# Get the values of the top k maximum scores
top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
return top_k_indices, top_k_values
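# %%
# illustration only (toy numbers, not from the dataset): find_closest subsets
# the similarity matrix with np.ix_ and returns the top-k matches per source
# row, best match first
_toy_sim = np.array([[0.1, 0.9, 0.4],
                     [0.8, 0.2, 0.6]])
_toy_source = np.array([True, False])        # keep only the first source row
_toy_target = np.array([True, True, True])   # keep all target columns
_toy_idx, _toy_val = find_closest(_toy_sim, _toy_source, _toy_target)
# expected: _toy_idx[0] -> [1, 2, 0] and _toy_val[0] -> [0.9, 0.4, 0.1]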
####################################################
# find-back check: is the test pattern among the top-k retrieved training patterns?
# %%
def find_back_element_with_print(select_idx):
condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0]
condition_target = np.ones(train_embeds.shape[0], dtype=bool)
top_k_indices, top_k_values = find_closest(
cos_sim_matrix=cos_sim_matrix,
condition_source=condition_source,
condition_target=condition_target)
training_data_pattern_list = train_df.iloc[top_k_indices[0]]['pattern'].to_list()
training_desc_list = train_df.iloc[top_k_indices[0]]['tag_description'].to_list()
test_data_pattern_list = test_df[test_df.index == select_idx]['pattern'].to_list()
test_desc_list = test_df[test_df.index == select_idx]['tag_description'].to_list()
test_ship_id = test_df[test_df.index == select_idx]['ships_idx'].to_list()[0]
predicted_test_data = test_df[test_df.index == select_idx]['p_thing'] + ' ' + test_df[test_df.index == select_idx]['p_property']
predicted_test_data = predicted_test_data.to_list()[0]
print("*" * 80)
print("idx:", select_idx)
print("train desc", training_desc_list)
print("train thing+property", training_data_pattern_list)
print("test desc", test_desc_list)
print("test thing+property", test_data_pattern_list)
print("predicted thing+property", predicted_test_data)
print("ships idx", test_ship_id)
print("score:", top_k_values[0])
test_pattern = test_data_pattern_list[0]
find_back_list = [ test_pattern in pattern for pattern in training_data_pattern_list ]
if sum(find_back_list) > 0:
return True
else:
return False
# %%
def find_back_element(select_idx):
condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0]
condition_target = np.ones(train_embeds.shape[0], dtype=bool)
top_k_indices, top_k_values = find_closest(
cos_sim_matrix=cos_sim_matrix,
condition_source=condition_source,
condition_target=condition_target)
training_data_pattern_list = train_df.iloc[top_k_indices[0]]['pattern'].to_list()
test_data_pattern_list = test_df[test_df.index == select_idx]['pattern'].to_list()
# print(training_data_pattern_list)
# print(test_data_pattern_list)
test_pattern = test_data_pattern_list[0]
find_back_list = [ test_pattern in pattern for pattern in training_data_pattern_list ]
if sum(find_back_list) > 0:
return True
else:
return False
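# %%
# illustration only (toy strings, not from the dataset): the find-back check is
# a substring-membership test of the test pattern over the top-k retrieved
# training patterns - True means the label was retrievable from the train data
_toy_train_patterns = ['ME1 ExhGasTemp', 'GE1 Power', 'ME1 CoolingWaterPress']
_toy_test_pattern = 'GE1 Power'
print(any(_toy_test_pattern in p for p in _toy_train_patterns))  # True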
# %%
# for error thing
pattern_in_train = []
for select_idx in error_thing_df.index:
result = find_back_element_with_print(select_idx)
print("status:", result)
pattern_in_train.append(result)
sum(pattern_in_train)/len(pattern_in_train)
###
# for error property
# %%
pattern_in_train = []
for select_idx in error_property_df.index:
result = find_back_element_with_print(select_idx)
print("status:", result)
pattern_in_train.append(result)
sum(pattern_in_train)/len(pattern_in_train)
####################################################
# %%
# function to compute the mean similarity of the top-k retrieved results
def compute_similarity(select_idx):
condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0]
condition_target = np.ones(train_embeds.shape[0], dtype=bool)
top_k_indices, top_k_values = find_closest(
cos_sim_matrix=cos_sim_matrix,
condition_source=condition_source,
condition_target=condition_target)
return np.mean(top_k_values[0])
# %%
def print_summary(similarity_scores):
# Convert list to numpy array for additional stats
np_array = np.array(similarity_scores)
# Get stats
mean_value = np.mean(np_array)
percentiles = np.percentile(np_array, [25, 50, 75]) # 25th, 50th, and 75th percentiles
# Display numpy results
print("Mean:", mean_value)
print("25th, 50th, 75th Percentiles:", percentiles)
# %%
##########################################
# Analyze the degree of similarity differences between correct and incorrect results
# %%
# compute similarity scores for all values in error_thing_df
similarity_thing_scores = []
for idx in error_thing_df.index:
similarity_thing_scores.append(compute_similarity(idx))
print_summary(similarity_thing_scores)
# %%
similarity_property_scores = []
for idx in error_property_df.index:
similarity_property_scores.append(compute_similarity(idx))
print_summary(similarity_property_scores)
# %%
similarity_correct_scores = []
for idx in correct_df.index:
similarity_correct_scores.append(compute_similarity(idx))
print_summary(similarity_correct_scores)
# %%
import matplotlib.pyplot as plt
# Similarity scores to compare
list1 = similarity_thing_scores
list2 = similarity_property_scores
list3 = similarity_correct_scores
# Plot histograms
bins = 50
plt.hist(list1, bins=bins, alpha=0.5, label='thing errors', density=True)
plt.hist(list2, bins=bins, alpha=0.5, label='property errors', density=True)
plt.hist(list3, bins=bins, alpha=0.5, label='correct', density=True)
# Labels and legend
plt.xlabel('Mean top-k similarity')
plt.ylabel('Density')
plt.legend(loc='upper right')
plt.title('Similarity score distributions')
# Show plot
plt.show()
# %%

analysis/bert/utils.py Normal file
View File

@ -0,0 +1,71 @@
import torch
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
)
import torch.nn.functional as F
class Retriever:
def __init__(self, input_texts, model_checkpoint):
# we need to generate the embedding from list of input strings
self.embeddings = []
self.inputs = input_texts
model_checkpoint = model_checkpoint
self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"
model.to(self.device)
self.model = model.eval()
def make_embedding(self, batch_size=64):
all_embeddings = self.embeddings
input_texts = self.inputs
for i in range(0, len(input_texts), batch_size):
batch_texts = input_texts[i:i+batch_size]
# Tokenize the input text
inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64)
input_ids = inputs.input_ids.to(self.device)
attention_mask = inputs.attention_mask.to(self.device)
# Pass the input through the encoder and retrieve the embeddings
with torch.no_grad():
encoder_outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
# get last layer
embeddings = encoder_outputs.hidden_states[-1]
# get cls token embedding
cls_embeddings = embeddings[:, 0, :] # Shape: (batch_size, hidden_size)
all_embeddings.append(cls_embeddings)
# concatenate the per-batch embeddings into one tensor; dim=0 stacks row-wise
all_embeddings = torch.cat(all_embeddings, dim=0)
self.embeddings = all_embeddings
def cosine_similarity_chunked(batch1, batch2, chunk_size=16):
batch1_size = batch1.size(0)
batch2_size = batch2.size(0)
# Prepare an empty tensor to store results
cos_sim = torch.empty(batch1_size, batch2_size, device=batch1.device)
# Process batch1 in chunks
for i in range(0, batch1_size, chunk_size):
batch1_chunk = batch1[i:i + chunk_size] # Get chunk of batch1
# Expand batch1 chunk and entire batch2 for comparison
batch1_chunk_exp = batch1_chunk.unsqueeze(1) # Shape: (chunk_size, 1, hidden_size)
batch2_exp = batch2.unsqueeze(0) # Shape: (1, batch2_size, hidden_size)
# Compute cosine similarity for the chunk and store it in the final tensor
cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1)
return cos_sim
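if __name__ == "__main__":
    # minimal usage sketch (toy tensors, illustrative only): chunking keeps the
    # broadcasted intermediate at (chunk_size, batch2_size, hidden_size) instead
    # of materialising the full (batch1_size, batch2_size, hidden_size) tensor
    a = torch.randn(10, 8)
    b = torch.randn(25, 8)
    sims = cosine_similarity_chunked(a, b, chunk_size=4)
    print(sims.shape)  # torch.Size([10, 25])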

View File

@ -0,0 +1,63 @@
# we want to compare the labels between the train data and test data
# %%
import pandas as pd
#########################
# experiment 1
#############
# 1 import test data
# %%
fold = 1
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# %%
# subset to mdm
df = df[df['MDM']]
thing_condition = df['p_thing'] == df['thing_pattern']
error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]
property_condition = df['p_property'] == df['property_pattern']
error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]
correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]
test_df = df
# %%
test_pattern = df['thing_pattern'] + ' ' + df['property_pattern']
##########################
# 2 import train data
# %%
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path)
train_pattern = train_df['pattern']
# %%
test_pattern_set = set(test_pattern)
train_pattern_set = set(train_pattern)
# %%
# use this to get labels in test not found in training data
test_pattern_set - train_pattern_set
# verdict: we see that FOMassFlowTotal is not found in the training set
# hence it is not possible for this to be classified correctly
###################################
# experiment 2
# %%
# we want to check load and loadpercent
test_df[test_df['property_pattern'] == 'Load']
# %%
test_df[test_df['property_pattern'] == 'LoadPercent']
# verdict: the unit column determines which of the two applies
# rather than disturb the model, we should handle this in a post-processing step
# (see the sketch below)
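# %%
# a possible post-processing sketch (illustrative only; assumes the results
# frame keeps the original 'unit' column and that a '%'-style unit indicates
# LoadPercent - the exact rule still needs to be confirmed against the data)
def fix_load_property(row):
    if row['p_property'] in ('Load', 'LoadPercent'):
        return 'LoadPercent' if '%' in str(row['unit']) else 'Load'
    return row['p_property']
# test_df['p_property'] = test_df.apply(fix_load_property, axis=1)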

View File

@ -0,0 +1,77 @@
# we want to compare the labels between the train data and test data
# %%
import pandas as pd
# %%
file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location
df = pd.read_csv(file_path)
df = df[df['MDM']]
# %%
unit_list = df['unit']
unit_list = [elem if (isinstance(elem, str)) else '' for elem in unit_list]
print(sorted(list(set(unit_list))))
# %%
test = ''
# df[df['unit'] == test]['property_pattern'].to_list()
df[df['unit'] == test]
#############
# 1 import test data
# %%
fold = 1
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# %%
# subset to mdm
df = df[df['MDM']]
thing_condition = df['p_thing'] == df['thing_pattern']
error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]
property_condition = df['p_property'] == df['property_pattern']
error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]
correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]
test_df = df
# %%
test_pattern = df['thing_pattern'] + ' ' + df['property_pattern']
##########################
# 2 import train data
# %%
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path)
train_pattern = train_df['pattern']
# %%
test_pattern_set = set(test_pattern)
train_pattern_set = set(train_pattern)
# %%
# use this to get labels in test not found in training data
test_pattern_set - train_pattern_set
# verdict: we see that FOMassFlowTotal is not found in the training set
# hence it is not possible for this to be classified correctly
###################################
# experiment 2
# %%
# we want to check load and loadpercent
test_df[test_df['property_pattern'] == 'Load']
# %%
test_df[test_df['property_pattern'] == 'LoadPercent']
#
set(df['unit'])
# %%

View File

@ -2,6 +2,8 @@
Perform substitutions on common terms to standardize abbreviations.
- abbreviations_replacer.py: replaces abbreviations with full terms
## Instructions:
- `python abbreviations_replacer.py`
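Example (illustrative strings, not taken from the dataset): a raw description such as `CYL COOLING WATER INLET TEMP` should come out as `CYLINDER COOLING WATER INLET TEMPERATURE`, and a unit such as `kg/h` should be normalized to `MASS FLOW`.
- input: `../../data_import/exports/raw_data.csv`
- output: `../exports/preprocessed_data.csv`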

View File

@ -5,7 +5,7 @@ Modified by: Richard Wong
# %%
import re
import pandas as pd
from replacement_dict import replacement_dict
from replacement_dict import desc_replacement_dict, unit_replacement_dict
# %%
def count_abbreviation_occurrences(tag_descriptions, abbreviation):
@ -24,6 +24,22 @@ def replace_abbreviations(tag_descriptions, abbreviations):
replaced_descriptions.append(description)
return replaced_descriptions
def cleanup_spaces(tag_descriptions):
# Replace all whitespace with a single space
replaced_descriptions = []
for description in tag_descriptions:
description_clean = re.sub(r'\s+', ' ', description)
replaced_descriptions.append(description_clean)
return replaced_descriptions
# remove all dots
def cleanup_dots(tag_descriptions):
replaced_descriptions = []
for description in tag_descriptions:
description_clean = re.sub(r'\.', '', description)
replaced_descriptions.append(description_clean)
return replaced_descriptions
# %%
file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location
@ -31,16 +47,32 @@ df = pd.read_csv(file_path)
# %%
# Replace abbreviations
print("running substitution")
print("running substitution for descriptions")
df['tag_description']= df['tag_description'].fillna("NOVALUE")
# Replace whitespace-only entries with "NOVALUE"
# note that "N/A" can be read as nan
# replace whitespace only values as NOVALUE
df['tag_description'] = df['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True)
tag_descriptions = df['tag_description']
replaced_descriptions = replace_abbreviations(tag_descriptions, replacement_dict)
replaced_descriptions = replace_abbreviations(tag_descriptions, desc_replacement_dict)
replaced_descriptions = cleanup_spaces(replaced_descriptions)
replaced_descriptions = cleanup_dots(replaced_descriptions)
df["tag_description"] = replaced_descriptions
# print("Descriptions after replacement:", replaced_descriptions)
# strip trailing whitespace
df['tag_description'] = df['tag_description'].str.rstrip()
df['tag_description'] = df['tag_description'].str.upper()
# %%
df["tag_description"] = replaced_descriptions
print("running substitutions for units")
df['unit'] = df['unit'].fillna("NOVALUE")
df['unit'] = df['unit'].replace(r'^\s*$', 'NOVALUE', regex=True)
unit_list = df['unit']
new_unit = replace_abbreviations(unit_list, unit_replacement_dict)
new_unit = cleanup_spaces(new_unit)
df['unit'] = new_unit
# save
df.to_csv("../exports/preprocessed_data.csv", index=False)
print("file saved")

View File

@ -1,113 +1,210 @@
"""
Author: Daniel Kim
"""
# substitution mapping for descriptions
# Abbreviations and their replacements
replacement_dict = {
r'\bLIST\b': 'LIST',
r'\bList\b': 'LIST',
r'\bEXH\.': 'EXHAUST',
r'\bEXH\b': 'EXHAUST',
r'\bEXHAUST\.': 'EXHAUST',
r'\bExhaust\b': 'EXHAUST',
r'\bEXHAUST\b': 'EXHAUST',
r'\bTEMP\.': 'TEMPERATURE',
r'\bTEMP\b': 'TEMPERATURE',
r'\bTEMPERATURE\.': 'TEMPERATURE',
r'\bTEMPERATURE\b': 'TEMPERATURE',
r'\bW\.': 'WATER',
r'\bWATER\b': 'WATER',
r'\bCYL\.': 'CYLINDER',
r'\bcyl\.': 'CYLINDER',
r'\bCYL\b': 'CYLINDER',
r'\bcylinder\b': 'CYLINDER',
r'\bCYLINDER\b': 'CYLINDER',
r'\bCOOL\.': 'COOLING',
r'\bcool\.': 'COOLING',
r'\bcooling\b': 'COOLING',
r'\bCOOLING\b': 'COOLING',
r'\bcooler\b': 'COOLER',
r'\bCOOLER\b': 'COOLER',
r'\bScav\.': 'SCAVENGE',
r'\bSCAV\.': 'SCAVENGE',
r'\bINL\.': 'INLET',
r'\binlet\b': 'INLET',
r'\bINLET\b': 'INLET',
r'\bOUT\.': 'OUTLET',
r'\bOUTL\.': 'OUTLET',
r'\boutlet\b': 'OUTLET',
r'\bOUTLET\b': 'OUTLET',
r'\bPRESS\.': 'PRESSURE',
r'\bPress\.': 'PRESSURE',
r'\bpressure\b': 'PRESSURE',
r'\bPRESSURE\b': 'PRESSURE',
r'\bCLR\b': 'CLEAR',
r'\bENG\.': 'ENGINE',
r'\bENG\b': 'ENGINE',
r'\bENGINE\b': 'ENGINE',
r'\bEngine speed\b': 'ENGINE SPEED',
r'\bEngine running\b': 'ENGINE RUNNING',
r'\bEngine RPM pickup\b': 'ENGINE RPM PICKUP',
r'\bEngine room\b': 'ENGINE ROOM',
desc_replacement_dict = {
r'\bLIST\b\b': 'LIST',
r'\bList\b\b': 'LIST',
r'\bEXH\.\b': 'EXHAUST',
r'\bEXH\b\b': 'EXHAUST',
r'\bEXHAUST\.\b': 'EXHAUST',
r'\bExhaust\b\b': 'EXHAUST',
r'\bEXHAUST\b\b': 'EXHAUST',
r'\bTEMP\.\b': 'TEMPERATURE',
r'\bTEMP\b\b': 'TEMPERATURE',
r'\bTEMPERATURE\.\b': 'TEMPERATURE',
r'\bTEMPERATURE\b\b': 'TEMPERATURE',
r'\bW\.\b': 'WATER',
r'\bWATER\b\b': 'WATER',
r'\bCW\b\b': 'COOLING WATER',
r'\bCYL\.\b': 'CYLINDER',
r'\bCyl\b\b': 'CYLINDER',
r'\bcyl\.\b': 'CYLINDER',
r'\bCYL\b\b': 'CYLINDER',
r'\bCYL(?=\d|\W|$)\b': 'CYLINDER',
r'\bcylinder\b\b': 'CYLINDER',
r'\bCYLINDER\b\b': 'CYLINDER',
r'\bCOOL\.\b': 'COOLING',
r'\bcool\.\b': 'COOLING',
r'\bcooling\b\b': 'COOLING',
r'\bCOOLING\b\b': 'COOLING',
r'\bcooler\b\b': 'COOLER',
r'\bCOOLER\b\b': 'COOLER',
r'\bScav\.\b': 'SCAVENGE',
r'\bSCAV\.\b': 'SCAVENGE',
r'\bINL\.\b': 'INLET',
r'\binlet\b\b': 'INLET',
r'\bINLET\b\b': 'INLET',
r'\bOUT\.\b': 'OUTLET',
r'\bOUTL\.\b': 'OUTLET',
r'\boutlet\b\b': 'OUTLET',
r'\bOUTLET\b\b': 'OUTLET',
# pressure
r'\bPRESS\b\b': 'PRESSURE',
r'\bPRESS\.\b': 'PRESSURE',
r'\bPress\.\b': 'PRESSURE',
r'\bpressure\b\b': 'PRESSURE',
r'\bPRESSURE\b\b': 'PRESSURE',
# this is a special replacement - it is safe to replace PRS w/o checks
r'PRS\b': 'PRESSURE',
r'\bCLR\b\b': 'CLEAR',
r'\bENG\.\b': 'ENGINE',
r'\bENG\b\b': 'ENGINE',
r'\bENGINE\b\b': 'ENGINE',
r'\bEngine speed\b\b': 'ENGINE SPEED',
r'\bEngine running\b\b': 'ENGINE RUNNING',
r'\bEngine RPM pickup\b\b': 'ENGINE RPM PICKUP',
r'\bEngine room\b\b': 'ENGINE ROOM',
# main engine
r'\bM/E\b': 'MAIN_ENGINE',
r'\bME\b': 'MAIN_ENGINE',
r'\bMAIN ENGINE\b': 'MAIN_ENGINE',
r'\bGen\b': 'GENERATOR_ENGINE',
r'\bGE\b': 'GENERATOR_ENGINE',
r'\bM_E\b': 'MAIN_ENGINE',
r'\bME(?=\d|\W|$)\b': 'MAIN_ENGINE',
r'\bMAIN ENGINE\b\b': 'MAIN_ENGINE',
r'\bGen\b\b': 'GENERATOR_ENGINE',
# ensure that we substitute only where GE is followed by a number or a special character
r'\bGE(?=\d|\W|$)\b': 'GENERATOR_ENGINE',
r'\bG/E\b': 'GENERATOR_ENGINE',
r'\bDG': 'GENERATOR_ENGINE',
r'\bD/G\b': 'GENERATOR_ENGINE',
r'\bGEN\.': 'GENERATOR_ENGINE',
r'\bGENERATOR ENGINE\B': 'GENERATOR_ENGINE',
r'\bGEN\.WIND\.TEMP\b': 'GENERATOR WINDING TEMPERATURE',
r'\bENGINE ROOM\b': 'ENGINE ROOM',
r'\bE/R\b': 'ENGINE ROOM',
r'\bNO1\b': 'NO.1',
r'\bNO\.1\b': 'NO.1',
r'\bNo\.1\b': 'NO.1',
r'\bNO2\b': 'NO.2',
r'\bNO\.2\b': 'NO.2',
r'\bNo\.2\b': 'NO.2',
r'\bNO3\b': 'NO.3',
r'\bNO\.3\b': 'NO.3',
r'\bNo\.3\b': 'NO.3',
r'\bNO4\b': 'NO.4',
r'\bNO\.4\b': 'NO.4',
r'\bNo\.4\b': 'NO.4',
r'\bNO5\b': 'NO.5',
r'\bNO\.5\b': 'NO.5',
r'\bNo\.5\b': 'NO.5',
r'\bFLTR\b': 'FILTER',
r'\bLUB\.': 'LUBRICANT',
r'\bM\.G\.O\b': 'MGO',
r'\bMGO\b': 'MGO',
r'\bF\.O\b': 'FUEL OIL',
r'\bFO\b': 'FUEL OIL',
r'\bL\.T\b': 'LOW TEMPERATURE',
r'\bLT\b': 'LOW TEMPERATURE',
r'\bH\.T\b': 'HIGH TEMPERATURE',
r'\bHT\b': 'HIGH TEMPERATURE',
r'\bAUX\.': 'AUXILIARY',
r'\bNO\.2A\b': 'NO.2A',
r'\bNO\.2B\b': 'NO.2B',
r'\bAUX\.BOILER\b': 'AUXILIARY BOILER',
r'\bAUX\. BOILER\b': 'AUXILIARY BOILER',
r'\bWIND\.': 'WINDING',
r'\bWINDING\b': 'WINDING',
r'\bC\.S\.W\b': 'CSW',
r'\bCSW\b': 'CSW',
r'\bVLOT\.': 'VOLTAGE',
r'\bVOLTAGE\b': 'VOLTAGE',
r'\bVOLT\.': 'VOLTAGE',
r'\bFREQ\.': 'FREQUENCY',
r'\bFREQUENCY\b': 'FREQUENCY',
r'\bCURR\.': 'CURRENT',
r'\bCURRENT\b': 'CURRENT',
r'\bH\.F\.O\.': 'HFO',
r'\bTCA\b': 'TURBOCHARGER',
r'\bTCB\b': 'TURBOCHARGER',
r'\bG_E\b': 'GENERATOR_ENGINE',
r'\bDG\b': 'GENERATOR_ENGINE',
r'\bD/G\b\b': 'GENERATOR_ENGINE',
r'\bGEN\.\b': 'GENERATOR_ENGINE',
r'\bGENERATOR ENGINE\B\b': 'GENERATOR_ENGINE',
r'\b(\d+)MGE\b\b': r'NO\1 GENERATOR_ENGINE',
r'\bGEN\.WIND\.TEMP\b\b': 'GENERATOR WINDING TEMPERATURE',
r'\bENGINE ROOM\b\b': 'ENGINE ROOM',
r'\bE/R\b\b': 'ENGINE ROOM',
r'\bFLTR\b\b': 'FILTER',
# marine gas oil
r'\bM\.G\.O\b\b': 'MARINE GAS OIL',
r'\bMGO\b\b': 'MARINE GAS OIL',
r'\bMDO\b\b': 'MARINE DIESEL OIL',
# light fuel oil
r'\bL\.F\.O\b\b': 'LIGHT FUEL OIL',
r'\bLFO\b\b': 'LIGHT FUEL OIL',
# heavy fuel oil
r'\bHFO\b\b': 'HEAVY FUEL OIL',
r'\bH\.F\.O\b\b': 'HEAVY FUEL OIL',
# for remaining fuel oil that couldn't be substituted
r'\bF\.O\b\b': 'FUEL OIL',
r'\bFO\b\b': 'FUEL OIL',
# lubricant
r'\bLUB\.\b': 'LUBRICANT',
# lubricating oil
r'\bL\.O\b\b': 'LUBRICATING OIL',
r'\bLO\b\b': 'LUBRICATING OIL',
# lubricating oil pressure
r'\bLO_PRESS\b\b': 'LUBRICATING OIL PRESSURE',
r'\bLO_PRESSURE\b\b': 'LUBRICATING OIL PRESSURE',
# temperature
r'\bL\.T\b\b': 'LOW TEMPERATURE',
r'\bLT\b\b': 'LOW TEMPERATURE',
r'\bH\.T\b\b': 'HIGH TEMPERATURE',
r'\bHT\b\b': 'HIGH TEMPERATURE',
# auxiliary boiler
# replace these first before replacing AUXILIARY only
r'\bAUX\.BOILER\b\b': 'AUXILIARY BOILER',
r'\bAUX\. BOILER\b\b': 'AUXILIARY BOILER',
r'\bAUX BLR\b\b': 'AUXILIARY BOILER',
r'\bAUX\.\b': 'AUXILIARY',
# composite boiler
r'\bCOMP\. BOILER\b\b': 'COMPOSITE BOILER',
r'\bCOMP BOILER\b\b': 'COMPOSITE BOILER',
r'\bWIND\.\b': 'WINDING',
r'\bWINDING\b\b': 'WINDING',
r'\bC\.S\.W\b\b': 'CSW',
r'\bCSW\b\b': 'CSW',
r'\bVLOT\.\b': 'VOLTAGE',
r'\bVOLTAGE\b\b': 'VOLTAGE',
r'\bVOLT\.\b': 'VOLTAGE',
r'\bFREQ\.\b': 'FREQUENCY',
r'\bFREQUENCY\b\b': 'FREQUENCY',
r'\bCURR\.\b': 'CURRENT',
r'\bCURRENT\b\b': 'CURRENT',
r'\bTCA\b\b': 'TURBOCHARGER',
r'\bTCB\b\b': 'TURBOCHARGER',
r'\bT/C\b': 'TURBOCHARGER',
r'\bTC\b': 'TURBOCHARGER',
r'\bTURBOCHAGER\b': 'TURBOCHARGER',
r'\bTURBOCHARGER\b': 'TURBOCHARGER'
r'\bT_C\b': 'TURBOCHARGER',
r'\bTC(?=\d|\W|$)\b': 'TURBOCHARGER',
r'\bTURBOCHAGER\b\b': 'TURBOCHARGER',
r'\bTURBOCHARGER\b\b': 'TURBOCHARGER',
# misc spelling errors
r'\bOPERATOIN\b': 'OPERATION',
# additional standardizing replacement
# replace # followed by a number with NO
r'#(?=\d)\b': 'NO',
r'\bNO\.(?=\d)\b': 'NO',
# yes, one entry even had two dots
r'\bNO\.\.(?=\d)\b': 'NO',
r'\bNo\.(?=\d)\b': 'NO',
}
# substitution mapping for units
# Abbreviations and their replacements
unit_replacement_dict = {
r'\b%\b': 'PERCENT',
r'\b-\b': '',
r'\b- \b': '',
# match A only when not followed by a word character or '/'
r'\bA(?!\w|/)': 'CURRENT',
r'\bAmp(?!\w|/)': 'CURRENT',
r'\bHz\b': 'HERTZ',
r'\bKG/CM2\b': 'PRESSURE',
r'\bKG/H\b': 'KILOGRAM PER HOUR',
r'\bKNm\b': 'RPM',
r'\bKW\b': 'POWER',
r'\bKg(?!\w|/)': 'MASS',
r'\bKw\b': 'POWER',
r'\bL(?!\w|/)': 'VOLUME',
r'\bMT/h\b': 'METRIC TONNES PER HOUR',
r'\bMpa\b': 'PRESSURE',
r'\bPF\b': 'POWER FACTOR',
r'\bRPM\b': 'RPM',
r'\bV(?!\w|/)': 'VOLTAGE',
r'\bbar(?!\w|/)': 'PRESSURE',
r'\bbarA\b': 'SCAVENGE PRESSURE',
r'\bcST\b': 'VISCOSITY',
r'\bcSt\b': 'VISCOSITY',
r'\bcst\b': 'VISCOSITY',
r'\bdeg(?!\w|/|\.)': 'DEGREE',
r'\bdeg.C\b': 'TEMPERATURE',
r'\bdegC\b': 'TEMPERATURE',
r'\bdegree\b': 'DEGREE',
r'\bdegreeC\b': 'TEMPERATURE',
r'\bhPa\b': 'PRESSURE',
r'\bhours\b': 'HOURS',
r'\bkN\b': 'THRUST',
r'\bkNm\b': 'TORQUE',
r'\bkW\b': 'POWER',
# ensure that kg is not followed by a word character or '/'
r'\bkg(?!\w|/)': 'FLOW', # somehow in the data it's flow
r'\bkg/P\b': 'MASS FLOW',
r'\bkg/cm2\b': 'PRESSURE',
r'\bkg/cm²\b': 'PRESSURE',
r'\bkg/h\b': 'MASS FLOW',
r'\bkg/hr\b': 'MASS FLOW',
r'\bkg/pulse\b': '',
r'\bkgf/cm2\b': 'PRESSURE',
r'\bkgf/cm²\b': 'PRESSURE',
r'\bkgf/㎠\b': 'PRESSURE',
r'\bknots\b': 'SPEED',
r'\bkw\b': 'POWER',
r'\bl/Hr\b': 'VOLUME FLOW',
r'\bl/h\b': 'VOLUME FLOW',
r'\bl_Hr\b': 'VOLUME FLOW',
r'\bl_hr\b': 'VOLUME FLOW',
r'\bM\b': 'DRAFT', # for wind draft
r'm': 'm', # wind draft and trim - not useful
r'\bm/s\b': 'SPEED',
r'\bm3\b': 'VOLUME',
r'\bmH2O\b': 'DRAFT',
r'\bmWC\b': 'DRAFT',
r'\bmbar\b': 'PRESSURE',
r'\bmg\b': 'ACCELERATION',
r'\bmin-¹\b': '', # data too varied
r'\bmm\b': '', # data too varied
r'\bmmH2O\b': 'WATER DRUM LEVEL',
r'\brev\b': 'RPM',
r'\brpm\b': 'RPM',
r'\bx1000min-¹\b': '',
r'\b°C\b': 'TEMPERATURE',
r'\bºC\b': 'TEMPERATURE',
r'\b℃\b': 'TEMPERATURE'
}

View File

@ -0,0 +1,58 @@
# %%
import pandas as pd
import re
import os
# Get the current working directory
current_path = os.getcwd()
print(current_path)
# %%
file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location
old_df = pd.read_csv(file_path)
new_df = pd.read_csv('../exports/preprocessed_data.csv')
# %%
# compare changed rows
cond = old_df['tag_description'] != new_df['tag_description']
val1 = old_df[cond]['tag_description']
val2 = new_df[cond]['tag_description']
df = pd.DataFrame({
'column1': val1,
'column2': val2
})
df.to_csv('desc.csv')
# %%
# compare changed rows
cond = old_df['unit'] != new_df['unit']
val1 = old_df[cond]['unit']
val2 = new_df[cond]['unit']
df = pd.DataFrame({
'column1': val1,
'column2': val2
})
df.to_csv('unit.csv')
# %%
set(val2)
# %%
desc_set = list(set(new_df[new_df['MDM']]['tag_description']))  # use new_df here because df was reassigned to the comparison frame above
with open('output.txt', 'w') as file:
print(desc_set, file=file)
# %%
test = 'kg/cm3'
print(re.sub(r'kg(?!\w|/)', 'flow', test))
# %%

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -16,9 +16,6 @@ from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
Trainer,
EarlyStoppingCallback,
TrainingArguments
)
import evaluate
import numpy as np
@ -56,13 +53,15 @@ def process_df_to_dict(df, mdm_list):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
pattern = row['pattern']
try:
index = mdm_list.index(pattern)
except ValueError:
index = -1
element = {
'text' : f"{desc}",
'text' : f"{desc}{unit}",
'label': index,
}
output_list.append(element)
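# for a hypothetical row with tag_description "MAIN_ENGINE CYLINDER1 EXHAUST GAS
# TEMPERATURE", unit "TEMPERATURE" and a pattern sitting at position 42 of
# mdm_list (made-up values), the element becomes:
# {'text': '<DESC>MAIN_ENGINE CYLINDER1 EXHAUST GAS TEMPERATURE<DESC><UNIT>TEMPERATURE<UNIT>',
#  'label': 42}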
@ -84,145 +83,146 @@ def create_dataset(fold, mdm_list):
# %%
# function to perform training for a given fold
# def train(fold):
fold = 1
def test(fold):
test_dataset = create_dataset(fold, mdm_list)
test_dataset = create_dataset(fold, mdm_list)
# prepare tokenizer
# prepare tokenizer
checkpoint_directory = f'../checkpoint_fold_{fold}'
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
checkpoint_directory = f'../checkpoint_fold_{fold}'
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
# %%
# compute max token length
max_length = 0
for sample in test_dataset['text']:
# Tokenize the sample and get the length
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
length = len(input_ids)
# %%
# compute max token length
max_length = 0
for sample in test_dataset['text']:
# Tokenize the sample and get the length
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
length = len(input_ids)
# Update max_length if this sample is longer
if length > max_length:
max_length = length
# Update max_length if this sample is longer
if length > max_length:
max_length = length
print(max_length)
print(max_length)
# %%
# %%
max_length = 64
max_length = 64
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
# truncation=True,
padding='max_length'
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
# truncation=True,
padding='max_length'
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
datasets = test_dataset.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
datasets = test_dataset.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# %% temp
# tokenized_datasets['train'].rename_columns()
# %% temp
# tokenized_datasets['train'].rename_columns()
# %%
# create data collator
# %%
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
# %%
# compute metrics
# metric = evaluate.load("accuracy")
#
#
# def compute_metrics(eval_preds):
# preds, labels = eval_preds
# preds = np.argmax(preds, axis=1)
# return metric.compute(predictions=preds, references=labels)
# %%
# compute metrics
# metric = evaluate.load("accuracy")
#
#
# def compute_metrics(eval_preds):
# preds, labels = eval_preds
# preds = np.argmax(preds, axis=1)
# return metric.compute(predictions=preds, references=labels)
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint,
num_labels=len(mdm_list),
id2label=id2label,
label2id=label2id)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint,
num_labels=len(mdm_list),
id2label=id2label,
label2id=label2id)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
model = model.eval()
model = model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
pred_labels = []
actual_labels = []
pred_labels = []
actual_labels = []
BATCH_SIZE = 64
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
for batch in tqdm(dataloader):
# Inference in batches
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
# save labels too
actual_labels.extend(batch['label'])
BATCH_SIZE = 64
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
for batch in tqdm(dataloader):
# Inference in batches
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
# save labels too
actual_labels.extend(batch['label'])
# Move to GPU if available
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
# Move to GPU if available
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
# Perform inference
with torch.no_grad():
logits = model(
input_ids,
attention_mask).logits
predicted_class_ids = logits.argmax(dim=1).to("cpu")
pred_labels.extend(predicted_class_ids)
# Perform inference
with torch.no_grad():
logits = model(
input_ids,
attention_mask).logits
predicted_class_ids = logits.argmax(dim=1).to("cpu")
pred_labels.extend(predicted_class_ids)
pred_labels = [tensor.item() for tensor in pred_labels]
# %%
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
y_true = actual_labels
y_pred = pred_labels
# Compute metrics
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
# Print the results
print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
pred_labels = [tensor.item() for tensor in pred_labels]
# %%
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
y_true = actual_labels
y_pred = pred_labels
# Compute metrics
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
# Print the results
print(f'Accuracy: {accuracy:.5f}')
print(f'F1 Score: {f1:.5f}')
print(f'Precision: {precision:.5f}')
print(f'Recall: {recall:.5f}')
# %%
for fold in [1,2,3,4,5]:
test(fold)

View File

@ -53,13 +53,15 @@ def process_df_to_dict(df, mdm_list):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
pattern = row['pattern']
try:
index = mdm_list.index(pattern)
except ValueError:
index = -1
element = {
'text' : f"{desc}",
'text' : f"{desc}{unit}",
'label': index,
}
output_list.append(element)
@ -69,7 +71,7 @@ def process_df_to_dict(df, mdm_list):
def create_split_dataset(fold, mdm_list):
# train
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
# valid
@ -86,126 +88,124 @@ def create_split_dataset(fold, mdm_list):
# %%
# function to perform training for a given fold
# def train(fold):
fold = 1
def train(fold):
save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold, mdm_list)
save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold, mdm_list)
# prepare tokenizer
# prepare tokenizer
model_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
# additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
model_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
max_length = 120
max_length = 120
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
truncation=True,
padding=True
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
truncation=True,
padding=True
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
# %% temp
# tokenized_datasets['train'].rename_columns()
# %% temp
# tokenized_datasets['train'].rename_columns()
# %% temp
tokenized_datasets['train']['input_ids']
# %%
# create data collator
# %%
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# %%
# compute metrics
metric = evaluate.load("accuracy")
# %%
# compute metrics
metric = evaluate.load("accuracy")
def compute_metrics(eval_preds):
preds, labels = eval_preds
preds = np.argmax(preds, axis=1)
return metric.compute(predictions=preds, references=labels)
def compute_metrics(eval_preds):
preds, labels = eval_preds
preds = np.argmax(preds, axis=1)
return metric.compute(predictions=preds, references=labels)
# %%
# create id2label and label2id
# %%
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint,
num_labels=len(mdm_list),
id2label=id2label,
label2id=label2id)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
# model = torch.compile(model, backend="inductor", dynamic=True)
# %%
# Trainer
training_args = TrainingArguments(
output_dir=f"{save_path}",
eval_strategy="epoch",
logging_dir="tensorboard-log",
logging_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
learning_rate=2e-5,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
bf16=True,
push_to_hub=False,
remove_unused_columns=False,
)
trainer = Trainer(
model,
training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# uncomment to load training from checkpoint
# checkpoint_path = 'default_40_1/checkpoint-5600'
# trainer.train(resume_from_checkpoint=checkpoint_path)
trainer.train()
# # execute training
# for fold in [1,2,3,4,5]:
# print(fold)
# train(fold)
# %%
# create id2label and label2id
# %%
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint,
num_labels=len(mdm_list),
id2label=id2label,
label2id=label2id)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
# model = torch.compile(model, backend="inductor", dynamic=True)
# %%
# Trainer
training_args = TrainingArguments(
output_dir=f"{save_path}",
# eval_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log",
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-5,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=80,
bf16=True,
push_to_hub=False,
remove_unused_columns=False,
)
trainer = Trainer(
model,
training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# uncomment to load training from checkpoint
# checkpoint_path = 'default_40_1/checkpoint-5600'
# trainer.train(resume_from_checkpoint=checkpoint_path)
trainer.train()
# execute training
for fold in [1,2,3,4,5]:
print(fold)
train(fold)
# %%

View File

@ -52,11 +52,17 @@ class Inference():
print("preparing dataloader")
# convert each dataframe row into a dictionary
# outputs a list of dictionaries
def _process_df(df):
output_list = [{
'input': f"<DESC>{row['tag_description']}<DESC>",
'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
} for _, row in df.iterrows()]
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
element = {
'input' : f"{desc}{unit}",
'output': f"<THING_START>{row['thing_pattern']}<THING_END><PROPERTY_START>{row['property_pattern']}<PROPERTY_END>",
}
output_list.append(element)
return output_list
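# for a hypothetical row with tag_description "MAIN_ENGINE CYLINDER1 EXHAUST GAS
# TEMPERATURE", unit "TEMPERATURE", thing_pattern "ME1Turbine" and
# property_pattern "ExhGasTemp" (made-up values), one element becomes:
# {'input': '<DESC>MAIN_ENGINE CYLINDER1 EXHAUST GAS TEMPERATURE<DESC><UNIT>TEMPERATURE<UNIT>',
#  'output': '<THING_START>ME1Turbine<THING_END><PROPERTY_START>ExhGasTemp<PROPERTY_END>'}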

View File

@ -1,6 +1,6 @@
Accuracy for fold 1: 0.943208707998107
Accuracy for fold 2: 0.9214953271028037
Accuracy for fold 3: 0.9728915662650602
Accuracy for fold 4: 0.967174119885823
Accuracy for fold 5: 0.9097572148419606
Accuracy for fold 1: 0.9687647893989588
Accuracy for fold 2: 0.9565420560747664
Accuracy for fold 3: 0.9708835341365462
Accuracy for fold 4: 0.9881065651760228
Accuracy for fold 5: 0.9225836005497022

View File

@ -32,8 +32,9 @@ def process_df_to_dict(df):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
element = {
'input' : f"{desc}",
'input' : f"{desc}{unit}",
'output': f"<THING_START>{row['thing_pattern']}<THING_END><PROPERTY_START>{row['property_pattern']}<PROPERTY_END>",
}
output_list.append(element)
@ -43,7 +44,7 @@ def process_df_to_dict(df):
def create_split_dataset(fold):
# train
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
# valid
@ -150,11 +151,12 @@ def train(fold):
args = Seq2SeqTrainingArguments(
f"{save_path}",
eval_strategy="epoch",
# eval_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log",
logging_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
@ -162,7 +164,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=20,
num_train_epochs=40,
predict_with_generate=True,
bf16=True,
push_to_hub=False,