Feat: added abbreviation expansion rules
This commit is contained in:
parent
59bbf1f403
commit
2b5994cb52
@ -0,0 +1 @@
__pycache__
@ -0,0 +1,285 @@
# %%
import pandas as pd
from utils import Retriever, cosine_similarity_chunked
import os
import glob
import numpy as np

# %%
fold = 5
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
df = pd.read_csv(data_path, skipinitialspace=True)

# %%
# subset to mdm
df = df[df['MDM']]

thing_condition = df['p_thing'] == df['thing_pattern']
error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern', 'p_thing']]

property_condition = df['p_property'] == df['property_pattern']
error_property_df = df[~property_condition][['tag_description', 'property_pattern', 'p_property']]

correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]

test_df = df

# %%
print(len(error_thing_df))
print(len(error_property_df))

# %%
# thing_df.to_html('thing_errors.html')
# property_df.to_html('property_errors.html')

##########################################
# what we need now is to understand why the model is making these mispredictions
# import train data and test data
# %%
class Embedder():
    input_df: pd.DataFrame
    fold: int

    def __init__(self, input_df):
        self.input_df = input_df

    def make_embedding(self, checkpoint_path):

        def generate_input_list(df):
            input_list = []
            for _, row in df.iterrows():
                desc = f"<DESC>{row['tag_description']}<DESC>"
                unit = f"<UNIT>{row['unit']}<UNIT>"
                element = f"{desc}{unit}"
                input_list.append(element)
            return input_list

        # prepare reference embeddings
        train_data = list(generate_input_list(self.input_df))
        # Define the directory and the pattern
        retriever_train = Retriever(train_data, checkpoint_path)
        retriever_train.make_embedding(batch_size=64)
        return retriever_train.embeddings.to('cpu')


# %%
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)

checkpoint_directory = "../../train/classification_bert"
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]

train_embedder = Embedder(input_df=train_df)
train_embeds = train_embedder.make_embedding(checkpoint_path)

test_embedder = Embedder(input_df=test_df)
test_embeds = test_embedder.make_embedding(checkpoint_path)

# %%
# test embeds are inputs since we are looking back at train data
cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy()

# %%
# the following function takes in a full cos_sim_matrix
# condition_source: boolean selector of the source embeddings (rows)
# condition_target: boolean selector of the target embeddings (columns)
def find_closest(cos_sim_matrix, condition_source, condition_target):
    # we are subsetting a 2D matrix by (row, column) selectors
    subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
    # we select top k here
    # Get the indices of the top k maximum values along axis 1
    top_k = 3
    top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:]  # Get indices of top k values
    # note that top_k_indices is a nested array because of the 2d nature of the matrix
    # argsort is ascending, so flip the first row into descending order
    top_k_indices[0] = top_k_indices[0][::-1]

    # Get the values of the top k maximum scores
    top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)

    return top_k_indices, top_k_values


####################################################
# special find-back code
# %%
def find_back_element_with_print(select_idx):
    condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0]
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)

    top_k_indices, top_k_values = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)

    training_data_pattern_list = train_df.iloc[top_k_indices[0]]['pattern'].to_list()
    training_desc_list = train_df.iloc[top_k_indices[0]]['tag_description'].to_list()

    test_data_pattern_list = test_df[test_df.index == select_idx]['pattern'].to_list()
    test_desc_list = test_df[test_df.index == select_idx]['tag_description'].to_list()
    test_ship_id = test_df[test_df.index == select_idx]['ships_idx'].to_list()[0]
    predicted_test_data = test_df[test_df.index == select_idx]['p_thing'] + ' ' + test_df[test_df.index == select_idx]['p_property']
    predicted_test_data = predicted_test_data.to_list()[0]

    print("*" * 80)
    print("idx:", select_idx)
    print("train desc", training_desc_list)
    print("train thing+property", training_data_pattern_list)
    print("test desc", test_desc_list)
    print("test thing+property", test_data_pattern_list)
    print("predicted thing+property", predicted_test_data)
    print("ships idx", test_ship_id)
    print("score:", top_k_values[0])

    test_pattern = test_data_pattern_list[0]

    find_back_list = [test_pattern in pattern for pattern in training_data_pattern_list]

    if sum(find_back_list) > 0:
        return True
    else:
        return False


# %%
def find_back_element(select_idx):
    condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0]
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)

    top_k_indices, top_k_values = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)

    training_data_pattern_list = train_df.iloc[top_k_indices[0]]['pattern'].to_list()

    test_data_pattern_list = test_df[test_df.index == select_idx]['pattern'].to_list()

    # print(training_data_pattern_list)
    # print(test_data_pattern_list)

    test_pattern = test_data_pattern_list[0]

    find_back_list = [test_pattern in pattern for pattern in training_data_pattern_list]

    if sum(find_back_list) > 0:
        return True
    else:
        return False


# %%
# for error thing
pattern_in_train = []
for select_idx in error_thing_df.index:
    result = find_back_element_with_print(select_idx)
    print("status:", result)
    pattern_in_train.append(result)

sum(pattern_in_train)/len(pattern_in_train)

###
# for error property
# %%
pattern_in_train = []
for select_idx in error_property_df.index:
    result = find_back_element_with_print(select_idx)
    print("status:", result)
    pattern_in_train.append(result)

sum(pattern_in_train)/len(pattern_in_train)


####################################################

# %%
# function to compute the similarity of the closest retrieved results
def compute_similarity(select_idx):
    condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0]
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
    top_k_indices, top_k_values = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)

    return np.mean(top_k_values[0])

# %%
def print_summary(similarity_scores):
    # Convert list to numpy array for additional stats
    np_array = np.array(similarity_scores)

    # Get stats
    mean_value = np.mean(np_array)
    percentiles = np.percentile(np_array, [25, 50, 75])  # 25th, 50th, and 75th percentiles

    # Display numpy results
    print("Mean:", mean_value)
    print("25th, 50th, 75th Percentiles:", percentiles)


# %%
##########################################
# Analyze the degree of similarity differences between correct and incorrect results

# %%
# compute similarity scores for all values in error_thing_df
similarity_thing_scores = []
for idx in error_thing_df.index:
    similarity_thing_scores.append(compute_similarity(idx))
print_summary(similarity_thing_scores)


# %%
similarity_property_scores = []
for idx in error_property_df.index:
    similarity_property_scores.append(compute_similarity(idx))
print_summary(similarity_property_scores)

# %%
similarity_correct_scores = []
for idx in correct_df.index:
    similarity_correct_scores.append(compute_similarity(idx))
print_summary(similarity_correct_scores)


# %%
import matplotlib.pyplot as plt

# Sample data
list1 = similarity_thing_scores
list2 = similarity_property_scores
list3 = similarity_correct_scores

# Plot histograms
bins = 50
plt.hist(list1, bins=bins, alpha=0.5, label='List 1', density=True)
plt.hist(list2, bins=bins, alpha=0.5, label='List 2', density=True)
plt.hist(list3, bins=bins, alpha=0.5, label='List 3', density=True)

# Labels and legend
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend(loc='upper right')
plt.title('Histograms of Three Lists')

# Show plot
plt.show()

# %%
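A small sanity check of find_closest on a hand-made similarity matrix (the numbers below are invented purely for illustration and are not from the project data):

import numpy as np

# toy 2x3 "cosine similarity" matrix: 2 test rows, 3 train columns
toy_matrix = np.array([
    [0.1, 0.9, 0.3],
    [0.8, 0.2, 0.7],
])
keep_sources = np.array([True, True])          # keep both source rows
keep_targets = np.array([True, True, True])    # keep all target columns

idx, val = find_closest(toy_matrix, keep_sources, keep_targets)
# only the first row is flipped into descending order by the function above,
# so idx[0] == [1, 2, 0] and val[0] == [0.9, 0.3, 0.1]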
@ -0,0 +1,71 @@
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import torch.nn.functional as F


class Retriever:
    def __init__(self, input_texts, model_checkpoint):
        # we need to generate the embedding from a list of input strings
        self.embeddings = []
        self.inputs = input_texts
        model_checkpoint = model_checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)

        model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # device = "cpu"
        model.to(self.device)
        self.model = model.eval()

    def make_embedding(self, batch_size=64):
        all_embeddings = self.embeddings
        input_texts = self.inputs

        for i in range(0, len(input_texts), batch_size):
            batch_texts = input_texts[i:i+batch_size]
            # Tokenize the input text
            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64)
            input_ids = inputs.input_ids.to(self.device)
            attention_mask = inputs.attention_mask.to(self.device)

            # Pass the input through the encoder and retrieve the embeddings
            with torch.no_grad():
                encoder_outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
                # get last layer
                embeddings = encoder_outputs.hidden_states[-1]
                # get cls token embedding
                cls_embeddings = embeddings[:, 0, :]  # Shape: (batch_size, hidden_size)
                all_embeddings.append(cls_embeddings)

        # remove the batch list and make a single large tensor; dim=0 increases row-wise
        all_embeddings = torch.cat(all_embeddings, dim=0)

        self.embeddings = all_embeddings


def cosine_similarity_chunked(batch1, batch2, chunk_size=16):
    batch1_size = batch1.size(0)
    batch2_size = batch2.size(0)

    # Prepare an empty tensor to store results
    cos_sim = torch.empty(batch1_size, batch2_size, device=batch1.device)

    # Process batch1 in chunks
    for i in range(0, batch1_size, chunk_size):
        batch1_chunk = batch1[i:i + chunk_size]  # Get chunk of batch1

        # Expand batch1 chunk and entire batch2 for comparison
        batch1_chunk_exp = batch1_chunk.unsqueeze(1)  # Shape: (chunk_size, 1, hidden_size)
        batch2_exp = batch2.unsqueeze(0)  # Shape: (1, batch2_size, hidden_size)

        # Compute cosine similarity for the chunk and store it in the final tensor
        cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1)

    return cos_sim
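A minimal usage sketch of the Retriever and cosine_similarity_chunked utilities defined above; the checkpoint path and the input strings below are placeholders, not values taken from this repository:

from utils import Retriever, cosine_similarity_chunked

checkpoint = "checkpoint_fold_1/checkpoint-1000"  # placeholder path
train_texts = ["<DESC>EXHAUST GAS OUTLET TEMPERATURE<DESC><UNIT>TEMPERATURE<UNIT>"]
test_texts = ["<DESC>EXH. GAS OUT TEMP<DESC><UNIT>degC<UNIT>"]

train_retriever = Retriever(train_texts, checkpoint)
train_retriever.make_embedding(batch_size=64)

test_retriever = Retriever(test_texts, checkpoint)
test_retriever.make_embedding(batch_size=64)

# (n_test, n_train) similarity matrix; chunking keeps peak memory bounded
sims = cosine_similarity_chunked(
    test_retriever.embeddings,
    train_retriever.embeddings,
    chunk_size=8,
)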
@ -0,0 +1,63 @@
# we want to compare the labels between the train data and test data
# %%
import pandas as pd

#########################
# experiment 1

#############
# 1 import test data
# %%
fold = 1
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
df = pd.read_csv(data_path, skipinitialspace=True)

# %%
# subset to mdm
df = df[df['MDM']]

thing_condition = df['p_thing'] == df['thing_pattern']
error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern', 'p_thing']]

property_condition = df['p_property'] == df['property_pattern']
error_property_df = df[~property_condition][['tag_description', 'property_pattern', 'p_property']]

correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]

test_df = df

# %%
test_pattern = df['thing_pattern'] + ' ' + df['property_pattern']

##########################
# 2 import train data
# %%
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path)

train_pattern = train_df['pattern']

# %%
test_pattern_set = set(test_pattern)
train_pattern_set = set(train_pattern)

# %%
# use this to get labels in test not found in training data
test_pattern_set - train_pattern_set

# verdict: we see that FOMassFlowTotal is not found in the training set
# hence it is not possible for this label to be classified correctly


###################################
# experiment 2
# %%
# we want to check Load and LoadPercent
test_df[test_df['property_pattern'] == 'Load']
# %%
test_df[test_df['property_pattern'] == 'LoadPercent']

# verdict: we see that the unit column determines which of these it should be
# in order to not disturb the model, we should push this into post-processing
@ -0,0 +1,77 @@
# we want to compare the labels between the train data and test data
# %%
import pandas as pd

# %%
file_path = '../../data_import/exports/raw_data.csv'  # Adjust this path to your actual file location
df = pd.read_csv(file_path)
df = df[df['MDM']]

# %%
unit_list = df['unit']
unit_list = [elem if (isinstance(elem, str)) else '' for elem in unit_list]
print(sorted(list(set(unit_list))))

# %%
test = '℃'
# df[df['unit'] == test]['property_pattern'].to_list()
df[df['unit'] == test]

#############
# 1 import test data
# %%
fold = 1
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
df = pd.read_csv(data_path, skipinitialspace=True)

# %%
# subset to mdm
df = df[df['MDM']]

thing_condition = df['p_thing'] == df['thing_pattern']
error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern', 'p_thing']]

property_condition = df['p_property'] == df['property_pattern']
error_property_df = df[~property_condition][['tag_description', 'property_pattern', 'p_property']]

correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]

test_df = df

# %%
test_pattern = df['thing_pattern'] + ' ' + df['property_pattern']

##########################
# 2 import train data
# %%
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path)

train_pattern = train_df['pattern']

# %%
test_pattern_set = set(test_pattern)
train_pattern_set = set(train_pattern)

# %%
# use this to get labels in test not found in training data
test_pattern_set - train_pattern_set

# verdict: we see that FOMassFlowTotal is not found in the training set
# hence it is not possible for this label to be classified correctly


###################################
# experiment 2
# %%
# we want to check Load and LoadPercent
test_df[test_df['property_pattern'] == 'Load']
# %%
test_df[test_df['property_pattern'] == 'LoadPercent']

#
set(df['unit'])

# %%
@ -2,6 +2,8 @@

Perform substitutions on common terms to standardize abbreviations.

- abbreviations_replacer.py: replaces abbreviations with full terms

## Instructions:

- `python abbreviations_replacer.py`
@ -5,7 +5,7 @@ Modified by: Richard Wong
# %%
import re
import pandas as pd
-from replacement_dict import replacement_dict
+from replacement_dict import desc_replacement_dict, unit_replacement_dict

# %%
def count_abbreviation_occurrences(tag_descriptions, abbreviation):
@ -24,6 +24,22 @@ def replace_abbreviations(tag_descriptions, abbreviations):
        replaced_descriptions.append(description)
    return replaced_descriptions

+def cleanup_spaces(tag_descriptions):
+    # Replace all whitespace with a single space
+    replaced_descriptions = []
+    for description in tag_descriptions:
+        description_clean = re.sub(r'\s+', ' ', description)
+        replaced_descriptions.append(description_clean)
+    return replaced_descriptions
+
+# remove all dots
+def cleanup_dots(tag_descriptions):
+    replaced_descriptions = []
+    for description in tag_descriptions:
+        description_clean = re.sub(r'\.', '', description)
+        replaced_descriptions.append(description_clean)
+    return replaced_descriptions
+
# %%
file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location
@ -31,16 +47,32 @@ df = pd.read_csv(file_path)

# %%
# Replace abbreviations
-print("running substitution")
+print("running substitution for descriptions")
df['tag_description']= df['tag_description'].fillna("NOVALUE")
# Replace whitespace-only entries with "NOVALUE"
# note that "N/A" can be read as nan
+# replace whitespace only values as NOVALUE
df['tag_description'] = df['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True)
tag_descriptions = df['tag_description']
-replaced_descriptions = replace_abbreviations(tag_descriptions, replacement_dict)
+replaced_descriptions = replace_abbreviations(tag_descriptions, desc_replacement_dict)
+replaced_descriptions = cleanup_spaces(replaced_descriptions)
+replaced_descriptions = cleanup_dots(replaced_descriptions)
+df["tag_description"] = replaced_descriptions
# print("Descriptions after replacement:", replaced_descriptions)
+# strip trailing whitespace
+df['tag_description'] = df['tag_description'].str.rstrip()
+df['tag_description'] = df['tag_description'].str.upper()

# %%
-df["tag_description"] = replaced_descriptions
+print("running substitutions for units")
+df['unit'] = df['unit'].fillna("NOVALUE")
+df['unit'] = df['unit'].replace(r'^\s*$', 'NOVALUE', regex=True)
+unit_list = df['unit']
+new_unit = replace_abbreviations(unit_list, unit_replacement_dict)
+new_unit = cleanup_spaces(new_unit)
+df['unit'] = new_unit

+# save
df.to_csv("../exports/preprocessed_data.csv", index=False)
print("file saved")
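As a quick reference for the two new cleanup helpers introduced above, a minimal sketch run in the context of abbreviations_replacer.py (the input strings are invented examples):

print(cleanup_spaces(["EXHAUST   GAS  TEMP"]))  # ['EXHAUST GAS TEMP']
print(cleanup_dots(["NO.1 EXH. GAS"]))          # ['NO1 EXH GAS']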
@ -1,113 +1,210 @@
# substitution mapping for descriptions
# Abbreviations and their replacements
desc_replacement_dict = {
    r'\bLIST\b\b': 'LIST',
    r'\bList\b\b': 'LIST',
    r'\bEXH\.\b': 'EXHAUST',
    r'\bEXH\b\b': 'EXHAUST',
    r'\bEXHAUST\.\b': 'EXHAUST',
    r'\bExhaust\b\b': 'EXHAUST',
    r'\bEXHAUST\b\b': 'EXHAUST',
    r'\bTEMP\.\b': 'TEMPERATURE',
    r'\bTEMP\b\b': 'TEMPERATURE',
    r'\bTEMPERATURE\.\b': 'TEMPERATURE',
    r'\bTEMPERATURE\b\b': 'TEMPERATURE',
    r'\bW\.\b': 'WATER',
    r'\bWATER\b\b': 'WATER',
    r'\bCW\b\b': 'COOLING WATER',
    r'\bCYL\.\b': 'CYLINDER',
    r'\bCyl\b\b': 'CYLINDER',
    r'\bcyl\.\b': 'CYLINDER',
    r'\bCYL\b\b': 'CYLINDER',
    r'\bCYL(?=\d|\W|$)\b': 'CYLINDER',
    r'\bcylinder\b\b': 'CYLINDER',
    r'\bCYLINDER\b\b': 'CYLINDER',
    r'\bCOOL\.\b': 'COOLING',
    r'\bcool\.\b': 'COOLING',
    r'\bcooling\b\b': 'COOLING',
    r'\bCOOLING\b\b': 'COOLING',
    r'\bcooler\b\b': 'COOLER',
    r'\bCOOLER\b\b': 'COOLER',
    r'\bScav\.\b': 'SCAVENGE',
    r'\bSCAV\.\b': 'SCAVENGE',
    r'\bINL\.\b': 'INLET',
    r'\binlet\b\b': 'INLET',
    r'\bINLET\b\b': 'INLET',
    r'\bOUT\.\b': 'OUTLET',
    r'\bOUTL\.\b': 'OUTLET',
    r'\boutlet\b\b': 'OUTLET',
    r'\bOUTLET\b\b': 'OUTLET',
    # pressure
    r'\bPRESS\b\b': 'PRESSURE',
    r'\bPRESS\.\b': 'PRESSURE',
    r'\bPress\.\b': 'PRESSURE',
    r'\bpressure\b\b': 'PRESSURE',
    r'\bPRESSURE\b\b': 'PRESSURE',
    # this is a special replacement - it is safe to replace PRS w/o checks
    r'PRS\b': 'PRESSURE',
    r'\bCLR\b\b': 'CLEAR',
    r'\bENG\.\b': 'ENGINE',
    r'\bENG\b\b': 'ENGINE',
    r'\bENGINE\b\b': 'ENGINE',
    r'\bEngine speed\b\b': 'ENGINE SPEED',
    r'\bEngine running\b\b': 'ENGINE RUNNING',
    r'\bEngine RPM pickup\b\b': 'ENGINE RPM PICKUP',
    r'\bEngine room\b\b': 'ENGINE ROOM',
    # main engine
    r'\bM/E\b': 'MAIN_ENGINE',
    r'\bM_E\b': 'MAIN_ENGINE',
    r'\bME(?=\d|\W|$)\b': 'MAIN_ENGINE',
    r'\bMAIN ENGINE\b\b': 'MAIN_ENGINE',
    r'\bGen\b\b': 'GENERATOR_ENGINE',
    # ensure that we substitute only for terms where following GE is num or special
    r'\bGE(?=\d|\W|$)\b': 'GENERATOR_ENGINE',
    r'\bG/E\b': 'GENERATOR_ENGINE',
    r'\bG_E\b': 'GENERATOR_ENGINE',
    r'\bDG\b': 'GENERATOR_ENGINE',
    r'\bD/G\b\b': 'GENERATOR_ENGINE',
    r'\bGEN\.\b': 'GENERATOR_ENGINE',
    r'\bGENERATOR ENGINE\B\b': 'GENERATOR_ENGINE',
    r'\b(\d+)MGE\b\b': r'NO\1 GENERATOR_ENGINE',
    r'\bGEN\.WIND\.TEMP\b\b': 'GENERATOR WINDING TEMPERATURE',
    r'\bENGINE ROOM\b\b': 'ENGINE ROOM',
    r'\bE/R\b\b': 'ENGINE ROOM',
    r'\bFLTR\b\b': 'FILTER',
    # marine gas oil
    r'\bM\.G\.O\b\b': 'MARINE GAS OIL',
    r'\bMGO\b\b': 'MARINE GAS OIL',
    r'\bMDO\b\b': 'MARINE DIESEL OIL',
    # light fuel oil
    r'\bL\.F\.O\b\b': 'LIGHT FUEL OIL',
    r'\bLFO\b\b': 'LIGHT FUEL OIL',
    # heavy fuel oil
    r'\bHFO\b\b': 'HEAVY FUEL OIL',
    r'\bH\.F\.O\b\b': 'HEAVY FUEL OIL',
    # for remaining fuel oil that couldn't be substituted
    r'\bF\.O\b\b': 'FUEL OIL',
    r'\bFO\b\b': 'FUEL OIL',
    # lubricant
    r'\bLUB\.\b': 'LUBRICANT',
    # lubricating oil
    r'\bL\.O\b\b': 'LUBRICATING OIL',
    r'\bLO\b\b': 'LUBRICATING OIL',
    # lubricating oil pressure
    r'\bLO_PRESS\b\b': 'LUBRICATING OIL PRESSURE',
    r'\bLO_PRESSURE\b\b': 'LUBRICATING OIL PRESSURE',
    # temperature
    r'\bL\.T\b\b': 'LOW TEMPERATURE',
    r'\bLT\b\b': 'LOW TEMPERATURE',
    r'\bH\.T\b\b': 'HIGH TEMPERATURE',
    r'\bHT\b\b': 'HIGH TEMPERATURE',
    # auxiliary boiler
    # replace these first before replacing AUXILIARY only
    r'\bAUX\.BOILER\b\b': 'AUXILIARY BOILER',
    r'\bAUX\. BOILER\b\b': 'AUXILIARY BOILER',
    r'\bAUX BLR\b\b': 'AUXILIARY BOILER',
    r'\bAUX\.\b': 'AUXILIARY',
    # composite boiler
    r'\bCOMP\. BOILER\b\b': 'COMPOSITE BOILER',
    r'\bCOMP BOILER\b\b': 'COMPOSITE BOILER',
    r'\bWIND\.\b': 'WINDING',
    r'\bWINDING\b\b': 'WINDING',
    r'\bC\.S\.W\b\b': 'CSW',
    r'\bCSW\b\b': 'CSW',
    r'\bVLOT\.\b': 'VOLTAGE',
    r'\bVOLTAGE\b\b': 'VOLTAGE',
    r'\bVOLT\.\b': 'VOLTAGE',
    r'\bFREQ\.\b': 'FREQUENCY',
    r'\bFREQUENCY\b\b': 'FREQUENCY',
    r'\bCURR\.\b': 'CURRENT',
    r'\bCURRENT\b\b': 'CURRENT',
    r'\bTCA\b\b': 'TURBOCHARGER',
    r'\bTCB\b\b': 'TURBOCHARGER',
    r'\bT/C\b': 'TURBOCHARGER',
    r'\bT_C\b': 'TURBOCHARGER',
    r'\bTC(?=\d|\W|$)\b': 'TURBOCHARGER',
    r'\bTURBOCHAGER\b\b': 'TURBOCHARGER',
    r'\bTURBOCHARGER\b\b': 'TURBOCHARGER',
    # misc spelling errors
    r'\bOPERATOIN\b': 'OPERATION',
    # additional standardizing replacement
    # replace # followed by a number with NO
    r'#(?=\d)\b': 'NO',
    r'\bNO\.(?=\d)\b': 'NO',
    # yes, there was one with two dots - what the hell?
    r'\bNO\.\.(?=\d)\b': 'NO',
    r'\bNo\.(?=\d)\b': 'NO',
}

# substitution mapping for units
# Abbreviations and their replacements
unit_replacement_dict = {
    r'\b%\b': 'PERCENT',
    r'\b-\b': '',
    r'\b- \b': '',
    # ensure no character after A
    r'\bA(?!\w|/)': 'CURRENT',
    r'\bAmp(?!\w|/)': 'CURRENT',
    r'\bHz\b': 'HERTZ',
    r'\bKG/CM2\b': 'PRESSURE',
    r'\bKG/H\b': 'KILOGRAM PER HOUR',
    r'\bKNm\b': 'RPM',
    r'\bKW\b': 'POWER',
    r'\bKg(?!\w|/)': 'MASS',
    r'\bKw\b': 'POWER',
    r'\bL(?!\w|/)': 'VOLUME',
    r'\bMT/h\b': 'METRIC TONNES PER HOUR',
    r'\bMpa\b': 'PRESSURE',
    r'\bPF\b': 'POWER FACTOR',
    r'\bRPM\b': 'RPM',
    r'\bV(?!\w|/)': 'VOLTAGE',
    r'\bbar(?!\w|/)': 'PRESSURE',
    r'\bbarA\b': 'SCAVENGE PRESSURE',
    r'\bcST\b': 'VISCOSITY',
    r'\bcSt\b': 'VISCOSITY',
    r'\bcst\b': 'VISCOSITY',
    r'\bdeg(?!\w|/|\.)': 'DEGREE',
    r'\bdeg.C\b': 'TEMPERATURE',
    r'\bdegC\b': 'TEMPERATURE',
    r'\bdegree\b': 'DEGREE',
    r'\bdegreeC\b': 'TEMPERATURE',
    r'\bhPa\b': 'PRESSURE',
    r'\bhours\b': 'HOURS',
    r'\bkN\b': 'THRUST',
    r'\bkNm\b': 'TORQUE',
    r'\bkW\b': 'POWER',
    # ensure that kg is not followed by anything
    r'\bkg(?!\w|/)': 'FLOW',  # somehow in the data its flow
    r'\bkg/P\b': 'MASS FLOW',
    r'\bkg/cm2\b': 'PRESSURE',
    r'\bkg/cm²\b': 'PRESSURE',
    r'\bkg/h\b': 'MASS FLOW',
    r'\bkg/hr\b': 'MASS FLOW',
    r'\bkg/pulse\b': '',
    r'\bkgf/cm2\b': 'PRESSURE',
    r'\bkgf/cm²\b': 'PRESSURE',
    r'\bkgf/㎠\b': 'PRESSURE',
    r'\bknots\b': 'SPEED',
    r'\bkw\b': 'POWER',
    r'\bl/Hr\b': 'VOLUME FLOW',
    r'\bl/h\b': 'VOLUME FLOW',
    r'\bl_Hr\b': 'VOLUME FLOW',
    r'\bl_hr\b': 'VOLUME FLOW',
    r'\bM\b': 'DRAFT',  # for wind draft
    r'm': 'm',  # wind draft and trim - not useful
    r'\bm/s\b': 'SPEED',
    r'\bm3\b': 'VOLUME',
    r'\bmH2O\b': 'DRAFT',
    r'\bmWC\b': 'DRAFT',
    r'\bmbar\b': 'PRESSURE',
    r'\bmg\b': 'ACCELERATION',
    r'\bmin-¹\b': '',  # data too varied
    r'\bmm\b': '',  # data too varied
    r'\bmmH2O\b': 'WATER DRUM LEVEL',
    r'\brev\b': 'RPM',
    r'\brpm\b': 'RPM',
    r'\bx1000min-¹\b': '',
    r'\b°C\b': 'TEMPERATURE',
    r'\bºC\b': 'TEMPERATURE',
    r'\b℃\b': 'TEMPERATURE'
}
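A rough illustration of how the description rules behave when applied in order with re.sub; the sample tag is invented, and the exact output depends on rule order and on the later cleanup passes in abbreviations_replacer.py:

import re
from replacement_dict import desc_replacement_dict

sample = "NO.1 G/E CYL TEMP"
for pattern, replacement in desc_replacement_dict.items():
    sample = re.sub(pattern, replacement, sample)
print(sample)  # expected along the lines of: NO1 GENERATOR_ENGINE CYLINDER TEMPERATURE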
@ -0,0 +1,58 @@
# %%
import pandas as pd
import re
import os

# Get the current working directory
current_path = os.getcwd()
print(current_path)

# %%

file_path = '../../data_import/exports/raw_data.csv'  # Adjust this path to your actual file location
old_df = pd.read_csv(file_path)
new_df = pd.read_csv('../exports/preprocessed_data.csv')

# %%
# compare changed rows
cond = old_df['tag_description'] != new_df['tag_description']

val1 = old_df[cond]['tag_description']
val2 = new_df[cond]['tag_description']

df = pd.DataFrame({
    'column1': val1,
    'column2': val2
})

df.to_csv('desc.csv')

# %%
# compare changed rows
cond = old_df['unit'] != new_df['unit']

val1 = old_df[cond]['unit']
val2 = new_df[cond]['unit']

df = pd.DataFrame({
    'column1': val1,
    'column2': val2
})

df.to_csv('unit.csv')

# %%
set(val2)


# %%
desc_set = list(set(df[df['MDM']]['tag_description']))
with open('output.txt', 'w') as file:
    print(desc_set, file=file)


# %%
test = 'kg/cm3'
print(re.sub(r'kg(?!\w|/)', 'flow', test))
# %%
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
@ -16,9 +16,6 @@ from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
-    Trainer,
-    EarlyStoppingCallback,
-    TrainingArguments
)
import evaluate
import numpy as np
@ -56,13 +53,15 @@ def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
        desc = f"<DESC>{row['tag_description']}<DESC>"
+        unit = f"<UNIT>{row['unit']}<UNIT>"
+
        pattern = row['pattern']
        try:
            index = mdm_list.index(pattern)
        except ValueError:
            index = -1
        element = {
-            'text' : f"{desc}",
+            'text' : f"{desc}{unit}",
            'label': index,
        }
        output_list.append(element)
@ -84,145 +83,146 @@ def create_dataset(fold, mdm_list):

# %%

# function to perform evaluation for a given fold
def test(fold):

    test_dataset = create_dataset(fold, mdm_list)

    # prepare tokenizer

    checkpoint_directory = f'../checkpoint_fold_{fold}'
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # we are guaranteed to save only 1 checkpoint from training
    pattern = 'checkpoint-*'
    model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    # %%
    # compute max token length
    max_length = 0
    for sample in test_dataset['text']:
        # Tokenize the sample and get the length
        input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
        length = len(input_ids)

        # Update max_length if this sample is longer
        if length > max_length:
            max_length = length

    print(max_length)

    # %%

    max_length = 64

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            # truncation=True,
            padding='max_length'
        )
        return model_inputs

    # map maps function to each "row" in the dataset
    # aka the data in the immediate nesting
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")

    # %%
    # compute metrics
    # metric = evaluate.load("accuracy")
    #
    #
    # def compute_metrics(eval_preds):
    #     preds, labels = eval_preds
    #     preds = np.argmax(preds, axis=1)
    #     return metric.compute(predictions=preds, references=labels)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(mdm_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    model = model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []

    BATCH_SIZE = 64
    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
        # Inference in batches
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # save labels too
        actual_labels.extend(batch['label'])

        # Move to GPU if available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids,
                attention_mask).logits
            predicted_class_ids = logits.argmax(dim=1).to("cpu")
            pred_labels.extend(predicted_class_ids)

    pred_labels = [tensor.item() for tensor in pred_labels]

    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
    y_true = actual_labels
    y_pred = pred_labels

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')

    # Print the results
    print(f'Accuracy: {accuracy:.5f}')
    print(f'F1 Score: {f1:.5f}')
    print(f'Precision: {precision:.5f}')
    print(f'Recall: {recall:.5f}')

# %%
for fold in [1,2,3,4,5]:
    test(fold)
@ -53,13 +53,15 @@ def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
        desc = f"<DESC>{row['tag_description']}<DESC>"
+        unit = f"<UNIT>{row['unit']}<UNIT>"
+
        pattern = row['pattern']
        try:
            index = mdm_list.index(pattern)
        except ValueError:
            index = -1
        element = {
-            'text' : f"{desc}",
+            'text' : f"{desc}{unit}",
            'label': index,
        }
        output_list.append(element)
@ -69,7 +71,7 @@ def process_df_to_dict(df, mdm_list):

def create_split_dataset(fold, mdm_list):
    # train
-    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)

    # valid
@ -86,126 +88,124 @@ def create_split_dataset(fold, mdm_list):

# %%

# function to perform training for a given fold
def train(fold):

    save_path = f'checkpoint_fold_{fold}'
    split_datasets = create_split_dataset(fold, mdm_list)

    # prepare tokenizer

    model_checkpoint = "distilbert/distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs

    # map maps function to each "row" in the dataset
    # aka the data in the immediate nesting
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    # %% temp
    # tokenized_datasets['train'].rename_columns()

    # %%
    # create data collator

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # %%
    # compute metrics
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)

    # %%
    # create id2label and label2id

    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(mdm_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    # model = torch.compile(model, backend="inductor", dynamic=True)

    # %%
    # Trainer

    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=80,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)

    trainer.train()

# execute training
for fold in [1,2,3,4,5]:
    print(fold)
    train(fold)

# %%
@ -52,11 +52,17 @@ class Inference():
        print("preparing dataloader")
        # convert each dataframe row into a dictionary
        # outputs a list of dictionaries

        def _process_df(df):
-            output_list = [{
-                'input': f"<DESC>{row['tag_description']}<DESC>",
-                'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
-            } for _, row in df.iterrows()]
+            output_list = []
+            for _, row in df.iterrows():
+                desc = f"<DESC>{row['tag_description']}<DESC>"
+                unit = f"<UNIT>{row['unit']}<UNIT>"
+                element = {
+                    'input' : f"{desc}{unit}",
+                    'output': f"<THING_START>{row['thing_pattern']}<THING_END><PROPERTY_START>{row['property_pattern']}<PROPERTY_END>",
+                }
+                output_list.append(element)

            return output_list
@ -1,6 +1,6 @@

-Accuracy for fold 1: 0.943208707998107
+Accuracy for fold 1: 0.9687647893989588
-Accuracy for fold 2: 0.9214953271028037
+Accuracy for fold 2: 0.9565420560747664
-Accuracy for fold 3: 0.9728915662650602
+Accuracy for fold 3: 0.9708835341365462
-Accuracy for fold 4: 0.967174119885823
+Accuracy for fold 4: 0.9881065651760228
-Accuracy for fold 5: 0.9097572148419606
+Accuracy for fold 5: 0.9225836005497022
@ -32,8 +32,9 @@ def process_df_to_dict(df):
    output_list = []
    for _, row in df.iterrows():
        desc = f"<DESC>{row['tag_description']}<DESC>"
+        unit = f"<UNIT>{row['unit']}<UNIT>"
        element = {
-            'input' : f"{desc}",
+            'input' : f"{desc}{unit}",
            'output': f"<THING_START>{row['thing_pattern']}<THING_END><PROPERTY_START>{row['property_pattern']}<PROPERTY_END>",
        }
        output_list.append(element)
@ -43,7 +44,7 @@ def process_df_to_dict(df):

def create_split_dataset(fold):
    # train
-    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)

    # valid
@ -150,11 +151,12 @@ def train(fold):

    args = Seq2SeqTrainingArguments(
        f"{save_path}",
-        eval_strategy="epoch",
+        # eval_strategy="epoch",
+        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
-        save_strategy="epoch",
+        # save_strategy="epoch",
-        load_best_model_at_end=True,
+        load_best_model_at_end=False,
        learning_rate=1e-3,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
@ -162,7 +164,7 @@ def train(fold):
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
-        num_train_epochs=20,
+        num_train_epochs=40,
        predict_with_generate=True,
        bf16=True,
        push_to_hub=False,