From 7699201cb84076df12df045dbdcb0f45dec82aef Mon Sep 17 00:00:00 2001 From: Richard Wong Date: Mon, 11 Nov 2024 20:20:43 +0900 Subject: [PATCH] Feat: implement selection for pattern-mapping Feat: added error analysis for BERT find-back Feat: added direct mapping with unit Feat: added BERT for classification using description only --- analysis/bert/.gitignore | 1 + analysis/bert/error_analysis.py | 252 +++++++++++ analysis/pattern_filling/.gitignore | 1 + analysis/pattern_filling/analysis.py | 55 +++ .../abbreviations/replacement_dict.py | 14 +- .../selection_with_pattern/.gitignore | 2 + post_process/selection_with_pattern/run.py | 261 +++++++++++ .../selection_with_pattern/run_diagnostic.py | 407 ++++++++++++++++++ post_process/selection_with_pattern/utils.py | 82 ++++ train/classification_bert_desc/.gitignore | 2 + .../classification_prediction/predict.py | 228 ++++++++++ train/classification_bert_desc/train.py | 209 +++++++++ train/mapping_with_unit/.gitignore | 2 + .../mapping_prediction/.gitignore | 2 + .../mapping_prediction/inference.py | 168 ++++++++ .../mapping_prediction/predict.py | 71 +++ train/mapping_with_unit/train.py | 197 +++++++++ 17 files changed, 1950 insertions(+), 4 deletions(-) create mode 100644 analysis/bert/error_analysis.py create mode 100644 analysis/pattern_filling/.gitignore create mode 100644 analysis/pattern_filling/analysis.py create mode 100644 post_process/selection_with_pattern/.gitignore create mode 100644 post_process/selection_with_pattern/run.py create mode 100644 post_process/selection_with_pattern/run_diagnostic.py create mode 100644 post_process/selection_with_pattern/utils.py create mode 100644 train/classification_bert_desc/.gitignore create mode 100644 train/classification_bert_desc/classification_prediction/predict.py create mode 100644 train/classification_bert_desc/train.py create mode 100644 train/mapping_with_unit/.gitignore create mode 100644 train/mapping_with_unit/mapping_prediction/.gitignore create mode 100644 train/mapping_with_unit/mapping_prediction/inference.py create mode 100644 train/mapping_with_unit/mapping_prediction/predict.py create mode 100644 train/mapping_with_unit/train.py diff --git a/analysis/bert/.gitignore b/analysis/bert/.gitignore index bee8a64..2fb7073 100644 --- a/analysis/bert/.gitignore +++ b/analysis/bert/.gitignore @@ -1 +1,2 @@ __pycache__ +exports diff --git a/analysis/bert/error_analysis.py b/analysis/bert/error_analysis.py new file mode 100644 index 0000000..e48defd --- /dev/null +++ b/analysis/bert/error_analysis.py @@ -0,0 +1,252 @@ +# %% +import pandas as pd +from utils import Retriever, cosine_similarity_chunked +import os +import glob +import numpy as np + +def analysis_for_fold(fold): + file_object = open(f'exports/output_{fold}.txt', 'w') + + # %% + data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv' + df = pd.read_csv(data_path, skipinitialspace=True) + + # %% + # subset to mdm + df = df[df['MDM']] + + thing_condition = df['p_thing'] == df['thing_pattern'] + error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']] + + property_condition = df['p_property'] == df['property_pattern'] + error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']] + + correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']] + + test_df = df + + # %% + print("Number of errors related to 'thing'", len(error_thing_df), file=file_object) + print("Number of errors related to 'property'", len(error_property_df), file=file_object) + + # %% + # thing_df.to_html('thing_errors.html') + # property_df.to_html('property_errors.html') + + ########################################## + # what we need now is understand why the model is making these mispredictions + # import train data and test data + # %% + class Embedder(): + input_df: pd.DataFrame + fold: int + + def __init__(self, input_df): + self.input_df = input_df + + + def make_embedding(self, checkpoint_path): + + def generate_input_list(df): + input_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = f"{desc}{unit}" + input_list.append(element) + return input_list + + # prepare reference embed + train_data = list(generate_input_list(self.input_df)) + # Define the directory and the pattern + retriever_train = Retriever(train_data, checkpoint_path) + retriever_train.make_embedding(batch_size=64) + return retriever_train.embeddings.to('cpu') + + + + # %% + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + checkpoint_directory = "../../train/classification_bert" + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + train_embedder = Embedder(input_df=train_df) + train_embeds = train_embedder.make_embedding(checkpoint_path) + + test_embedder = Embedder(input_df=test_df) + test_embeds = test_embedder.make_embedding(checkpoint_path) + + + + # %% + # test embeds are inputs since we are looking back at train data + cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy() + + # %% + # the following function takes in a full cos_sim_matrix + # condition_source: boolean selectors of the source embedding + # condition_target: boolean selectors of the target embedding + def find_closest(cos_sim_matrix, condition_source, condition_target): + # subset_matrix = cos_sim_matrix[condition_source] + # except we are subsetting 2D matrix (row, column) + subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)] + # we select top k here + # Get the indices of the top 5 maximum values along axis 1 + top_k = 3 + top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values + # note that top_k_indices is a nested list because of the 2d nature of the matrix + # the result is flipped + top_k_indices[0] = top_k_indices[0][::-1] + + # Get the values of the top 5 maximum scores + top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1) + + + return top_k_indices, top_k_values + + + #################################################### + # special find-back code + # %% + def find_back_element_with_print(select_idx): + condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0] + condition_target = np.ones(train_embeds.shape[0], dtype=bool) + + top_k_indices, top_k_values = find_closest( + cos_sim_matrix=cos_sim_matrix, + condition_source=condition_source, + condition_target=condition_target) + + training_data_pattern_list = train_df.iloc[top_k_indices[0]]['pattern'].to_list() + training_desc_list = train_df.iloc[top_k_indices[0]]['tag_description'].to_list() + + test_data_pattern_list = test_df[test_df.index == select_idx]['pattern'].to_list() + test_desc_list = test_df[test_df.index == select_idx]['tag_description'].to_list() + test_ship_id = test_df[test_df.index == select_idx]['ships_idx'].to_list()[0] + predicted_test_data = test_df[test_df.index == select_idx]['p_thing'] + ' ' + test_df[test_df.index == select_idx]['p_property'] + predicted_test_data = predicted_test_data.to_list()[0] + + print("*" * 20, file=file_object) + print("idx:", select_idx, file=file_object) + print("train desc", training_desc_list, file=file_object) + print("train thing+property", training_data_pattern_list, file=file_object) + print("test desc", test_desc_list, file=file_object) + print("test thing+property", test_data_pattern_list, file=file_object) + print("predicted thing+property", predicted_test_data, file=file_object) + print("ships idx", test_ship_id, file=file_object) + print("score:", top_k_values[0], file=file_object) + + test_pattern = test_data_pattern_list[0] + + find_back_list = [ test_pattern in pattern for pattern in training_data_pattern_list ] + + if sum(find_back_list) > 0: + return True + else: + return False + + + + # %% + # for error thing + print('\n', file=file_object) + print('*' * 80, file=file_object) + print('Error analysis for thing errors', file=file_object) + pattern_in_train = [] + for select_idx in error_thing_df.index: + result = find_back_element_with_print(select_idx) + print("status:", result, file=file_object) + pattern_in_train.append(result) + + + proportion_in_train = sum(pattern_in_train)/len(pattern_in_train) + + print('\n', file=file_object) + print('*' * 80, file=file_object) + print("Proportion of entries found in training data", proportion_in_train, file=file_object) + + # for error property + # %% + print('\n', file=file_object) + print('*' * 80, file=file_object) + print('Error analysis for property errors', file=file_object) + pattern_in_train = [] + for select_idx in error_property_df.index: + result = find_back_element_with_print(select_idx) + print("status:", result, file=file_object) + pattern_in_train.append(result) + + proportion_in_train = sum(pattern_in_train)/len(pattern_in_train) + + print('\n', file=file_object) + print('*' * 80, file=file_object) + print("Proportion of entries found in training data", proportion_in_train, file=file_object) + + #################################################### + + # %% + # make function to compute similarity of closest retrieved result + def compute_similarity(select_idx): + condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0] + condition_target = np.ones(train_embeds.shape[0], dtype=bool) + top_k_indices, top_k_values = find_closest( + cos_sim_matrix=cos_sim_matrix, + condition_source=condition_source, + condition_target=condition_target) + + return np.mean(top_k_values[0]) + + # %% + def print_summary(similarity_scores): + # Convert list to numpy array for additional stats + np_array = np.array(similarity_scores) + + # Get stats + mean_value = np.mean(np_array) + percentiles = np.percentile(np_array, [25, 50, 75]) # 25th, 50th, and 75th percentiles + + # Display numpy results + print("Mean:", mean_value, file=file_object) + print("25th, 50th, 75th Percentiles:", percentiles, file=file_object) + + + # %% + ########################################## + # Analyze the degree of similarity differences between correct and incorrect results + + print('\n', file=file_object) + print("*" * 80, file=file_object) + print("This section analyzes the similarity statistics for the error and correct groups", file=file_object) + # %% + # compute similarity scores for all values in error_thing_df + similarity_thing_scores = [] + for idx in error_thing_df.index: + similarity_thing_scores.append(compute_similarity(idx)) + print_summary(similarity_thing_scores) + + + # %% + similarity_property_scores = [] + for idx in error_property_df.index: + similarity_property_scores.append(compute_similarity(idx)) + print_summary(similarity_property_scores) + + # %% + similarity_correct_scores = [] + for idx in correct_df.index: + similarity_correct_scores.append(compute_similarity(idx)) + print_summary(similarity_correct_scores) + + file_object.close() + +for fold in [1,2,3,4,5]: + print(f"running for fold {fold}") + analysis_for_fold(fold) \ No newline at end of file diff --git a/analysis/pattern_filling/.gitignore b/analysis/pattern_filling/.gitignore new file mode 100644 index 0000000..0ef6d10 --- /dev/null +++ b/analysis/pattern_filling/.gitignore @@ -0,0 +1 @@ +output.csv \ No newline at end of file diff --git a/analysis/pattern_filling/analysis.py b/analysis/pattern_filling/analysis.py new file mode 100644 index 0000000..67dc62b --- /dev/null +++ b/analysis/pattern_filling/analysis.py @@ -0,0 +1,55 @@ +# we want to see if there are clear rules to filling numbers in the pattern +# format + +# %% +# %% +import pandas as pd +# from utils import Retriever, cosine_similarity_chunked +import os +import glob +import numpy as np + +# %% +fold = 5 +data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv' +test_df = pd.read_csv(data_path, skipinitialspace=True) + +data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" +train_df = pd.read_csv(data_path, skipinitialspace=True) + + + +# %% +data_path = '../../data_import/exports/data_mapping_mdm.csv' +# data_path = '../../data_preprocess/exports/preprocessed_data.csv' +df = pd.read_csv(data_path, skipinitialspace=True) +mdm_list = sorted(list((set(df['pattern'])))) + + + +# %% +symbol_pattern_list = [elem for elem in mdm_list if '#' in elem] + +# %% +symbol_pattern_list + +# %% +len(symbol_pattern_list) + +# %% +idx = 22 +print(symbol_pattern_list[idx]) +condition1 = df['pattern'] == symbol_pattern_list[idx] +subset_df = df[df['pattern'] == symbol_pattern_list[idx]] +ship = list(set(subset_df['ships_idx'])) +print(ship) + +# %% +subset_df[['thing', 'property', 'tag_name', 'tag_description', 'ships_idx']].to_csv('output.csv') +# %% +ship_idx = 10 +condition2 = df['ships_idx'] == ship_idx +subset_df = df[condition1 & condition2] +subset_df + +# %% diff --git a/data_preprocess/abbreviations/replacement_dict.py b/data_preprocess/abbreviations/replacement_dict.py index 1034678..18f03fa 100644 --- a/data_preprocess/abbreviations/replacement_dict.py +++ b/data_preprocess/abbreviations/replacement_dict.py @@ -37,6 +37,9 @@ desc_replacement_dict = { r'\bOUTL\.\b': 'OUTLET', r'\boutlet\b\b': 'OUTLET', r'\bOUTLET\b\b': 'OUTLET', + # bunker tank + r'\bBK\b': 'BUNKER', + r'\bTK\b': 'TANK', # pressure r'\bPRESS\b\b': 'PRESSURE', r'\bPRESS\.\b': 'PRESSURE', @@ -100,12 +103,13 @@ desc_replacement_dict = { r'\bHT\b\b': 'HIGH TEMPERATURE', # auxiliary boiler # replace these first before replacing AUXILIARY only - r'\bAUX\.BOILER\b\b': 'AUXILIARY BOILER', - r'\bAUX\. BOILER\b\b': 'AUXILIARY BOILER', - r'\bAUX BLR\b\b': 'AUXILIARY BOILER', - r'\bAUX\.\b': 'AUXILIARY', + r'\bAUX\.BOILER\b': 'AUXILIARY BOILER', + r'\bAUX\. BOILER\b': 'AUXILIARY BOILER', + r'\bAUX BLR\b': 'AUXILIARY BOILER', + r'\bAUX\.\b': 'AUXILIARY ', # composite boiler r'\bCOMP\. BOILER\b\b': 'COMPOSITE BOILER', + r'\bCOMP\.BOILER\b\b': 'COMPOSITE BOILER', r'\bCOMP BOILER\b\b': 'COMPOSITE BOILER', r'\bWIND\.\b': 'WINDING', r'\bWINDING\b\b': 'WINDING', @@ -127,6 +131,8 @@ desc_replacement_dict = { r'\bTURBOCHARGER\b\b': 'TURBOCHARGER', # misc spelling errors r'\bOPERATOIN\b': 'OPERATION', + # wrongly attached terms + r'BOILERMGO': 'BOILER MGO', # additional standardizing replacement # replace # followed by a number with NO r'#(?=\d)\b': 'NO', diff --git a/post_process/selection_with_pattern/.gitignore b/post_process/selection_with_pattern/.gitignore new file mode 100644 index 0000000..4774441 --- /dev/null +++ b/post_process/selection_with_pattern/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +exports \ No newline at end of file diff --git a/post_process/selection_with_pattern/run.py b/post_process/selection_with_pattern/run.py new file mode 100644 index 0000000..f42985e --- /dev/null +++ b/post_process/selection_with_pattern/run.py @@ -0,0 +1,261 @@ +# %% +import pandas as pd +import os +import glob +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix +import numpy as np +from utils import BertEmbedder, cosine_similarity_chunked + + +# %% +# directory for checkpoints +checkpoint_directory = '../../train/mapping_pattern' + +fold = 5 +# import test data +data_path = f"../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv" +df = pd.read_csv(data_path, skipinitialspace=True) + +# get target data +data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" +train_df = pd.read_csv(data_path, skipinitialspace=True) +# processing to help with selection later + +# %% +df['p_pattern'] = df['p_thing'] + " " + df['p_property'] + +# %% +# obtain the full mdm_list +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +full_mdm_pattern_list = sorted(list((set(full_df['pattern'])))) + +# %% +# we have to split into per-ship analysis +ships_list = sorted(list(set(df['ships_idx']))) +# %% +# for ship_idx in ships_list: +ship_idx = 1009 # choose an example ship +ship_df = df[df['ships_idx'] == ship_idx].reset_index(drop=True) + +class Embedder(): + input_df: pd.DataFrame + fold: int + + def __init__(self, input_df): + self.input_df = input_df + + + def make_embedding(self, checkpoint_path): + + def generate_input_list(df): + input_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + element = f"{desc}" + input_list.append(element) + return input_list + + # prepare reference embed + train_data = list(generate_input_list(self.input_df)) + # Define the directory and the pattern + embedder = BertEmbedder(train_data, checkpoint_path) + embedder.make_embedding(batch_size=64) + return embedder.embeddings.to('cpu') + + + +# %% +data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" +train_df = pd.read_csv(data_path, skipinitialspace=True) + +checkpoint_directory = "../../train/classification_bert" +directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') +# Use glob to find matching paths +# path is usually checkpoint_fold_1/checkpoint- +# we are guaranteed to save only 1 checkpoint from training +pattern = 'checkpoint-*' +checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + +train_embedder = Embedder(input_df=train_df) +train_embeds = train_embedder.make_embedding(checkpoint_path) + +test_embedder = Embedder(input_df=ship_df) +test_embeds = test_embedder.make_embedding(checkpoint_path) + + + +# %% +# test embeds are inputs since we are looking back at train data +cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy() + + + +# The general idea: +# step 1: keep only pattern generations that belong to mdm list +# -> this removes totally wrong datasets that mapped to totally wrong things +# step 2: loop through the mdm list and isolate data in both train and test that +# belong to the same pattern class +# -> this is more tricky, because we have non-mdm mapping to correct classes +# -> so we have to find which candidate is most similar to the training data + +# it is very tricky to keep track of classification across multiple stages so we +# will use a boolean answer list + +# %% +answer_list = np.ones(len(ship_df), dtype=bool) + +########################################## +# %% +# STEP 1 +# we want to loop through the the ship_df and find which ones match our full_mdm_list +pattern_match_mask = ship_df['p_pattern'].apply(lambda x: x in full_mdm_pattern_list).to_numpy() +# we assign only those that are False to our answer list +# right now the 2 arrays are basically equal +answer_list[~pattern_match_mask] = False + +# %% TEMP +print('proportion belonging to mdm classes', sum(pattern_match_mask)/len(pattern_match_mask)) + +# %% TEMP +y_true = ship_df['MDM'].to_list() +y_pred = pattern_match_mask + +# Compute metrics +accuracy = accuracy_score(y_true, y_pred) +print(f'Accuracy: {accuracy:.5f}') + +# we can see that the accuracy is not good +# %% +######################################### +# STEP 2 +# we want to go through each mdm class label +# but we do not want to make subsets of dataframes +# we will make heavy use of boolean masks + +# we want to identify per-ship mdm classes +ship_mdm_classes = sorted(set(ship_df['p_pattern'][pattern_match_mask].to_list())) + +# %% +len(ship_mdm_classes) + +# %% +for idx,select_class in enumerate(ship_mdm_classes): + print(idx, len(ship_df[ship_df['p_pattern'] == select_class])) + +# %% +select_class = ship_mdm_classes[22] +sample_df = ship_df[ship_df['p_pattern'] == select_class] + +# %% +# we need to set all idx of chosen entries as False in answer_list +selected_idx_list = sample_df.index.to_list() +answer_list[selected_idx_list] = False + +# %% +# because we have variants of a tag_description, we cannot choose 1 from the +# given candidates we have to first group the candidates, and then choose which +# group is most similar + +# %% +from fuzzywuzzy import fuzz + +# the purpose of this function is to group the strings that are similar to each other +# we need to form related groups of inputs +def group_similar_strings(obj_list, threshold=80): + groups = [] + processed_strings = set() # To keep track of already grouped strings + + for obj in obj_list: + # tuple is (idx, string) + if obj in processed_strings: + continue + + # Find all strings similar to the current string above the threshold + similar_strings = [s for s in obj_list if s[1] != obj[1] and fuzz.ratio(obj[1], s[1]) >= threshold] + + # Add the original string to the similar group + similar_group = [obj] + similar_strings + + # Mark all similar strings as processed + processed_strings.update(similar_group) + + # Add the group to the list of groups + groups.append(similar_group) + + return groups + +# Example usage +string_list = sample_df['tag_description'].to_list() +index_list = sample_df.index.to_list() +obj_list = list(zip(index_list, string_list)) +groups = group_similar_strings(obj_list, threshold=90) +print(groups) + +# %% +# this function takes in groups of related terms and create candidate entries +def make_candidates(groups): + candidates = [] + for group in groups: + first_tuple = group[0] + # string_of_tuple = first_tuple[1] + id_of_tuple = first_tuple[0] + candidates.append(id_of_tuple) + return candidates + +# %% +test_candidates = make_candidates(groups) +test_candidates_mask = np.zeros(len(ship_df), dtype=bool) +test_candidates_mask[test_candidates] = True + +# %% +train_candidates_mask = (train_df['pattern'] == select_class).to_numpy() + +# %% +# we need to make the cos_sim_matrix +# for that, we need to generate the embeddings of the ship_df (test embedding) +# and the train_df (train embeddin) + +# we then use the selection function using the given mask to choose the most +# appropriate candidate + +# the selection function takes in the full cos_sim_matrix then subsets the +# matrix according to the test_candidates_mask and train_candidates_mask that we +# give it +# it returns the most likely source candidate index and score among the source +# candidate list - aka it returns a local idx +def selection(cos_sim_matrix, source_mask, target_mask): + # subset_matrix = cos_sim_matrix[condition_source] + # except we are subsetting 2D matrix (row, column) + subset_matrix = cos_sim_matrix[np.ix_(source_mask, target_mask)] + # we select top-k here + # Get the indices of the top-k maximum values along axis 1 + top_k = 1 + top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values + + # Get the values of the top 5 maximum scores + top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1) + + # Calculate the average of the top-k scores along axis 1 + y_scores = np.mean(top_k_values, axis=1) + max_idx = np.argmax(y_scores) + max_score = y_scores[max_idx] + + return max_idx, max_score + + +# %% +max_idx, max_score = selection(cos_sim_matrix, test_candidates_mask, train_candidates_mask) + +# %% +# after obtaining best group, we set all candidates of the group as True +chosen_group = groups[max_idx] +chosen_idx = [tuple[0] for tuple in chosen_group] + +# %% +# before doing this, we have to use the max_score and evaluate if its close enough +THRESHOLD = 0.8 +if max_score > THRESHOLD: + answer_list[chosen_idx] = True + +# %% diff --git a/post_process/selection_with_pattern/run_diagnostic.py b/post_process/selection_with_pattern/run_diagnostic.py new file mode 100644 index 0000000..acaa6cd --- /dev/null +++ b/post_process/selection_with_pattern/run_diagnostic.py @@ -0,0 +1,407 @@ +# %% +import pandas as pd +import os +import glob +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix +import numpy as np +from utils import BertEmbedder, cosine_similarity_chunked +from fuzzywuzzy import fuzz + +################## +# global parameters +DIAGNOSTIC = True +THRESHOLD = 0.90 +FUZZY_SIM_THRESHOLD=90 +checkpoint_directory = "../../train/classification_bert" + +################### +# %% +# helper functions +class Embedder(): + input_df: pd.DataFrame + fold: int + + def __init__(self, input_df): + self.input_df = input_df + + + def make_embedding(self, checkpoint_path): + + def generate_input_list(df): + input_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + element = f"{desc}" + input_list.append(element) + return input_list + + # prepare reference embed + train_data = list(generate_input_list(self.input_df)) + # Define the directory and the pattern + embedder = BertEmbedder(train_data, checkpoint_path) + embedder.make_embedding(batch_size=64) + return embedder.embeddings.to('cpu') + + + +# the purpose of this function is to group the strings that are similar to each other +# we need to form related groups of inputs +def group_similar_strings(obj_list, threshold=80): + groups = [] + processed_strings = set() # To keep track of already grouped strings + + for obj in obj_list: + # tuple is (idx, string) + if obj in processed_strings: + continue + # Find all strings similar to the current string above the threshold + similar_strings = [s for s in obj_list if s[1] != obj[1] and fuzz.ratio(obj[1], s[1]) >= threshold] + # Add the original string to the similar group + similar_group = [obj] + similar_strings + # Mark all similar strings as processed + processed_strings.update(similar_group) + # Add the group to the list of groups + groups.append(similar_group) + return groups + +# this function takes in groups of related terms and create candidate entries +def make_candidates(groups): + candidates = [] + for group in groups: + first_tuple = group[0] + # string_of_tuple = first_tuple[1] + id_of_tuple = first_tuple[0] + candidates.append(id_of_tuple) + return candidates + + +# the selection function takes in the full cos_sim_matrix then subsets the +# matrix according to the test_candidates_mask and train_candidates_mask that we +# give it +# it returns the most likely source candidate index and score among the source +# candidate list - aka it returns a local idx +def selection(cos_sim_matrix, source_mask, target_mask, file_object=None): + # subset_matrix = cos_sim_matrix[condition_source] + # except we are subsetting 2D matrix (row, column) + subset_matrix = cos_sim_matrix[np.ix_(source_mask, target_mask)] + # we select top-k here + # Get the indices of the top-k maximum values along axis 1 + top_k = 1 + top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values + + # Get the values of the top 5 maximum scores + top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1) + + # Calculate the average of the top-k scores along axis 1 + y_scores = np.mean(top_k_values, axis=1) + max_idx = np.argmax(y_scores) + max_score = y_scores[max_idx] + + if DIAGNOSTIC and (file_object is not None): + print('all scores:', file=file_object) + print(y_scores, file=file_object) + + return max_idx, max_score + + + +#################### +# global level +# %% +# obtain the full mdm_list +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +full_mdm_pattern_list = sorted(list((set(full_df['pattern'])))) + + +##################### +# fold level + +def run_selection(fold): + + file_object = open(f'exports/output_{fold}.txt', 'w') + # set the fold + # import test data + data_path = f"../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + df['p_pattern'] = df['p_thing'] + " " + df['p_property'] + + # get target data + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # generate your embeddings + # checkpoint_directory defined at global level + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + # we can generate the train embeddings once and re-use for every ship + train_embedder = Embedder(input_df=train_df) + train_embeds = train_embedder.make_embedding(checkpoint_path) + + + # create global_answer array + # the purpose of this array is to track the classification state at the global + # level + global_answer = np.zeros(len(df), dtype=bool) + + global_sim = np.zeros(len(df)) + + ############################# + # ship level + # we have to split into per-ship analysis + ships_list = sorted(list(set(df['ships_idx']))) + print(ships_list) + for ship_idx in ships_list: + # ship_idx = 1001 # choose an example ship + + print(ship_idx, file=file_object) # print selected ship + ship_df = df[df['ships_idx'] == ship_idx] + # required to map local ship_answer array to global_answer array + map_local_index_to_global_index = ship_df.index.to_numpy() + ship_df = df[df['ships_idx'] == ship_idx].reset_index(drop=True) + + # generate new embeddings for each ship + test_embedder = Embedder(input_df=ship_df) + test_embeds = test_embedder.make_embedding(checkpoint_path) + + # generate the cosine sim matrix + cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy() + + ############################## + # selection level + # The general idea: + # step 1: keep only pattern generations that belong to mdm list + # -> this removes totally wrong datasets that mapped to totally wrong things + # step 2: loop through the mdm list and isolate data in both train and test that + # belong to the same pattern class + # -> this is more tricky, because we have non-mdm mapping to correct classes + # -> so we have to find which candidate is most similar to the training data + + # it is very tricky to keep track of classification across multiple stages so we + # will use a boolean answer list + + # initialize the local answer list + ship_answer_list = np.ones(len(ship_df), dtype=bool) + + ship_answer_sim = np.ones(len(ship_df)) + + ########### + # STEP 1 + # we want to loop through the generated class labels and find which ones match + # our pattern list + + pattern_match_mask = ship_df['p_pattern'].apply(lambda x: x in full_mdm_pattern_list).to_numpy() + # we assign only those that are False to our answer list + # right now the 2 arrays are basically equal + ship_answer_list[~pattern_match_mask] = False + + ########### + # STEP 2 + # we now go through each class found in our generated set + + # we want to identify per-ship mdm classes + ship_predicted_classes = sorted(set(ship_df['p_pattern'][pattern_match_mask].to_list())) + + # this function performs the selection given a class + # it takes in the cos_sim_matrix + # it returns the selection by mutating the answer_list + # it sets all relevant idxs to False initially, then sets the selected values to True + def selection_for_class(select_class, cos_sim_matrix, answer_list, score_list): + + # separate the global variable from function variable + answer_list = answer_list.copy() + score_list = score_list.copy() + sample_df = ship_df[ship_df['p_pattern'] == select_class] + + # we need to set all idx of chosen entries as False in answer_list + selected_idx_list = sample_df.index.to_list() + answer_list[selected_idx_list] = False + + # basic assumption check + + # group related inputs by description similarity + string_list = sample_df['tag_description'].to_list() + index_list = sample_df.index.to_list() + obj_list = list(zip(index_list, string_list)) + # groups is a list of list, where each list is composed of a + # (idx, string) tuple + groups = group_similar_strings(obj_list, threshold=FUZZY_SIM_THRESHOLD) + + if DIAGNOSTIC: + print('*' * 10, file=file_object) + print(select_class, file=file_object) + print('candidate groups', file=file_object) + print(groups, file=file_object) + + # generate the masking arrays for both test and train embeddings + # we select a tuple from each group, and use that as a candidate for selection + test_candidates = make_candidates(groups) + test_candidates_mask = np.zeros(len(ship_df), dtype=bool) + test_candidates_mask[test_candidates] = True + # we make candidates to compare against in the data sharing the same class + train_candidates_mask = (train_df['pattern'] == select_class).to_numpy() + + # perform selection + # it returns the group index that is most likely + max_idx, max_score = selection(cos_sim_matrix, test_candidates_mask, train_candidates_mask, file_object) + + # consolidate all idx's in the same group + chosen_group = groups[max_idx] + chosen_idx_list = [tuple[0] for tuple in chosen_group] + + if DIAGNOSTIC: + print('chosen group', file=file_object) + print(chosen_group, file=file_object) + + # before doing this, we have to use the max_score and evaluate if its close enough + if max_score > THRESHOLD: + answer_list[chosen_idx_list] = True + if DIAGNOSTIC: + print('max score', file=file_object) + print(max_score, file=file_object) + print('accepted', file=file_object) + else: + if DIAGNOSTIC: + print('max score', file=file_object) + print(max_score, file=file_object) + print('rejected', file=file_object) + + + + + # for analysis + test_candidates_mask = np.ones(len(ship_df), dtype=bool) + _, every_score = selection(cos_sim_matrix, test_candidates_mask, train_candidates_mask, None) + + # write the score for every idx of this class + score_list[selected_idx_list] = every_score + + return answer_list, score_list + + + # we choose one mdm class + for select_class in ship_predicted_classes: + ship_answer_list, ship_answer_sim = selection_for_class(select_class, cos_sim_matrix, ship_answer_list, ship_answer_sim) + + # we want to write back to global_answer + # first we convert local indices to global indices + local_indices = np.where(ship_answer_list)[0] + global_indices = map_local_index_to_global_index[local_indices] + global_answer[global_indices] = True + + # similarity score + global_sim[map_local_index_to_global_index] = ship_answer_sim + + if DIAGNOSTIC: + # evaluation at per-ship level + y_true = ship_df['MDM'].to_list() + y_pred = ship_answer_list + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred, average='macro') + precision = precision_score(y_true, y_pred, average='macro') + recall = recall_score(y_true, y_pred, average='macro') + + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=file_object) + print(f'F1 Score: {f1:.5f}', file=file_object) + print(f'Precision: {precision:.5f}', file=file_object) + print(f'Recall: {recall:.5f}', file=file_object) + + + + y_true = df['MDM'].to_list() + y_pred = global_answer + + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred, average='macro') + precision = precision_score(y_true, y_pred, average='macro') + recall = recall_score(y_true, y_pred, average='macro') + + # Print the results + print(f'Accuracy: {accuracy:.5f}') + print(f'F1 Score: {f1:.5f}') + print(f'Precision: {precision:.5f}') + print(f'Recall: {recall:.5f}') + + + file_object.close() + + return global_answer, global_sim + +# %% +for fold in [1]: + print(f'Perform selection for fold {fold}') + global_answer, global_sim = run_selection(fold) + +# %% +data_path = f"../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv" +df = pd.read_csv(data_path, skipinitialspace=True) +df['p_pattern'] = df['p_thing'] + " " + df['p_property'] +df['score'] = global_sim + +# %% +# %% +def print_summary(similarity_scores): + # Convert list to numpy array for additional stats + np_array = np.array(similarity_scores) + + # Get stats + mean_value = np.mean(np_array) + percentiles = np.percentile(np_array, [25, 50, 75]) # 25th, 50th, and 75th percentiles + + # Display numpy results + print("Mean:", mean_value) + print("25th, 50th, 75th Percentiles:", percentiles) + +# %% +# analysis of non-mdm in predicted +df_selected = df[global_answer] +df_selected[~df_selected['MDM']] + +in_scores = df_selected[df_selected['MDM']]['score'].to_numpy() +print_summary(in_scores) + +# %% +# analysis of mdm in non-predicted +df_selected = df[~global_answer] +# df_selected +df_selected[df_selected['MDM']] + + +# %% +out_scores = df_selected[df_selected['MDM']]['score'].to_numpy() + +print_summary(out_scores) + + +# %% +import matplotlib.pyplot as plt +# Sample data +list1 = in_scores +list2 = out_scores + +# Plot histograms +bins = 20 +plt.hist(list1, bins=bins, alpha=0.5, label='List 1', density=False) +plt.hist(list2, bins=bins, alpha=0.5, label='List 2', density=False) + +# Labels and legend +plt.xlabel('Value') +plt.ylabel('Frequency') +plt.legend(loc='upper right') +plt.title('Histograms of Three Lists') + +# Show plot +plt.show() + + +# %% diff --git a/post_process/selection_with_pattern/utils.py b/post_process/selection_with_pattern/utils.py new file mode 100644 index 0000000..3205655 --- /dev/null +++ b/post_process/selection_with_pattern/utils.py @@ -0,0 +1,82 @@ +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import torch.nn.functional as F + + + +class BertEmbedder: + def __init__(self, input_texts, model_checkpoint): + # we need to generate the embedding from list of input strings + self.embeddings = [] + self.inputs = input_texts + model_checkpoint = model_checkpoint + self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # device = "cpu" + model.to(self.device) + self.model = model.eval() + + + def make_embedding(self, batch_size=64): + all_embeddings = self.embeddings + input_texts = self.inputs + + for i in range(0, len(input_texts), batch_size): + batch_texts = input_texts[i:i+batch_size] + # Tokenize the input text + inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64) + input_ids = inputs.input_ids.to(self.device) + attention_mask = inputs.attention_mask.to(self.device) + + + # Pass the input through the encoder and retrieve the embeddings + with torch.no_grad(): + encoder_outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True) + # get last layer + embeddings = encoder_outputs.hidden_states[-1] + # get cls token embedding + cls_embeddings = embeddings[:, 0, :] # Shape: (batch_size, hidden_size) + all_embeddings.append(cls_embeddings) + + # remove the batch list and makes a single large tensor, dim=0 increases row-wise + all_embeddings = torch.cat(all_embeddings, dim=0) + + self.embeddings = all_embeddings + +def cosine_similarity_chunked(batch1, batch2, chunk_size=1024): + device = 'cuda' + batch1_size = batch1.size(0) + batch2_size = batch2.size(0) + batch2.to(device) + + # Prepare an empty tensor to store results + cos_sim = torch.empty(batch1_size, batch2_size, device=device) + + # Process batch1 in chunks + for i in range(0, batch1_size, chunk_size): + batch1_chunk = batch1[i:i + chunk_size] # Get chunk of batch1 + + batch1_chunk.to(device) + # Expand batch1 chunk and entire batch2 for comparison + # batch1_chunk_exp = batch1_chunk.unsqueeze(1) # Shape: (chunk_size, 1, seq_len) + # batch2_exp = batch2.unsqueeze(0) # Shape: (1, batch2_size, seq_len) + batch2_norms = batch2.norm(dim=1, keepdim=True) + + + # Compute cosine similarity for the chunk and store it in the final tensor + # cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1) + + # Compute cosine similarity by matrix multiplication and normalizing + sim_chunk = torch.mm(batch1_chunk, batch2.T) / (batch1_chunk.norm(dim=1, keepdim=True) * batch2_norms.T + 1e-8) + + # Store the results in the appropriate part of the final tensor + cos_sim[i:i + chunk_size] = sim_chunk + + return cos_sim + diff --git a/train/classification_bert_desc/.gitignore b/train/classification_bert_desc/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/train/classification_bert_desc/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/train/classification_bert_desc/classification_prediction/predict.py b/train/classification_bert_desc/classification_prediction/predict.py new file mode 100644 index 0000000..167b7fd --- /dev/null +++ b/train/classification_bert_desc/classification_prediction/predict.py @@ -0,0 +1,228 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +data_path = '../../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +mdm_list = sorted(list((set(full_df['pattern'])))) + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(mdm_list): + id2label[idx] = val + label2id[val] = idx + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + + pattern = row['pattern'] + try: + index = mdm_list.index(pattern) + except ValueError: + index = -1 + element = { + 'text' : f"{desc}{unit}", + 'label': index, + } + output_list.append(element) + + return output_list + + +def create_dataset(fold, mdm_list): + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + test_df = pd.read_csv(data_path, skipinitialspace=True) + # we only use the mdm subset + test_df = test_df[test_df['MDM']].reset_index(drop=True) + + test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list)) + + return test_dataset + + +# %% + +# function to perform training for a given fold +def test(fold): + + test_dataset = create_dataset(fold, mdm_list) + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint_fold_{fold}' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + # %% + # compute max token length + max_length = 0 + for sample in test_dataset['text']: + # Tokenize the sample and get the length + input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"] + length = len(input_ids) + + # Update max_length if this sample is longer + if length > max_length: + max_length = length + + print(max_length) + + # %% + + max_length = 64 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + # truncation=True, + padding='max_length' + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length") + + # %% + # compute metrics + # metric = evaluate.load("accuracy") + # + # + # def compute_metrics(eval_preds): + # preds, labels = eval_preds + # preds = np.argmax(preds, axis=1) + # return metric.compute(predictions=preds, references=labels) + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(mdm_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + + BATCH_SIZE = 64 + dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False) + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['label']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred, average='macro') + precision = precision_score(y_true, y_pred, average='macro') + recall = recall_score(y_true, y_pred, average='macro') + + # Print the results + print(f'Accuracy: {accuracy:.5f}') + print(f'F1 Score: {f1:.5f}') + print(f'Precision: {precision:.5f}') + print(f'Recall: {recall:.5f}') + + +# %% +for fold in [1,2,3,4,5]: + test(fold) diff --git a/train/classification_bert_desc/train.py b/train/classification_bert_desc/train.py new file mode 100644 index 0000000..560bd0b --- /dev/null +++ b/train/classification_bert_desc/train.py @@ -0,0 +1,209 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +mdm_list = sorted(list((set(full_df['pattern'])))) + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(mdm_list): + id2label[idx] = val + label2id[val] = idx + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + pattern = row['pattern'] + try: + index = mdm_list.index(pattern) + except ValueError: + index = -1 + element = { + 'text' : f"{desc}", + 'label': index, + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold, mdm_list): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)), + }) + return combined_data + + +# %% + +# function to perform training for a given fold +def train(fold): + + save_path = f'checkpoint_fold_{fold}' + split_datasets = create_split_dataset(fold, mdm_list) + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + truncation=True, + padding=True + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(mdm_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-5, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=80, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + + +# %% diff --git a/train/mapping_with_unit/.gitignore b/train/mapping_with_unit/.gitignore new file mode 100644 index 0000000..2e7f3f7 --- /dev/null +++ b/train/mapping_with_unit/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log/ diff --git a/train/mapping_with_unit/mapping_prediction/.gitignore b/train/mapping_with_unit/mapping_prediction/.gitignore new file mode 100644 index 0000000..e9ebfc9 --- /dev/null +++ b/train/mapping_with_unit/mapping_prediction/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +exports/ diff --git a/train/mapping_with_unit/mapping_prediction/inference.py b/train/mapping_with_unit/mapping_prediction/inference.py new file mode 100644 index 0000000..4e2b72f --- /dev/null +++ b/train/mapping_with_unit/mapping_prediction/inference.py @@ -0,0 +1,168 @@ +import torch +from torch.utils.data import DataLoader +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, +) +import os +from tqdm import tqdm +from datasets import Dataset +import numpy as np + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +class Inference(): + tokenizer: T5TokenizerFast + model: torch.nn.Module + dataloader: DataLoader + + def __init__(self, checkpoint_path): + self._create_tokenizer() + self._load_model(checkpoint_path) + + + def _create_tokenizer(self): + # %% + # load tokenizer + self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"] + # Add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + def _load_model(self, checkpoint_path: str): + # load model + # Define the directory and the pattern + model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) + model = torch.compile(model) + # set model to eval + self.model = model.eval() + + + + + def prepare_dataloader(self, input_df, batch_size, max_length): + """ + *arguments* + - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' + - batch_size: the batch size of dataloader output + - max_length: length of tokenizer output + """ + print("preparing dataloader") + # convert each dataframe row into a dictionary + # outputs a list of dictionaries + + def _process_df(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + padding='max_length', + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, + num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def generate(self): + device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu') + MAX_GENERATE_LENGTH = 128 + + pred_generations = [] + pred_labels = [] + + print("start generation") + for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + self.model.to(device) + + # Perform inference + with torch.no_grad(): + outputs = self.model.generate(input_ids, + attention_mask=attention_mask, + max_length=MAX_GENERATE_LENGTH) + + # Decode the output and print the results + pred_generations.extend(outputs.to("cpu")) + + + + # %% + # extract sequence and decode + def extract_seq(tokens, start_value, end_value): + if start_value not in tokens or end_value not in tokens: + return None # Or handle this case according to your requirements + start_id = np.where(tokens == start_value)[0][0] + end_id = np.where(tokens == end_value)[0][0] + + return tokens[start_id+1:end_id] + + + def process_tensor_output(tokens): + thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = , 32101 = + property_seq = extract_seq(tokens, 32102, 32103) # 32102 = , 32103 = + p_thing = None + p_property = None + if (thing_seq is not None): + p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False) + if (property_seq is not None): + p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False) + return p_thing, p_property + + # decode prediction labels + def decode_preds(tokens_list): + thing_prediction_list = [] + property_prediction_list = [] + for tokens in tokens_list: + p_thing, p_property = process_tensor_output(tokens) + thing_prediction_list.append(p_thing) + property_prediction_list.append(p_property) + return thing_prediction_list, property_prediction_list + + thing_prediction_list, property_prediction_list = decode_preds(pred_generations) + return thing_prediction_list, property_prediction_list + diff --git a/train/mapping_with_unit/mapping_prediction/predict.py b/train/mapping_with_unit/mapping_prediction/predict.py new file mode 100644 index 0000000..0b16ac2 --- /dev/null +++ b/train/mapping_with_unit/mapping_prediction/predict.py @@ -0,0 +1,71 @@ + +import pandas as pd +import os +import glob +from inference import Inference + +checkpoint_directory = '../' + +def infer_and_select(fold): + print(f"Inference for fold {fold}") + # import test data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + + # get target data + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + # processing to help with selection later + train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] + + + ########################################## + # run inference + # checkpoint + # Use glob to find matching paths + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + + infer = Inference(checkpoint_path) + infer.prepare_dataloader(df, batch_size=256, max_length=128) + thing_prediction_list, property_prediction_list = infer.generate() + + # add labels too + # thing_actual_list, property_actual_list = decode_preds(pred_labels) + # Convert the list to a Pandas DataFrame + df_out = pd.DataFrame({ + 'p_thing': thing_prediction_list, + 'p_property': property_prediction_list + }) + # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing'] + # df_out['p_property_correct'] = df_out['p_property'] == df_out['property'] + df = pd.concat([df, df_out], axis=1) + + # we can save the t5 generation output here + df.to_csv(f"exports/result_group_{fold}.csv", index=False) + + # here we want to evaluate mapping accuracy within the valid in mdm data only + in_mdm = df['MDM'] + condition_correct_thing = df['p_thing'] == df['thing'] + condition_correct_property = df['p_property'] == df['property'] + prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm) + pred_correct_proportion = prediction_mdm_correct/sum(in_mdm) + + # write output to file output.txt + with open("output.txt", "a") as f: + print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f) + +########################################### +# Execute for all folds + +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + infer_and_select(fold) diff --git a/train/mapping_with_unit/train.py b/train/mapping_with_unit/train.py new file mode 100644 index 0000000..ba77309 --- /dev/null +++ b/train/mapping_with_unit/train.py @@ -0,0 +1,197 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, + DataCollatorForSeq2Seq, + Seq2SeqTrainer, + EarlyStoppingCallback, + Seq2SeqTrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# outputs a list of dictionaries +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# function to perform training for a given fold +def train(fold): + save_path = f'checkpoint_fold_{fold}' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + + model_checkpoint = "t5-small" + tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + text_target=target, + max_length=max_length, + truncation=True, + padding=True + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns=split_datasets["train"].column_names, + ) + + # https://github.com/huggingface/transformers/pull/28414 + # model_checkpoint = "google/t5-efficient-tiny" + # device_map set to auto to force it to load contiguous weights + # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') + + model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) + metric = evaluate.load("sacrebleu") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + decoded_preds = tokenizer.batch_decode(preds, + skip_special_tokens=False) + + # Replace -100s in the labels as we can't decode them + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=False) + + # Remove tokens from decoded predictions and labels + decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds] + decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels] + + # Some simple post-processing + # decoded_preds = [pred.strip() for pred in decoded_preds] + # decoded_labels = [[label.strip()] for label in decoded_labels] + # print(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": result["score"]} + + + # Generation Config + # from transformers import GenerationConfig + gen_config = model.generation_config + gen_config.max_length = 64 + + # compile + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # Trainer + + args = Seq2SeqTrainingArguments( + f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-3, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + predict_with_generate=True, + bf16=True, + push_to_hub=False, + generation_config=gen_config, + remove_unused_columns=False, + ) + + + trainer = Seq2SeqTrainer( + model, + args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) +