From 737c86bc2e204de227febff681d5ca10503b7c4f Mon Sep 17 00:00:00 2001 From: Richard Wong Date: Thu, 28 Nov 2024 11:02:22 +0900 Subject: [PATCH] Feat: added de_duplication post-processing method --- .../classification_prediction/.gitignore | 2 + ...ned_mapping_and_classification_analysis.py | 81 +++++ .../classification_prediction/output.txt | 60 ++-- .../classification_prediction/predict.py | 29 +- post_process/binary_classifier/train.py | 10 +- post_process/de_duplication/.gitignore | 2 + post_process/de_duplication/run.py | 313 ++++++++++++++++++ post_process/de_duplication/utils.py | 132 ++++++++ post_process/selection/.gitignore | 3 +- post_process/selection/output.txt | 41 +++ post_process/selection/run.py | 299 +++++++++++++++++ post_process/selection/utils.py | 57 +++- post_process/selection_old/.gitignore | 2 + .../{selection => selection_old}/predict.py | 7 +- .../{selection => selection_old}/selection.py | 18 +- post_process/selection_old/utils.py | 87 +++++ post_process/similarity_classifier/.gitignore | 2 + post_process/similarity_classifier/output.txt | 50 +-- post_process/similarity_classifier/run.py | 44 ++- .../classification_prediction/output.txt | 40 +-- train/train.bash | 16 +- 21 files changed, 1182 insertions(+), 113 deletions(-) create mode 100644 post_process/binary_classifier/classification_prediction/.gitignore create mode 100644 post_process/binary_classifier/classification_prediction/combined_mapping_and_classification_analysis.py create mode 100644 post_process/de_duplication/.gitignore create mode 100644 post_process/de_duplication/run.py create mode 100644 post_process/de_duplication/utils.py create mode 100644 post_process/selection/output.txt create mode 100644 post_process/selection/run.py create mode 100644 post_process/selection_old/.gitignore rename post_process/{selection => selection_old}/predict.py (94%) rename post_process/{selection => selection_old}/selection.py (95%) create mode 100644 post_process/selection_old/utils.py 
diff --git a/post_process/binary_classifier/classification_prediction/.gitignore b/post_process/binary_classifier/classification_prediction/.gitignore new file mode 100644 index 0000000..4d615d0 --- /dev/null +++ b/post_process/binary_classifier/classification_prediction/.gitignore @@ -0,0 +1,2 @@ +exports +output.txt \ No newline at end of file diff --git a/post_process/binary_classifier/classification_prediction/combined_mapping_and_classification_analysis.py b/post_process/binary_classifier/classification_prediction/combined_mapping_and_classification_analysis.py new file mode 100644 index 0000000..f8900c9 --- /dev/null +++ b/post_process/binary_classifier/classification_prediction/combined_mapping_and_classification_analysis.py @@ -0,0 +1,81 @@ +# %% +import pandas as pd + +# following code computes final mapping + classification accuracy +# %% +def run(fold): + data_path = f'exports/result_group_{fold}.csv' + df = pd.read_csv(data_path, skipinitialspace=True) + p_mdm = df['p_mdm'] + + data_path = f'../../../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv' + df = pd.read_csv(data_path, skipinitialspace=True) + actual_mdm = df['MDM'] + + thing_correctness = df['thing'] == df['p_thing'] + property_correctness = df['property'] == df['p_property'] + answer = thing_correctness & property_correctness + + # if is non-MDM -> then should be unmapped + # if is MDM -> then should be mapped correctly + + # out of correctly predicted relevant data, how many are mapped correctly? 
+ correct_positive_mdm_and_map = sum(p_mdm & actual_mdm & answer) + + # number of correctly predicted non-relevant data + correct_negative_mdm = sum(~(p_mdm) & ~(actual_mdm)) + + overall_correct = (correct_positive_mdm_and_map + correct_negative_mdm)/len(actual_mdm) + print(overall_correct) +# %% +for fold in [1,2,3,4,5]: + run(fold) + +# %% +# check for "duplicates" in each ship +# we want to enforce a unique mapping +fold = 1 + +data_path = f'exports/result_group_{fold}.csv' +df = pd.read_csv(data_path, skipinitialspace=True) + +# get predicted mdm labels +p_mdm = df['p_mdm'].to_numpy() +predicted_mdm_mask = p_mdm.astype(bool) + +# %% +# get the mapped data +data_path = f'../../../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv' +df = pd.read_csv(data_path, skipinitialspace=True) +df['mapping'] = df['p_thing'] + ' ' + df['p_property'] + + +# get ship list +ship_list = sorted(list(set(df['ships_idx']))) + +# assign ship +ship = ship_list[1] + +ship_boolean_mask = df['ships_idx'] == ship + +# isolate predicted mdm data of the ship +ship_predicted_mdm_mask = predicted_mdm_mask & ship_boolean_mask + +mapping_list = df['mapping'][ship_predicted_mdm_mask].to_list() + +mapping_count = {} + +for mapping in mapping_list: + if mapping in mapping_count: + mapping_count[mapping] = mapping_count[mapping] + 1 + else: + mapping_count[mapping] = 1 + +# print the mapping count +mapping_count + +# %% +# we can take one of the elements that exceeded 1 mapping and check +df_ship = df[ship_predicted_mdm_mask] +df_ship[df_ship['mapping'] == 'GeneratorEngine2 RunningState'] +# %% diff --git a/post_process/binary_classifier/classification_prediction/output.txt b/post_process/binary_classifier/classification_prediction/output.txt index d45c89c..ecd3034 100644 --- a/post_process/binary_classifier/classification_prediction/output.txt +++ b/post_process/binary_classifier/classification_prediction/output.txt @@ -1,31 +1,51 @@ 
******************************************************************************** Fold: 1 -Accuracy: 0.95174 -F1 Score: 0.90912 -Precision: 0.91788 -Recall: 0.90092 +tp: 1808 +tn: 10692 +fp: 269 +fn: 305 +Accuracy: 0.95610 +F1 Score: 0.86301 +Precision: 0.87049 +Recall: 0.85566 ******************************************************************************** Fold: 2 -Accuracy: 0.95159 -F1 Score: 0.92593 -Precision: 0.91697 -Recall: 0.93574 +tp: 1932 +tn: 8304 +fp: 278 +fn: 208 +Accuracy: 0.95467 +F1 Score: 0.88828 +Precision: 0.87421 +Recall: 0.90280 ******************************************************************************** Fold: 3 -Accuracy: 0.95373 -F1 Score: 0.93021 -Precision: 0.91935 -Recall: 0.94233 +tp: 1789 +tn: 7613 +fp: 250 +fn: 203 +Accuracy: 0.95403 +F1 Score: 0.88762 +Precision: 0.87739 +Recall: 0.89809 ******************************************************************************** Fold: 4 -Accuracy: 0.96524 -F1 Score: 0.92902 -Precision: 0.91306 -Recall: 0.94702 +tp: 1967 +tn: 12929 +fp: 420 +fn: 135 +Accuracy: 0.96408 +F1 Score: 0.87636 +Precision: 0.82405 +Recall: 0.93578 ******************************************************************************** Fold: 5 -Accuracy: 0.95643 -F1 Score: 0.92319 -Precision: 0.91793 -Recall: 0.92869 +tp: 1915 +tn: 10381 +fp: 405 +fn: 268 +Accuracy: 0.94811 +F1 Score: 0.85054 +Precision: 0.82543 +Recall: 0.87723 diff --git a/post_process/binary_classifier/classification_prediction/predict.py b/post_process/binary_classifier/classification_prediction/predict.py index 4eb20cb..bcf3155 100644 --- a/post_process/binary_classifier/classification_prediction/predict.py +++ b/post_process/binary_classifier/classification_prediction/predict.py @@ -27,6 +27,9 @@ from tqdm import tqdm torch.set_float32_matmul_precision('high') + +BATCH_SIZE = 256 + # %% # %% @@ -158,7 +161,6 @@ def test(fold): actual_labels = [] - BATCH_SIZE = 64 dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False) for batch in 
tqdm(dataloader): # Inference in batches @@ -181,6 +183,17 @@ def test(fold): pred_labels.extend(predicted_class_ids) pred_labels = [tensor.item() for tensor in pred_labels] + pred_labels = np.array(pred_labels, dtype=bool) + + # append the mdm prediction to the test_df for analysis later + df_out = pd.DataFrame({ + 'p_mdm': pred_labels, + }) + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + test_df = pd.read_csv(data_path, skipinitialspace=True) + df_export = pd.concat([test_df, df_out], axis=1) + df_export.to_csv(f"exports/result_group_{fold}.csv", index=False) + # %% @@ -190,15 +203,23 @@ def test(fold): # Compute metrics accuracy = accuracy_score(y_true, y_pred) - f1 = f1_score(y_true, y_pred, average='macro') - precision = precision_score(y_true, y_pred, average='macro') - recall = recall_score(y_true, y_pred, average='macro') + f1 = f1_score(y_true, y_pred) + precision = precision_score(y_true, y_pred) + recall = recall_score(y_true, y_pred) + + cm = confusion_matrix(y_true, y_pred) + tn, fp, fn, tp = cm.ravel() with open("output.txt", "a") as f: + print('*' * 80, file=f) print(f'Fold: {fold}', file=f) # Print the results + print(f"tp: {tp}", file=f) + print(f"tn: {tn}", file=f) + print(f"fp: {fp}", file=f) + print(f"fn: {fn}", file=f) print(f'Accuracy: {accuracy:.5f}', file=f) print(f'F1 Score: {f1:.5f}', file=f) print(f'Precision: {precision:.5f}', file=f) diff --git a/post_process/binary_classifier/train.py b/post_process/binary_classifier/train.py index 8d1d307..58a8624 100644 --- a/post_process/binary_classifier/train.py +++ b/post_process/binary_classifier/train.py @@ -104,8 +104,8 @@ def train(fold): # prepare tokenizer - model_checkpoint = "distilbert/distilbert-base-uncased" - # model_checkpoint = 'google-bert/bert-base-uncased' + model_checkpoint = "distilbert/distilbert-base-cased" + # model_checkpoint = 'google-bert/bert-base-cased' tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", 
clean_up_tokenization_spaces=True) # Define additional special tokens additional_special_tokens = ["", "", "", "", "", "", "", "", ""] @@ -180,13 +180,13 @@ def train(fold): # save_strategy="epoch", load_best_model_at_end=False, learning_rate=1e-5, - per_device_train_batch_size=64, - per_device_eval_batch_size=64, + per_device_train_batch_size=128, + per_device_eval_batch_size=128, auto_find_batch_size=False, ddp_find_unused_parameters=False, weight_decay=0.01, save_total_limit=1, - num_train_epochs=40, + num_train_epochs=80, bf16=True, push_to_hub=False, remove_unused_columns=False, diff --git a/post_process/de_duplication/.gitignore b/post_process/de_duplication/.gitignore new file mode 100644 index 0000000..010fe90 --- /dev/null +++ b/post_process/de_duplication/.gitignore @@ -0,0 +1,2 @@ +output* +__pycache__ \ No newline at end of file diff --git a/post_process/de_duplication/run.py b/post_process/de_duplication/run.py new file mode 100644 index 0000000..d95b1f1 --- /dev/null +++ b/post_process/de_duplication/run.py @@ -0,0 +1,313 @@ +# %% +import pandas as pd +import os +import glob +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix +import numpy as np +from utils import T5Embedder, BertEmbedder, cosine_similarity_chunked +from tqdm import tqdm + +################## +# global parameters +DIAGNOSTIC = False +BATCH_SIZE = 1024 + +################### +# helper functions +class Embedder(): + input_df: pd.DataFrame + fold: int + + def __init__(self, input_df): + self.input_df = input_df + + + def make_embedding(self, checkpoint_path): + + def generate_input_list(df): + input_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + name = f"{row['tag_name']}" + element = f"{desc}{unit}{name}" + input_list.append(element) + return input_list + + # prepare reference embed + train_data = list(generate_input_list(self.input_df)) + # Define the directory and the pattern 
+ embedder = T5Embedder(train_data, checkpoint_path) + # embedder = BertEmbedder(train_data, checkpoint_path) + embedder.make_embedding(batch_size=BATCH_SIZE) + return embedder.embeddings + + + + +# the selection function takes in the full cos_sim_matrix then subsets the +# matrix according to the test_candidates_mask and train_candidates_mask that we +# give it +# it returns the most likely source candidate index and score among the source +# candidate list +# we then map the local idx to the ship-level idx +def selection(cos_sim_matrix, source_mask, target_mask): + # subset_matrix = cos_sim_matrix[condition_source] + # except we are subsetting 2D matrix (row, column) + subset_matrix = cos_sim_matrix[np.ix_(source_mask, target_mask)] + # we select top-k here + # Get the indices of the top-k maximum values along axis 1 + top_k = 1 + # returns a potential 2d matrix of which columns have the highest values + # top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values + # this partial sorts and ensures we care only top_k are correctly sorted + top_k_indices = np.argpartition(subset_matrix, -top_k, axis=1)[:, -top_k:] + + # Get the values of the top 5 maximum scores + top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1) + + # Calculate the average of the top-k scores along axis 1 + y_scores = np.mean(top_k_values, axis=1) + max_idx = np.argmax(y_scores) + max_score = y_scores[max_idx] + + # convert boolean to indices + condition_indices = np.where(source_mask)[0] + max_idx = condition_indices[max_idx] + + + return max_idx, max_score + + + +#################### +# global level +# obtain the full mdm_list +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +full_df['mapping'] = full_df['thing'] + ' ' + full_df['property'] +full_mdm_mapping_list = sorted(list((set(full_df['mapping'])))) + + +##################### +# fold level + +def run_selection(fold): 
+ + # set the fold + # import test data + # data_path = f"../binary_classifier/classification_prediction/exports/result_group_{fold}.csv" + data_path = f"../similarity_classifier/exports/result_group_{fold}.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + predicted_mdm = df['p_mdm'].to_numpy().astype(bool) + + data_path = f"../../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + df['p_mdm'] = predicted_mdm + df['p_mapping'] = df['p_thing'] + " " + df['p_property'] + + # get target data + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + train_df['mapping'] = train_df['thing'] + " " + train_df['property'] + + # generate your embeddings + # checkpoint_directory defined at global level + # checkpoint_directory = "../../train/classification_bert_pattern_desc_unit" + checkpoint_directory = "../../train/mapping_t5_complete_desc_unit_name" + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + # we can generate the train embeddings once and re-use for every ship + train_embedder = Embedder(input_df=train_df) + train_embeds = train_embedder.make_embedding(checkpoint_path) + + # generate new embeddings for each ship + test_embedder = Embedder(input_df=df) + global_test_embeds = test_embedder.make_embedding(checkpoint_path) + + + # create global_answer array + # the purpose of this array is to track the classification state at the global + # level + global_answer = np.zeros(len(df), dtype=bool) + + ############################# + # ship level + # we have to split into per-ship analysis + ships_list = 
sorted(list(set(df['ships_idx']))) + + for ship_idx in tqdm(ships_list): + # ship_df = df[df['ships_idx'] == ship_idx] + # required to map local ship_answer array to global_answer array + # map_local_index_to_global_index = ship_df.index.to_numpy() + + # we want to subset the ship and only p_mdm values + ship_mask = df['ships_idx'] == ship_idx + p_mdm_mask = df['p_mdm'] + map_local_index_to_global_index = np.where(ship_mask & p_mdm_mask)[0] + ship_df = df[ship_mask & p_mdm_mask].reset_index(drop=True) + + # subset the test embeds + test_embeds = global_test_embeds[map_local_index_to_global_index] + + # generate the cosine sim matrix for the ship level + cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=1024).cpu().numpy() + + ############################## + # selection level + # The general idea: + # step 1: keep only pattern generations that belong to mdm list + # -> this removes totally wrong datasets that mapped to totally wrong things + # step 2: loop through the mdm list and isolate data in both train and test that + # belong to the same pattern class + # -> this is more tricky, because we have non-mdm mapping to correct classes + # -> so we have to find which candidate is most similar to the training data + + # it is very tricky to keep track of classification across multiple stages so we + # will use a boolean answer list to map answers back to the global answer list + + # initialize the local answer list + ship_answer_list = np.ones(len(ship_df), dtype=bool) + + ########### + # STEP 1A: ensure that the predicted mapping labels are valid + pattern_match_mask = ship_df['p_mapping'].apply(lambda x: x in full_mdm_mapping_list).to_numpy() + pattern_match_mask = pattern_match_mask.astype(bool) + # anything not in the pattern_match_mask are hallucinations + # this has the same effect as setting any wrong generations as non-mdm + ship_answer_list[~pattern_match_mask] = False + + # # STEP 1B: subset our de-duplication to use only 
predicted_mdm labels + # p_mdm_mask = ship_df['p_mdm'] + # # assign false to any non p_mdm entries + # ship_answer_list[~p_mdm_mask] = False + # # modify pattern_match_mask to remove any non p_mdm values + # pattern_match_mask = pattern_match_mask & p_mdm_mask + + ########### + # STEP 2 + # we now go through each class found in our generated set + + # we want to identify per-ship mdm classes + ship_predicted_classes = sorted(set(ship_df['p_mapping'][pattern_match_mask].to_list())) + + # this function performs the selection given a class + # it takes in the cos_sim_matrix + # it returns the selection by mutating the answer_list + # it sets all relevant idxs to False initially, then sets the selected values to True + def selection_for_class(select_class, cos_sim_matrix, answer_list): + + # create local copy of answer_list + ship_answer_list = answer_list.copy() + # sample_df = ship_df[ship_df['p_mapping'] == select_class] + + + # we need to set all idx of chosen entries as False in answer_list -> assume wrong by default + # selected_idx_list = sample_df.index.to_numpy() + selected_idx_list = np.where(ship_df['p_mapping'] == select_class)[0] + + # basic assumption check + + # generate the masking arrays for both test and train embeddings + # we select a tuple from each group, and use that as a candidate for selection + test_candidates_mask = ship_df['p_mapping'] == select_class + # we make candidates to compare against in the data sharing the same class + train_candidates_mask = train_df['mapping'] == select_class + + if sum(train_candidates_mask) == 0: + # it can be the case that the mdm-valid mapping class is not found in training data + # print("not found in training data", select_class) + ship_answer_list[selected_idx_list] = False + return ship_answer_list + + # perform selection + # max_idx is the id + max_idx, max_score = selection(cos_sim_matrix, test_candidates_mask, train_candidates_mask) + + + # set the duplicate entries to False + 
ship_answer_list[selected_idx_list] = False + # then only set the one unique chosen value as True + ship_answer_list[max_idx] = True + + return ship_answer_list + + # we choose one mdm class + for select_class in ship_predicted_classes: + # this resulted in big improvement + if (sum(ship_df['p_mapping'] == select_class)) > 0: + ship_answer_list = selection_for_class(select_class, cos_sim_matrix, ship_answer_list) + + # we want to write back to global_answer + # first we convert local indices to global indices + ship_local_indices = np.where(ship_answer_list)[0] + ship_global_indices = map_local_index_to_global_index[ship_local_indices] + global_answer[ship_global_indices] = True + + + if DIAGNOSTIC: + # evaluation at per-ship level + y_true = ship_df['MDM'].to_list() + y_pred = ship_answer_list + tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() + print(f"tp: {tp}") + print(f"tn: {tn}") + print(f"fp: {fp}") + print(f"fn: {fn}") + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred) + precision = precision_score(y_true, y_pred) + recall = recall_score(y_true, y_pred) + + # Print the results + print(f'Accuracy: {accuracy:.5f}') + print(f'F1 Score: {f1:.5f}') + print(f'Precision: {precision:.5f}') + print(f'Recall: {recall:.5f}') + + + + with open("output.txt", "a") as f: + print(80 * '*', file=f) + print(f'Statistics for fold {fold}', file=f) + + y_true = df['MDM'].to_list() + y_pred = global_answer + + tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() + print(f"tp: {tp}", file=f) + print(f"tn: {tn}", file=f) + print(f"fp: {fp}", file=f) + print(f"fn: {fn}", file=f) + + # compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred) + precision = precision_score(y_true, y_pred) + recall = recall_score(y_true, y_pred) + + # print the results + print(f'accuracy: {accuracy:.5f}', file=f) + print(f'f1 score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + 
print(f'Recall: {recall:.5f}', file=f) + + +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +# %% +for fold in [1,2,3,4,5]: + print(f'Perform selection for fold {fold}') + run_selection(fold) + + +# %% diff --git a/post_process/de_duplication/utils.py b/post_process/de_duplication/utils.py new file mode 100644 index 0000000..2c1f6ee --- /dev/null +++ b/post_process/de_duplication/utils.py @@ -0,0 +1,132 @@ +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + AutoModelForSeq2SeqLM, + DataCollatorWithPadding, +) +import torch.nn.functional as F + + + +class BertEmbedder: + def __init__(self, input_texts, model_checkpoint): + # we need to generate the embedding from list of input strings + self.embeddings = [] + self.inputs = input_texts + model_checkpoint = model_checkpoint + self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # device = "cpu" + model.to(self.device) + self.model = model.eval() + + + def make_embedding(self, batch_size=64): + all_embeddings = self.embeddings + input_texts = self.inputs + + for i in range(0, len(input_texts), batch_size): + batch_texts = input_texts[i:i+batch_size] + # Tokenize the input text + inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=120) + input_ids = inputs.input_ids.to(self.device) + attention_mask = inputs.attention_mask.to(self.device) + + + # Pass the input through the encoder and retrieve the embeddings + with torch.no_grad(): + encoder_outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True) + # get last layer + embeddings = encoder_outputs.hidden_states[-1] + # get cls token embedding + cls_embeddings = embeddings[:, 0, 
:] # Shape: (batch_size, hidden_size) + all_embeddings.append(cls_embeddings) + + # remove the batch list and makes a single large tensor, dim=0 increases row-wise + all_embeddings = torch.cat(all_embeddings, dim=0) + + self.embeddings = all_embeddings + +class T5Embedder: + def __init__(self, input_texts, model_checkpoint): + # we need to generate the embedding from list of input strings + self.embeddings = [] + self.inputs = input_texts + model_checkpoint = model_checkpoint + self.tokenizer = AutoTokenizer.from_pretrained("t5-base", return_tensors="pt", clean_up_tokenization_spaces=True) + # define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) + self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") + # device = "cpu" + model.to(self.device) + self.model = model.eval() + + + + + def make_embedding(self, batch_size=128): + all_embeddings = self.embeddings + input_texts = self.inputs + + for i in range(0, len(input_texts), batch_size): + batch_texts = input_texts[i:i+batch_size] + # Tokenize the input text + inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128) + input_ids = inputs.input_ids.to(self.device) + attention_mask = inputs.attention_mask.to(self.device) + + + # Pass the input through the encoder and retrieve the embeddings + with torch.no_grad(): + encoder_outputs = self.model.encoder(input_ids, attention_mask=attention_mask) + embeddings = encoder_outputs.last_hidden_state + + # Compute the mean pooling of the token embeddings + # mean_embedding = embeddings.mean(dim=1) + mean_embedding = (embeddings * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(dim=1, keepdim=True) + all_embeddings.append(mean_embedding) 
+ + # remove the batch list and makes a single large tensor, dim=0 increases row-wise + all_embeddings = torch.cat(all_embeddings, dim=0) + + self.embeddings = all_embeddings + + +def cosine_similarity_chunked(batch1, batch2, chunk_size=1024): + device = 'cuda' + batch1_size = batch1.size(0) + batch2_size = batch2.size(0) + batch2.to(device) + + # Prepare an empty tensor to store results + cos_sim = torch.empty(batch1_size, batch2_size, device=device) + + # Process batch1 in chunks + for i in range(0, batch1_size, chunk_size): + batch1_chunk = batch1[i:i + chunk_size] # Get chunk of batch1 + + batch1_chunk.to(device) + # Expand batch1 chunk and entire batch2 for comparison + # batch1_chunk_exp = batch1_chunk.unsqueeze(1) # Shape: (chunk_size, 1, seq_len) + # batch2_exp = batch2.unsqueeze(0) # Shape: (1, batch2_size, seq_len) + batch2_norms = batch2.norm(dim=1, keepdim=True) + + + # Compute cosine similarity for the chunk and store it in the final tensor + # cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1) + + # Compute cosine similarity by matrix multiplication and normalizing + sim_chunk = torch.mm(batch1_chunk, batch2.T) / (batch1_chunk.norm(dim=1, keepdim=True) * batch2_norms.T + 1e-8) + + # Store the results in the appropriate part of the final tensor + cos_sim[i:i + chunk_size] = sim_chunk + + return cos_sim + diff --git a/post_process/selection/.gitignore b/post_process/selection/.gitignore index d4660dc..ed8ebf5 100644 --- a/post_process/selection/.gitignore +++ b/post_process/selection/.gitignore @@ -1,2 +1 @@ -__pycache__ -output.txt +__pycache__ \ No newline at end of file diff --git a/post_process/selection/output.txt b/post_process/selection/output.txt new file mode 100644 index 0000000..94ee7be --- /dev/null +++ b/post_process/selection/output.txt @@ -0,0 +1,41 @@ + +tp: 1738 +tn: 10744 +fp: 217 +fn: 375 +accuracy: 0.95472 +f1 score: 0.85447 +Precision: 0.88900 +Recall: 0.82253 +tp: 1794 +tn: 8302 +fp: 280 +fn: 346 
+accuracy: 0.94162 +f1 score: 0.85145 +Precision: 0.86500 +Recall: 0.83832 +tp: 1755 +tn: 7598 +fp: 265 +fn: 237 +accuracy: 0.94906 +f1 score: 0.87488 +Precision: 0.86881 +Recall: 0.88102 +tp: 1911 +tn: 13079 +fp: 270 +fn: 191 +accuracy: 0.97016 +f1 score: 0.89237 +Precision: 0.87620 +Recall: 0.90913 +tp: 1826 +tn: 10540 +fp: 246 +fn: 357 +accuracy: 0.95350 +f1 score: 0.85828 +Precision: 0.88127 +Recall: 0.83646 diff --git a/post_process/selection/run.py b/post_process/selection/run.py new file mode 100644 index 0000000..481795c --- /dev/null +++ b/post_process/selection/run.py @@ -0,0 +1,299 @@ +# %% +import pandas as pd +import os +import glob +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix +import numpy as np +from utils import T5Embedder, BertEmbedder, cosine_similarity_chunked +from tqdm import tqdm + +################## +# global parameters +DIAGNOSTIC = False +THRESHOLD = 0.95 +BATCH_SIZE = 1024 + +################### +# helper functions +class Embedder(): + input_df: pd.DataFrame + fold: int + + def __init__(self, input_df): + self.input_df = input_df + + + def make_embedding(self, checkpoint_path): + + def generate_input_list(df): + input_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + name = f"{row['tag_name']}" + element = f"{desc}{unit}{name}" + input_list.append(element) + return input_list + + # prepare reference embed + train_data = list(generate_input_list(self.input_df)) + # Define the directory and the pattern + embedder = T5Embedder(train_data, checkpoint_path) + # embedder = BertEmbedder(train_data, checkpoint_path) + embedder.make_embedding(batch_size=BATCH_SIZE) + return embedder.embeddings + + + + +# the selection function takes in the full cos_sim_matrix then subsets the +# matrix according to the test_candidates_mask and train_candidates_mask that we +# give it +# it returns the most likely source candidate index and score among 
the source +# candidate list +# we then map the local idx to the ship-level idx +def selection(cos_sim_matrix, source_mask, target_mask): + # subset_matrix = cos_sim_matrix[condition_source] + # except we are subsetting 2D matrix (row, column) + subset_matrix = cos_sim_matrix[np.ix_(source_mask, target_mask)] + # we select top-k here + # Get the indices of the top-k maximum values along axis 1 + top_k = 1 + # returns a potential 2d matrix of which columns have the highest values + top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values + + # Get the values of the top 5 maximum scores + top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1) + + # Calculate the average of the top-k scores along axis 1 + y_scores = np.mean(top_k_values, axis=1) + max_idx = np.argmax(y_scores) + max_score = y_scores[max_idx] + + # convert boolean to indices + condition_indices = np.where(source_mask)[0] + max_idx = condition_indices[max_idx] + + + return max_idx, max_score + + + +#################### +# global level +# obtain the full mdm_list +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +full_df['mapping'] = full_df['thing'] + ' ' + full_df['property'] +full_mdm_mapping_list = sorted(list((set(full_df['mapping'])))) + + +##################### +# fold level + +def run_selection(fold): + + # set the fold + # import test data + data_path = f"../../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv" + df = pd.read_csv(data_path, skipinitialspace=True) + # df['p_pattern'] = df['p_thing'] + " " + df['p_property'] + df['p_mapping'] = df['p_thing'] + " " + df['p_property'] + + # get target data + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + train_df['mapping'] = train_df['thing'] + " " + train_df['property'] + + # generate your 
def _confusion_counts(y_true, y_pred):
    """Return ``(tn, fp, fn, tp)`` for two boolean label sequences."""
    # Pin the label order: without it, inputs containing a single class give
    # a 1x1 matrix and the four-way unpacking raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[False, True]).ravel()
    return tn, fp, fn, tp


def _select_for_class(select_class, ship_df, train_df, cos_sim_matrix, answer_list):
    """Keep at most one ship row whose predicted mapping is ``select_class``.

    All rows of the class are first marked False (assumed wrong).  If the
    class also occurs in the training data, the row most similar to the
    training rows of that class is re-enabled, provided its score exceeds
    ``THRESHOLD``.

    Returns a modified copy of ``answer_list``; the input is not mutated.
    """
    updated_answers = answer_list.copy()

    # candidates on the ship (rows of cos_sim_matrix) and in training data
    # (columns of cos_sim_matrix) that share the same class
    test_candidates_mask = (ship_df['p_mapping'] == select_class).to_numpy()
    train_candidates_mask = (train_df['mapping'] == select_class).to_numpy()

    # duplicates are wrong by default
    updated_answers[test_candidates_mask] = False

    # an mdm-valid class may be absent from the training data; then there is
    # nothing to compare against and every row of the class stays False
    if not train_candidates_mask.any():
        return updated_answers

    max_idx, max_score = selection(cos_sim_matrix, test_candidates_mask, train_candidates_mask)

    # only accept the best candidate if it is close enough to the references
    if max_score > THRESHOLD:
        updated_answers[max_idx] = True

    return updated_answers


def run_selection(fold):
    """Run de-duplicating selection for one fold and append metrics to output.txt.

    For every ship in the fold's prediction file:
      step 1: rows whose predicted mapping is not in the MDM mapping list are
              marked non-MDM;
      step 2: for each predicted MDM class, only the row most similar to the
              training rows of that class is kept (subject to ``THRESHOLD``).
    The per-ship answers are written back into a fold-wide boolean array that
    is then scored against the ``MDM`` column.

    Args:
        fold: fold number used to locate the prediction file, the training
            data and the model checkpoint.

    Raises:
        FileNotFoundError: if no ``checkpoint-*`` directory exists for the fold.
    """
    # import test data
    data_path = f"../../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv"
    df = pd.read_csv(data_path, skipinitialspace=True)
    df['p_mapping'] = df['p_thing'] + " " + df['p_property']

    # get target data
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)
    train_df['mapping'] = train_df['thing'] + " " + train_df['property']

    # locate the checkpoint; training is expected to leave exactly one
    # checkpoint-* directory per fold
    checkpoint_directory = "../../train/mapping_t5_complete_desc_unit_name"
    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
    checkpoint_matches = glob.glob(os.path.join(directory, 'checkpoint-*'))
    if not checkpoint_matches:
        raise FileNotFoundError(f"no checkpoint-* directory found under {directory}")
    checkpoint_path = checkpoint_matches[0]

    # the train embeddings are generated once and re-used for every ship
    train_embedder = Embedder(input_df=train_df)
    train_embeds = train_embedder.make_embedding(checkpoint_path)

    # embeddings for all test rows; subset per ship below
    test_embedder = Embedder(input_df=df)
    global_test_embeds = test_embedder.make_embedding(checkpoint_path)

    # tracks the classification state at the global (fold) level
    global_answer = np.zeros(len(df), dtype=bool)

    # set membership instead of scanning the list once per row
    valid_mappings = set(full_mdm_mapping_list)

    #############################
    # ship level
    ships_list = sorted(set(df['ships_idx']))

    for ship_idx in tqdm(ships_list):
        # positions of this ship's rows in df; maps local answers to global
        map_local_index_to_global_index = np.where(df['ships_idx'] == ship_idx)[0]
        ship_df = df[df['ships_idx'] == ship_idx].reset_index(drop=True)

        # subset the test embeds and build the ship-level similarity matrix
        test_embeds = global_test_embeds[map_local_index_to_global_index]
        cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=1024).cpu().numpy()

        # local answer list: everything is MDM until proven otherwise
        ship_answer_list = np.ones(len(ship_df), dtype=bool)

        ###########
        # STEP 1: generations outside the MDM mapping list are hallucinations
        pattern_match_mask = ship_df['p_mapping'].isin(valid_mappings).to_numpy()
        ship_answer_list[~pattern_match_mask] = False

        ###########
        # STEP 2: per predicted MDM class, keep only the best candidate.
        # Every class below comes from ship_df itself, so each has >= 1 row.
        ship_predicted_classes = sorted(set(ship_df['p_mapping'][pattern_match_mask].to_list()))
        for select_class in ship_predicted_classes:
            ship_answer_list = _select_for_class(
                select_class, ship_df, train_df, cos_sim_matrix, ship_answer_list
            )

        # write back to global_answer via local -> global index conversion
        ship_local_indices = np.where(ship_answer_list)[0]
        ship_global_indices = map_local_index_to_global_index[ship_local_indices]
        global_answer[ship_global_indices] = True

        if DIAGNOSTIC:
            # evaluation at per-ship level
            y_true = ship_df['MDM'].to_list()
            y_pred = ship_answer_list
            tn, fp, fn, tp = _confusion_counts(y_true, y_pred)
            print(f"tp: {tp}")
            print(f"tn: {tn}")
            print(f"fp: {fp}")
            print(f"fn: {fn}")

            # Compute metrics
            accuracy = accuracy_score(y_true, y_pred)
            f1 = f1_score(y_true, y_pred)
            precision = precision_score(y_true, y_pred)
            recall = recall_score(y_true, y_pred)

            # Print the results
            print(f'Accuracy: {accuracy:.5f}')
            print(f'F1 Score: {f1:.5f}')
            print(f'Precision: {precision:.5f}')
            print(f'Recall: {recall:.5f}')

    # fold-level evaluation, appended so that all folds end up in one file
    with open("output.txt", "a") as f:
        print(80 * '*', file=f)
        print(f'Statistics for fold {fold}', file=f)

        y_true = df['MDM'].to_list()
        y_pred = global_answer

        tn, fp, fn, tp = _confusion_counts(y_true, y_pred)
        print(f"tp: {tp}", file=f)
        print(f"tn: {tn}", file=f)
        print(f"fp: {fp}", file=f)
        print(f"fn: {fn}", file=f)

        # compute metrics
        accuracy = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)

        # print the results
        print(f'accuracy: {accuracy:.5f}', file=f)
        print(f'f1 score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)
class BertEmbedder:
    """Embed a list of strings with the [CLS] vector of a sequence classifier.

    The tokenizer and model are both loaded from ``model_checkpoint``.  After
    ``make_embedding`` has run, ``self.embeddings`` holds one row per input
    text, taken from position 0 of the last hidden layer.
    """

    def __init__(self, input_texts, model_checkpoint):
        """Load tokenizer and model and move the model to GPU when available.

        Args:
            input_texts: list of strings to embed.
            model_checkpoint: path or hub name passed to ``from_pretrained``.
        """
        # filled by make_embedding()
        self.embeddings = []
        self.inputs = input_texts
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)

        model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(self.device)
        self.model = model.eval()

    def make_embedding(self, batch_size=64):
        """Compute embeddings for ``self.inputs`` and store them on ``self.embeddings``.

        The result is rebuilt from scratch on every call, so the method can be
        called repeatedly.  For empty input an empty ``(0, hidden_size)``
        tensor is stored (assumes the model config exposes ``hidden_size``).

        Args:
            batch_size: number of texts tokenized and encoded per forward pass.
        """
        # use a fresh local accumulator: self.embeddings becomes a tensor after
        # the first call and must not be appended to
        batch_embeddings = []
        input_texts = self.inputs

        for start in range(0, len(input_texts), batch_size):
            batch_texts = input_texts[start:start + batch_size]
            # Tokenize the input text
            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=120)
            input_ids = inputs.input_ids.to(self.device)
            attention_mask = inputs.attention_mask.to(self.device)

            # Pass the input through the model and retrieve the hidden states
            with torch.no_grad():
                encoder_outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
                # last layer, first token position ([CLS])
                last_layer = encoder_outputs.hidden_states[-1]
                cls_embeddings = last_layer[:, 0, :]  # (batch_size, hidden_size)
                batch_embeddings.append(cls_embeddings)

        if batch_embeddings:
            # stack the per-batch tensors row-wise into one large tensor
            self.embeddings = torch.cat(batch_embeddings, dim=0)
        else:
            # torch.cat rejects an empty list; keep a well-formed 2D result
            hidden_size = self.model.config.hidden_size
            self.embeddings = torch.empty((0, hidden_size), device=self.device)
@@ -54,6 +98,7 @@ class Retriever: self.embeddings = all_embeddings + def cosine_similarity_chunked(batch1, batch2, chunk_size=1024): device = 'cuda' batch1_size = batch1.size(0) diff --git a/post_process/selection_old/.gitignore b/post_process/selection_old/.gitignore new file mode 100644 index 0000000..d4660dc --- /dev/null +++ b/post_process/selection_old/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +output.txt diff --git a/post_process/selection/predict.py b/post_process/selection_old/predict.py similarity index 94% rename from post_process/selection/predict.py rename to post_process/selection_old/predict.py index c5ecbce..c5f9e91 100644 --- a/post_process/selection/predict.py +++ b/post_process/selection_old/predict.py @@ -1,13 +1,14 @@ +# %% import pandas as pd import os import glob # directory for checkpoints -checkpoint_directory = '../../train/mapping_with_unit' +checkpoint_directory = '../../train/mapping_t5_complete_desc_unit_name' def select(fold): # import test data - data_path = f"../../train/mapping_with_unit/mapping_prediction/exports/result_group_{fold}.csv" + data_path = f"../../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv" df = pd.read_csv(data_path, skipinitialspace=True) # get target data @@ -91,3 +92,5 @@ with open("output.txt", "w") as f: for fold in [1,2,3,4,5]: select(fold) + +# %% diff --git a/post_process/selection/selection.py b/post_process/selection_old/selection.py similarity index 95% rename from post_process/selection/selection.py rename to post_process/selection_old/selection.py index fdbf292..c292bc5 100644 --- a/post_process/selection/selection.py +++ b/post_process/selection_old/selection.py @@ -4,6 +4,12 @@ from typing import List from tqdm import tqdm from utils import Retriever, cosine_similarity_chunked + +# global parameters +THRESHOLD = 0.95 +BATCH_SIZE = 512 + +# class Selector(): input_df: pd.DataFrame reference_df: pd.DataFrame @@ -22,10 +28,10 @@ class Selector(): def 
BATCH_SIZE = 128


class Retriever:
    """Embed a list of strings with the mean-pooled T5 encoder output.

    The tokenizer is ``t5-base`` extended with the project's special tokens;
    the model is loaded from ``model_checkpoint``.  After
    ``make_mean_embedding`` has run, ``self.embeddings`` holds one row per
    input text.
    """

    def __init__(self, input_texts, model_checkpoint):
        """Load tokenizer and model and move the model to an available device.

        Args:
            input_texts: list of strings to embed.
            model_checkpoint: path or hub name passed to ``from_pretrained``.
        """
        # filled by make_mean_embedding()
        self.embeddings = []
        self.inputs = input_texts
        self.tokenizer = AutoTokenizer.from_pretrained("t5-base", return_tensors="pt", clean_up_tokenization_spaces=True)
        # define additional special tokens
        # NOTE(review): the token strings appear empty in this view, possibly
        # stripped by extraction -- confirm against the repository.
        additional_special_tokens = ["", "", "", "", "", "", "", "", ""]
        # add the additional special tokens to the tokenizer
        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

        model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
        # prefer the second GPU as before, but fall back to the first GPU on
        # single-GPU hosts (where "cuda:1" is an invalid device) and to CPU
        # when CUDA is unavailable
        if torch.cuda.is_available():
            gpu_index = 1 if torch.cuda.device_count() > 1 else 0
            self.device = torch.device(f"cuda:{gpu_index}")
        else:
            self.device = torch.device("cpu")
        model.to(self.device)
        self.model = model.eval()

    def make_mean_embedding(self, batch_size=BATCH_SIZE):
        """Compute embeddings for ``self.inputs`` and store them on ``self.embeddings``.

        Each text is embedded as the attention-mask-weighted mean of the
        encoder's last hidden state.  The result is rebuilt from scratch on
        every call.  For empty input an empty ``(0, hidden_size)`` tensor is
        stored (assumes the model config exposes ``hidden_size``).

        Args:
            batch_size: number of texts tokenized and encoded per forward pass.
        """
        # use a fresh local accumulator: self.embeddings becomes a tensor after
        # the first call and must not be appended to
        batch_embeddings = []
        input_texts = self.inputs

        for start in range(0, len(input_texts), batch_size):
            batch_texts = input_texts[start:start + batch_size]
            # Tokenize the input text
            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
            input_ids = inputs.input_ids.to(self.device)
            attention_mask = inputs.attention_mask.to(self.device)

            # Pass the input through the encoder and retrieve the embeddings
            with torch.no_grad():
                encoder_outputs = self.model.encoder(input_ids, attention_mask=attention_mask)
                embeddings = encoder_outputs.last_hidden_state

            # mean over tokens, ignoring padded positions (mask == 0)
            masked_sum = (embeddings * attention_mask.unsqueeze(-1)).sum(dim=1)
            token_counts = attention_mask.sum(dim=1, keepdim=True)
            batch_embeddings.append(masked_sum / token_counts)

        if batch_embeddings:
            # stack the per-batch tensors row-wise into one large tensor
            self.embeddings = torch.cat(batch_embeddings, dim=0)
        else:
            # torch.cat rejects an empty list; keep a well-formed 2D result
            hidden_size = self.model.config.hidden_size
            self.embeddings = torch.empty((0, hidden_size), device=self.device)
def cosine_similarity_chunked(batch1, batch2, chunk_size=1024, device=None):
    """Return the pairwise cosine similarity between rows of two 2D tensors.

    ``batch1`` is processed ``chunk_size`` rows at a time to bound the size
    of the intermediate products.

    Args:
        batch1: tensor of shape (n1, dim).
        batch2: tensor of shape (n2, dim).
        chunk_size: number of ``batch1`` rows handled per step; must be >= 1.
        device: device on which to compute and return the result.  Defaults
            to the device of ``batch1``, so the function also works on
            CPU-only hosts and with embeddings that live on a non-default GPU.

    Returns:
        Tensor of shape (n1, n2) on ``device``; entry (i, j) is the cosine
        similarity between ``batch1[i]`` and ``batch2[j]``.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # a negative step would skip the loop and return uninitialised memory
        raise ValueError("chunk_size must be >= 1")

    device = batch1.device if device is None else torch.device(device)

    batch1_size = batch1.size(0)
    batch2_size = batch2.size(0)

    # Tensor.to() is not in-place: the moved tensor has to be rebound
    batch2 = batch2.to(device)
    # loop-invariant, so computed once; shape (1, n2) after the transpose
    batch2_norms_t = batch2.norm(dim=1, keepdim=True).T
    batch2_t = batch2.T

    # Prepare an empty tensor to store results
    cos_sim = torch.empty(batch1_size, batch2_size, device=device)

    # Process batch1 in chunks
    for start in range(0, batch1_size, chunk_size):
        batch1_chunk = batch1[start:start + chunk_size].to(device)

        # dot products divided by the product of norms; the epsilon guards
        # against division by zero for all-zero rows
        chunk_norms = batch1_chunk.norm(dim=1, keepdim=True)
        sim_chunk = torch.mm(batch1_chunk, batch2_t) / (chunk_norms * batch2_norms_t + 1e-8)

        # Store the results in the appropriate part of the final tensor
        cos_sim[start:start + chunk_size] = sim_chunk

    return cos_sim
0.81657 -Recall: 0.85592 +Best threshold: 0.9 +Accuracy: 0.86941 +F1 Score: 0.74849 +Precision: 0.61280 +Recall: 0.96135 Fold: 4 -Best threshold: 0.9924999999999999 -Accuracy: 0.95334 -F1 Score: 0.82722 -Precision: 0.83341 -Recall: 0.82112 +Best threshold: 0.9 +Accuracy: 0.86325 +F1 Score: 0.65826 +Precision: 0.49865 +Recall: 0.96813 Fold: 5 -Best threshold: 0.9924999999999999 -Accuracy: 0.92968 -F1 Score: 0.77680 -Precision: 0.83395 -Recall: 0.72698 +Best threshold: 0.9 +Accuracy: 0.84147 +F1 Score: 0.66416 +Precision: 0.51612 +Recall: 0.93129 diff --git a/post_process/similarity_classifier/run.py b/post_process/similarity_classifier/run.py index 1f404ad..81562e5 100644 --- a/post_process/similarity_classifier/run.py +++ b/post_process/similarity_classifier/run.py @@ -110,27 +110,41 @@ def run_similarity_classifier(fold): sim_list.append(top_sim_value) # analysis 1: using threshold to perform find-back prediction success - threshold_values = np.linspace(0.85, 1.00, 21) # test 20 values, 21 to get nice round numbers - best_threshold = 0 - best_f1 = 0 - for threshold in threshold_values: - predict_list = [ elem > threshold for elem in sim_list ] + # threshold_values = np.linspace(0.85, 1.00, 21) # test 20 values, 21 to get nice round numbers + # best_threshold = 0 + # best_f1 = 0 + # for threshold in threshold_values: + # predict_list = [ elem > threshold for elem in sim_list ] - y_true = test_df['MDM'].to_list() - y_pred = predict_list + # y_true = test_df['MDM'].to_list() + # y_pred = predict_list - # Compute metrics - accuracy = accuracy_score(y_true, y_pred) - f1 = f1_score(y_true, y_pred) - precision = precision_score(y_true, y_pred) - recall = recall_score(y_true, y_pred) + # # Compute metrics + # accuracy = accuracy_score(y_true, y_pred) + # f1 = f1_score(y_true, y_pred) + # precision = precision_score(y_true, y_pred) + # recall = recall_score(y_true, y_pred) - if f1 > best_f1: - best_threshold = threshold - best_f1 = f1 + # if f1 > best_f1: + # 
best_threshold = threshold + # best_f1 = f1 + + # just manually set best_threshold + best_threshold = 0.90 # compute metrics again with best threshold predict_list = [ elem > best_threshold for elem in sim_list ] + + # save + pred_labels = np.array(predict_list, dtype=bool) + + # append the mdm prediction to the test_df for analysis later + df_out = pd.DataFrame({ + 'p_mdm': pred_labels, + }) + df_out.to_csv(f"exports/result_group_{fold}.csv", index=False) + + y_true = test_df['MDM'].to_list() y_pred = predict_list # Compute metrics diff --git a/train/classification_bert_complete_desc_unit_name/classification_prediction/output.txt b/train/classification_bert_complete_desc_unit_name/classification_prediction/output.txt index f892658..cbb850b 100644 --- a/train/classification_bert_complete_desc_unit_name/classification_prediction/output.txt +++ b/train/classification_bert_complete_desc_unit_name/classification_prediction/output.txt @@ -1,31 +1,31 @@ ******************************************************************************** Fold: 1 -Accuracy: 0.68859 -F1 Score: 0.62592 -Precision: 0.60775 -Recall: 0.68859 +Accuracy: 0.77142 +F1 Score: 0.70728 +Precision: 0.67509 +Recall: 0.77142 ******************************************************************************** Fold: 2 -Accuracy: 0.72150 -F1 Score: 0.65739 -Precision: 0.63652 -Recall: 0.72150 +Accuracy: 0.74065 +F1 Score: 0.68315 +Precision: 0.66680 +Recall: 0.74065 ******************************************************************************** Fold: 3 -Accuracy: 0.72038 -F1 Score: 0.65781 -Precision: 0.63249 -Recall: 0.72038 +Accuracy: 0.74849 +F1 Score: 0.68717 +Precision: 0.65975 +Recall: 0.74849 ******************************************************************************** Fold: 4 -Accuracy: 0.74167 -F1 Score: 0.68167 -Precision: 0.65489 -Recall: 0.74167 +Accuracy: 0.71836 +F1 Score: 0.65179 +Precision: 0.63155 +Recall: 0.71836 ******************************************************************************** 
Fold: 5 -Accuracy: 0.67705 -F1 Score: 0.61273 -Precision: 0.59472 -Recall: 0.67705 +Accuracy: 0.71461 +F1 Score: 0.65512 +Precision: 0.63375 +Recall: 0.71461 diff --git a/train/train.bash b/train/train.bash index 2ec766d..79bfe1b 100644 --- a/train/train.bash +++ b/train/train.bash @@ -1,12 +1,12 @@ #!/bin/bash -# cd classification_bert_complete_desc -# micromamba run -n hug accelerate launch train.py -# cd .. -# -# cd classification_bert_complete_desc_unit -# micromamba run -n hug accelerate launch train.py -# cd .. +cd classification_bert_complete_desc +micromamba run -n hug accelerate launch train.py +cd .. + +cd classification_bert_complete_desc_unit +micromamba run -n hug accelerate launch train.py +cd .. cd classification_bert_complete_desc_unit_name micromamba run -n hug accelerate launch train.py @@ -22,4 +22,4 @@ cd .. # # cd mapping_t5_complete_name_desc_unit # micromamba run -n hug accelerate launch train.py -# cd .. \ No newline at end of file +# cd ..