diff --git a/post_process/selection/.gitignore b/post_process/selection/.gitignore
index bee8a64..d4660dc 100644
--- a/post_process/selection/.gitignore
+++ b/post_process/selection/.gitignore
@@ -1 +1,2 @@
 __pycache__
+output.txt
diff --git a/post_process/selection/output.txt b/post_process/selection/output.txt
deleted file mode 100644
index 245955f..0000000
--- a/post_process/selection/output.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-
-********************************************************************************
-Statistics for fold 1
-tp: 1792
-tn: 10533
-fp: 428
-fn: 321
-fold: 1
-accuracy: 0.9427107235735047
-f1_score: 0.827140549273021
-precision: 0.8072072072072072
-recall: 0.8480832938949361
-********************************************************************************
-Statistics for fold 2
-tp: 1875
-tn: 8189
-fp: 393
-fn: 265
-fold: 2
-accuracy: 0.9386308524529006
-f1_score: 0.8507259528130672
-precision: 0.8267195767195767
-recall: 0.8761682242990654
-********************************************************************************
-Statistics for fold 3
-tp: 1831
-tn: 7455
-fp: 408
-fn: 161
-fold: 3
-accuracy: 0.9422628107559614
-f1_score: 0.8655164263767431
-precision: 0.8177757927646271
-recall: 0.9191767068273092
-********************************************************************************
-Statistics for fold 4
-tp: 1909
-tn: 12866
-fp: 483
-fn: 193
-fold: 4
-accuracy: 0.9562487864863116
-f1_score: 0.8495772140631954
-precision: 0.7980769230769231
-recall: 0.9081826831588963
-********************************************************************************
-Statistics for fold 5
-tp: 1928
-tn: 10359
-fp: 427
-fn: 255
-fold: 5
-accuracy: 0.9474130619168787
-f1_score: 0.8497135301895108
-precision: 0.818683651804671
-recall: 0.8831882730187814
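
Aside: every metric in the deleted output.txt follows directly from that fold's confusion counts, using the plain binary (positive-class) definitions. A quick standalone check against the fold-1 numbers:

# recompute the fold-1 metrics from the reported confusion counts
tp, tn, fp, fn = 1792, 10533, 428, 321

accuracy = (tp + tn) / (tp + tn + fp + fn)          # 0.94271...
precision = tp / (tp + fp)                          # 0.80720...
recall = tp / (tp + fn)                             # 0.84808...
f1 = 2 * precision * recall / (precision + recall)  # 0.82714...
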
diff --git a/post_process/selection/predict.py b/post_process/selection/predict.py
index 55c88f3..c5ecbce 100644
--- a/post_process/selection/predict.py
+++ b/post_process/selection/predict.py
@@ -3,11 +3,11 @@ import os
 import glob
 
 # directory for checkpoints
-checkpoint_directory = '../../train/baseline'
+checkpoint_directory = '../../train/mapping_with_unit'
 
 def select(fold):
     # import test data
-    data_path = f"../../train/mapping/exports/result_group_{fold}.csv"
+    data_path = f"../../train/mapping_with_unit/mapping_prediction/exports/result_group_{fold}.csv"
     df = pd.read_csv(data_path, skipinitialspace=True)
 
     # get target data
@@ -43,26 +43,6 @@ def select(fold):
 
     df = data_mapping
 
-    # we can save the t5 generation output here
-    # df.to_parquet(f"exports/fold_{fold}/t5_output.parquet")
-
-
-
-    # condition1 = df['MDM']
-    # condition2 = df['p_MDM']
-
-    # condition_correct_thing = df['p_thing'] == df['thing']
-    # condition_correct_property = df['p_property'] == df['property']
-    # match = sum(condition1 & condition2)
-    # fn = sum(condition1 & ~condition2)
-    # prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)
-
-    # print("mdm match predicted mdm: ", match) # 56 - false negative
-    # print("mdm but not predicted mdm: ", fn) # 56 - false negative
-    # print("total mdm: ", sum(condition1)) # 2113
-    # print("total predicted mdm: ", sum(condition2)) # 6896 - a lot of false positives
-    # print("correct mdm predicted", prediction_mdm_correct)
-
     # selection
     ###########################################
diff --git a/post_process/selection_with_pattern/run.py b/post_process/selection_with_pattern/run.py
index f42985e..c5285fb 100644
--- a/post_process/selection_with_pattern/run.py
+++ b/post_process/selection_with_pattern/run.py
@@ -5,39 +5,18 @@ import glob
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
 import numpy as np
 from utils import BertEmbedder, cosine_similarity_chunked
+from fuzzywuzzy import fuzz
+
+##################
+# global parameters
+DIAGNOSTIC = False
+THRESHOLD = 0.85
+FUZZY_SIM_THRESHOLD = 95
+checkpoint_directory = "../../train/classification_bert_desc"
+###################
 
 # %%
-# directory for checkpoints
-checkpoint_directory = '../../train/mapping_pattern'
-
-fold = 5
-# import test data
-data_path = f"../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv"
-df = pd.read_csv(data_path, skipinitialspace=True)
-
-# get target data
-data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
-train_df = pd.read_csv(data_path, skipinitialspace=True)
-# processing to help with selection later
-
-# %%
-df['p_pattern'] = df['p_thing'] + " " + df['p_property']
-
-# %%
-# obtain the full mdm_list
-data_path = '../../data_import/exports/data_mapping_mdm.csv'
-full_df = pd.read_csv(data_path, skipinitialspace=True)
-full_mdm_pattern_list = sorted(list((set(full_df['pattern']))))
-
-# %%
-# we have to split into per-ship analysis
-ships_list = sorted(list(set(df['ships_idx'])))
-# %%
-# for ship_idx in ships_list:
-ship_idx = 1009 # choose an example ship
-ship_df = df[df['ships_idx'] == ship_idx].reset_index(drop=True)
-
+# helper functions
 class Embedder():
     input_df: pd.DataFrame
     fold: int
@@ -65,101 +44,6 @@
 
 
 
-# %%
-data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
-train_df = pd.read_csv(data_path, skipinitialspace=True)
-
-checkpoint_directory = "../../train/classification_bert"
-directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
-# Use glob to find matching paths
-# path is usually checkpoint_fold_1/checkpoint-
-# we are guaranteed to save only 1 checkpoint from training
-pattern = 'checkpoint-*'
-checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
-
-train_embedder = Embedder(input_df=train_df)
-train_embeds = train_embedder.make_embedding(checkpoint_path)
-
-test_embedder = Embedder(input_df=ship_df)
-test_embeds = test_embedder.make_embedding(checkpoint_path)
-
-
-
-# %%
-# test embeds are inputs since we are looking back at train data
-cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy()
-
-
-
-# The general idea:
-# step 1: keep only pattern generations that belong to mdm list
-# -> this removes totally wrong datasets that mapped to totally wrong things
-# step 2: loop through the mdm list and isolate data in both train and test that
-# belong to the same pattern class
-# -> this is more tricky, because we have non-mdm mapping to correct classes
-# -> so we have to find which candidate is most similar to the training data
-
-# it is very tricky to keep track of classification across multiple stages so we
-# will use a boolean answer list
-
-# %%
-answer_list = np.ones(len(ship_df), dtype=bool)
-
-##########################################
-# %%
-# STEP 1
-# we want to loop through the the ship_df and find which ones match our full_mdm_list
-pattern_match_mask = ship_df['p_pattern'].apply(lambda x: x in full_mdm_pattern_list).to_numpy()
-# we assign only those that are False to our answer list
-# right now the 2 arrays are basically equal
-answer_list[~pattern_match_mask] = False
-
-# %% TEMP
-print('proportion belonging to mdm classes', sum(pattern_match_mask)/len(pattern_match_mask))
-
-# %% TEMP
-y_true = ship_df['MDM'].to_list()
-y_pred = pattern_match_mask
-
-# Compute metrics
-accuracy = accuracy_score(y_true, y_pred)
-print(f'Accuracy: {accuracy:.5f}')
-
-# we can see that the accuracy is not good
-# %%
-#########################################
-# STEP 2
-# we want to go through each mdm class label
-# but we do not want to make subsets of dataframes
-# we will make heavy use of boolean masks
-
-# we want to identify per-ship mdm classes
-ship_mdm_classes = sorted(set(ship_df['p_pattern'][pattern_match_mask].to_list()))
-
-# %%
-len(ship_mdm_classes)
-
-# %%
-for idx,select_class in enumerate(ship_mdm_classes):
-    print(idx, len(ship_df[ship_df['p_pattern'] == select_class]))
-
-# %%
-select_class = ship_mdm_classes[22]
-sample_df = ship_df[ship_df['p_pattern'] == select_class]
-
-# %%
-# we need to set all idx of chosen entries as False in answer_list
-selected_idx_list = sample_df.index.to_list()
-answer_list[selected_idx_list] = False
-
-# %%
-# because we have variants of a tag_description, we cannot choose 1 from the
-# given candidates we have to first group the candidates, and then choose which
-# group is most similar
-
-# %%
-from fuzzywuzzy import fuzz
-
 # the purpose of this function is to group the strings that are similar to each other
 # we need to form related groups of inputs
 def group_similar_strings(obj_list, threshold=80):
@@ -170,29 +54,16 @@ def group_similar_strings(obj_list, threshold=80):
         # tuple is (idx, string)
         if obj in processed_strings:
             continue
-
         # Find all strings similar to the current string above the threshold
         similar_strings = [s for s in obj_list if s[1] != obj[1] and fuzz.ratio(obj[1], s[1]) >= threshold]
-
         # Add the original string to the similar group
         similar_group = [obj] + similar_strings
-
         # Mark all similar strings as processed
         processed_strings.update(similar_group)
-
         # Add the group to the list of groups
         groups.append(similar_group)
-
     return groups
 
-# Example usage
-string_list = sample_df['tag_description'].to_list()
-index_list = sample_df.index.to_list()
-obj_list = list(zip(index_list, string_list))
-groups = group_similar_strings(obj_list, threshold=90)
-print(groups)
-
-# %%
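
Aside: the inline example removed above can be reproduced standalone. A minimal sketch with made-up tag descriptions, assuming fuzzywuzzy is installed and group_similar_strings from this patch is in scope:

from fuzzywuzzy import fuzz

# hypothetical tag descriptions: two near-duplicates and one distinct string
obj_list = [(0, "ME1 C.F.W. PUMP"), (1, "ME1 C.F.W PUMP"), (2, "GE2 L.O. TEMP")]

# pairwise ratios drive the grouping: near-duplicates score above the threshold
print(fuzz.ratio(obj_list[0][1], obj_list[1][1]))  # high (> 95)
print(fuzz.ratio(obj_list[0][1], obj_list[2][1]))  # low

# with the patch's FUZZY_SIM_THRESHOLD = 95 this yields two groups:
# [[(0, 'ME1 C.F.W. PUMP'), (1, 'ME1 C.F.W PUMP')], [(2, 'GE2 L.O. TEMP')]]
groups = group_similar_strings(obj_list, threshold=95)
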
@@ -203,21 +74,6 @@ def make_candidates(groups):
 # this function takes in groups of related terms and create candidate entries
 def make_candidates(groups):
     candidates = []
     for group in groups:
         id_of_tuple = group[0][0]
         candidates.append(id_of_tuple)
     return candidates
 
-# %%
-test_candidates = make_candidates(groups)
-test_candidates_mask = np.zeros(len(ship_df), dtype=bool)
-test_candidates_mask[test_candidates] = True
-
-# %%
-train_candidates_mask = (train_df['pattern'] == select_class).to_numpy()
-
-# %%
-# we need to make the cos_sim_matrix
-# for that, we need to generate the embeddings of the ship_df (test embedding)
-# and the train_df (train embeddin)
-
-# we then use the selection function using the given mask to choose the most
-# appropriate candidate
 
 # the selection function takes in the full cos_sim_matrix then subsets the
 # matrix according to the test_candidates_mask and train_candidates_mask that we
@@ -240,22 +96,212 @@ def selection(cos_sim_matrix, source_mask, target_mask):
     y_scores = np.mean(top_k_values, axis=1)
     max_idx = np.argmax(y_scores)
     max_score = y_scores[max_idx]
+    return max_idx, max_score
+
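
Aside: the core of the selection routine is to subset the cosine-similarity matrix to the candidate rows (test side) and the same-class columns (train side), average each row's top-k similarities, and pick the best row. A toy numpy sketch of that logic, with made-up matrix values and an assumed k of 2 (the actual k lives inside selection()):

import numpy as np

cos_sim_matrix = np.array([[0.9, 0.2, 0.8],
                           [0.4, 0.3, 0.5]])
source_mask = np.array([True, True])         # test-side candidates
target_mask = np.array([True, False, True])  # train rows of the same class

# subset rows then columns with the boolean masks
submatrix = cos_sim_matrix[source_mask][:, target_mask]

top_k = 2
# take each row's k largest similarities and average them
top_k_values = np.sort(submatrix, axis=1)[:, -top_k:]
y_scores = np.mean(top_k_values, axis=1)  # [0.85, 0.45]
max_idx = int(np.argmax(y_scores))        # 0 -> the first candidate group wins
max_score = y_scores[max_idx]             # 0.85, compared against THRESHOLD
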
+####################
+# global level
 # %%
-max_idx, max_score = selection(cos_sim_matrix, test_candidates_mask, train_candidates_mask)
+# obtain the full mdm_list
+data_path = '../../data_import/exports/data_mapping_mdm.csv'
+full_df = pd.read_csv(data_path, skipinitialspace=True)
+full_mdm_pattern_list = sorted(set(full_df['pattern']))
+
+
+#####################
+# fold level
+
+def run_selection(fold):
+
+    # import test data
+    data_path = f"../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+    df['p_pattern'] = df['p_thing'] + " " + df['p_property']
+
+    # get target data
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # generate the embeddings
+    # checkpoint_directory is defined at the global level
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+    # use glob to find matching paths
+    # the path is usually checkpoint_fold_1/checkpoint-
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+    # we can generate the train embeddings once and re-use them for every ship
+    train_embedder = Embedder(input_df=train_df)
+    train_embeds = train_embedder.make_embedding(checkpoint_path)
+
+    # create the global_answer array
+    # its purpose is to track the classification state at the fold level
+    global_answer = np.zeros(len(df), dtype=bool)
+
+    #############################
+    # ship level
+    # we have to split into per-ship analysis
+    ships_list = sorted(list(set(df['ships_idx'])))
+    for ship_idx in ships_list:
+        # ship_idx = 1001 # choose an example ship
+        ship_df = df[df['ships_idx'] == ship_idx]
+        # keep the original indices so that the local ship_answer array
+        # can be mapped back onto the global_answer array
+        map_local_index_to_global_index = ship_df.index.to_numpy()
+        ship_df = ship_df.reset_index(drop=True)
+
+        # generate new embeddings for each ship
+        test_embedder = Embedder(input_df=ship_df)
+        test_embeds = test_embedder.make_embedding(checkpoint_path)
+
+        # generate the cosine similarity matrix
+        cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy()
+
+        ##############################
+        # selection level
+        # The general idea:
+        # step 1: keep only pattern generations that belong to the mdm list
+        # -> this removes totally wrong datasets that mapped to totally wrong things
+        # step 2: loop through the mdm list and isolate data in both train and test that
+        # belong to the same pattern class
+        # -> this is more tricky, because we have non-mdm entries mapping to correct classes
+        # -> so we have to find which candidate is most similar to the training data
+
+        # it is very tricky to keep track of classification across multiple stages,
+        # so we use a boolean answer list
+
+        # initialize the local answer list
+        ship_answer_list = np.ones(len(ship_df), dtype=bool)
+
+        ###########
+        # STEP 1
+        # loop through the generated class labels and find which ones match
+        # our pattern list
+        pattern_match_mask = ship_df['p_pattern'].apply(lambda x: x in full_mdm_pattern_list).to_numpy()
+        # entries whose pattern is not in the mdm list are marked False;
+        # at this point the two arrays still agree
+        ship_answer_list[~pattern_match_mask] = False
+
+        ###########
+        # STEP 2
+        # we now go through each class found in our generated set
+
+        # identify the per-ship mdm classes
+        ship_predicted_classes = sorted(set(ship_df['p_pattern'][pattern_match_mask].to_list()))
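
Aside: STEP 1's membership test can also be expressed with pandas' built-in isin, which is equivalent to the apply/lambda form and avoids a Python-level loop. A small sketch with made-up patterns:

import pandas as pd

df = pd.DataFrame({'p_pattern': ['ME1 Temp', 'GE2 Press', 'XX Unknown']})
full_mdm_pattern_list = ['ME1 Temp', 'GE2 Press']

mask_apply = df['p_pattern'].apply(lambda x: x in full_mdm_pattern_list).to_numpy()
mask_isin = df['p_pattern'].isin(full_mdm_pattern_list).to_numpy()
assert (mask_apply == mask_isin).all()  # [True, True, False] either way
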
+        # this function performs the selection for a given class
+        # it takes in the cos_sim_matrix and the current answer_list
+        # it returns an updated copy of the answer_list: all idxs of the class are
+        # first set to False, then the selected values are set to True
+        def selection_for_class(select_class, cos_sim_matrix, answer_list):
+
+            # work on a copy so that the caller's array is not mutated
+            answer_list = answer_list.copy()
+            sample_df = ship_df[ship_df['p_pattern'] == select_class]
+
+            # we need to set all idx of chosen entries as False in answer_list
+            selected_idx_list = sample_df.index.to_list()
+            answer_list[selected_idx_list] = False
+
+            # group related inputs by description similarity
+            string_list = sample_df['tag_description'].to_list()
+            index_list = sample_df.index.to_list()
+            obj_list = list(zip(index_list, string_list))
+            # groups is a list of lists, where each inner list holds
+            # (idx, string) tuples
+            groups = group_similar_strings(obj_list, threshold=FUZZY_SIM_THRESHOLD)
+
+            # generate the masking arrays for both test and train embeddings
+            # we select one tuple from each group and use it as a candidate for selection
+            test_candidates = make_candidates(groups)
+            test_candidates_mask = np.zeros(len(ship_df), dtype=bool)
+            test_candidates_mask[test_candidates] = True
+            # the train-side candidates are the training entries sharing the same class
+            train_candidates_mask = (train_df['pattern'] == select_class).to_numpy()
+
+            # perform the selection
+            # it returns the index of the most likely group
+            max_idx, max_score = selection(cos_sim_matrix, test_candidates_mask, train_candidates_mask)
+
+            # consolidate all idx's in the same group
+            chosen_group = groups[max_idx]
+            chosen_idx_list = [t[0] for t in chosen_group]
+
+            # accept the chosen group only if max_score is close enough
+            if max_score > THRESHOLD:
+                answer_list[chosen_idx_list] = True
+
+            return answer_list
+
+        # run the selection for each predicted mdm class
+        for select_class in ship_predicted_classes:
+            ship_answer_list = selection_for_class(select_class, cos_sim_matrix, ship_answer_list)
+
+        # we want to write back to global_answer
+        # first we convert local indices to global indices
+        local_indices = np.where(ship_answer_list)[0]
+        global_indices = map_local_index_to_global_index[local_indices]
+        global_answer[global_indices] = True
+
+        if DIAGNOSTIC:
+            # evaluation at the per-ship level
+            y_true = ship_df['MDM'].to_list()
+            y_pred = ship_answer_list
+
+            # Compute metrics
+            accuracy = accuracy_score(y_true, y_pred)
+            f1 = f1_score(y_true, y_pred, average='macro')
+            precision = precision_score(y_true, y_pred, average='macro')
+            recall = recall_score(y_true, y_pred, average='macro')
+
+            # Print the results
+            print(f'Accuracy: {accuracy:.5f}')
+            print(f'F1 Score: {f1:.5f}')
+            print(f'Precision: {precision:.5f}')
+            print(f'Recall: {recall:.5f}')
+
+    y_true = df['MDM'].to_list()
+    y_pred = global_answer
+
+    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
+    print(f"tp: {tp}")
+    print(f"tn: {tn}")
+    print(f"fp: {fp}")
+    print(f"fn: {fn}")
+
+    # Compute metrics
+    accuracy = accuracy_score(y_true, y_pred)
+    f1 = f1_score(y_true, y_pred, average='macro')
+    precision = precision_score(y_true, y_pred, average='macro')
+    recall = recall_score(y_true, y_pred, average='macro')
+
+    # Print the results
+    print(f'Accuracy: {accuracy:.5f}')
+    print(f'F1 Score: {f1:.5f}')
+    print(f'Precision: {precision:.5f}')
+    print(f'Recall: {recall:.5f}')
+
 # %%
-# after obtaining best group, we set all candidates of the group as True
-chosen_group = groups[max_idx]
-chosen_idx = [tuple[0] for tuple in chosen_group]
-
-# %%
-# before doing this, we have to use the max_score and evaluate if its close enough
-THRESHOLD = 0.8
-if max_score > THRESHOLD:
-    answer_list[chosen_idx] = True
+for fold in [1,2,3,4,5]:
+    print(f'Perform selection for fold {fold}')
+    run_selection(fold)
+
 # %%
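
Aside: the fold-level report relies on sklearn flattening the binary confusion matrix in (tn, fp, fn, tp) order, since the labels sort as [False, True]. A quick standalone check:

from sklearn.metrics import confusion_matrix

y_true = [True, True, True, False, False]
y_pred = [True, True, False, True, False]
# for binary labels sorted [False, True], the matrix is [[tn, fp], [fn, tp]]
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(tn, fp, fn, tp)  # 1 1 1 2
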
diff --git a/post_process/selection_with_pattern/utils.py b/post_process/selection_with_pattern/utils.py
index 3205655..39f1b16 100644
--- a/post_process/selection_with_pattern/utils.py
+++ b/post_process/selection_with_pattern/utils.py
@@ -30,7 +30,7 @@ class BertEmbedder:
         for i in range(0, len(input_texts), batch_size):
             batch_texts = input_texts[i:i+batch_size]
             # Tokenize the input text
-            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64)
+            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=120)
             input_ids = inputs.input_ids.to(self.device)
             attention_mask = inputs.attention_mask.to(self.device)
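
Aside: raising max_length from 64 to 120 only affects inputs that tokenize past 64 tokens; shorter inputs are unchanged apart from padding. A standalone way to gauge the impact, using bert-base-uncased as a stand-in for the actual fine-tuned checkpoint:

from transformers import AutoTokenizer

# stand-in tokenizer; the patched BertEmbedder loads its own checkpoint instead
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# a deliberately long, made-up description (~100 tokens after repetition)
text = "NO1 MAIN ENGINE CYLINDER COOLING FRESH WATER INLET TEMPERATURE " * 8

enc64 = tokenizer([text], return_tensors="pt", truncation=True, max_length=64)
enc120 = tokenizer([text], return_tensors="pt", truncation=True, max_length=120)
# the first is capped at 64 token ids; the second keeps the longer tail
print(enc64.input_ids.shape, enc120.input_ids.shape)
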