from typing import List

import numpy as np
import pandas as pd
from tqdm import tqdm

from utils import Retriever, cosine_similarity_chunked

# global parameters
# A candidate is kept as a final positive only when its score is strictly
# greater than this value.
THRESHOLD = 0.95
# Forwarded as ``batch_size`` to ``Retriever.make_mean_embedding``.
BATCH_SIZE = 512


class Selector:
    """Pick, per ship and per predicted pattern, the best-matching input row.

    For every ship, the rows flagged by ``p_MDM`` are grouped by the string
    ``p_thing + " " + p_property``.  Within each group a single row is chosen
    by its similarity to the reference rows carrying the same
    ``thing_property``, and it is kept as a positive prediction only when the
    score exceeds ``THRESHOLD``.  The result is scored against the ``MDM``
    column.
    """

    input_df: pd.DataFrame
    reference_df: pd.DataFrame
    ships_list: List[int]
    fold: int

    def __init__(self, input_df, reference_df, fold):
        """Store the frames and collect the sorted distinct ``ships_idx`` values.

        Args:
            input_df: Frame to evaluate.  ``run_selection`` reads the columns
                ``ships_idx``, ``p_MDM``, ``p_thing``, ``p_property``, ``MDM``,
                ``tag_name``, ``tag_description`` and ``unit``.
            reference_df: Frame to match against.  ``run_selection`` reads the
                columns ``thing_property``, ``tag_name``, ``tag_description``
                and ``unit``.
            fold: Fold identifier; only referenced by the (currently disabled)
                per-ship export in ``run_selection``.
        """
        self.ships_list = sorted(set(input_df['ships_idx']))
        self.input_df = input_df
        self.reference_df = reference_df
        self.fold = fold

    @staticmethod
    def _build_texts(df):
        """Return one string per row: tag name, description and unit joined
        with no separator."""
        return [
            f"{row['tag_name']}{row['tag_description']}{row['unit']}"
            for _, row in df.iterrows()
        ]

    @staticmethod
    def _select_best_source(cos_sim_matrix, condition_source, condition_target):
        """Find the source row that matches the selected targets best.

        Args:
            cos_sim_matrix: 2-D similarity matrix indexed as (source row,
                target row).
            condition_source: Boolean mask over the matrix rows.
            condition_target: Boolean mask over the matrix columns.

        Returns:
            ``(row_position, score)`` where ``row_position`` indexes the
            unmasked source rows and ``score`` is the mean of that row's
            ``top_k`` highest similarities among the selected targets.
        """
        # Restrict the matrix to the selected rows and columns in one step.
        block = cos_sim_matrix[np.ix_(condition_source, condition_target)]

        # Number of highest similarities averaged per source row.
        top_k = 1
        # argsort is ascending, so the last ``top_k`` columns hold the maxima.
        best_columns = np.argsort(block, axis=1)[:, -top_k:]
        best_values = np.take_along_axis(block, best_columns, axis=1)
        row_scores = np.mean(best_values, axis=1)

        winner = np.argmax(row_scores)
        # Translate the position inside the masked block back to a position
        # in the unmasked source rows.
        source_positions = np.where(condition_source)[0]
        return source_positions[winner], row_scores[winner]

    def run_selection(self, checkpoint_path):
        """Run the per-ship selection and accumulate a confusion matrix.

        Args:
            checkpoint_path: Passed unchanged to ``Retriever`` for both the
                reference and the input texts.

        Returns:
            Tuple ``(true_positive, true_negative, false_positive,
            false_negative)`` summed over all ships in ``self.ships_list``.

        Raises:
            AssertionError: If the rows of a ship, or of the whole input
                frame, are not all accounted for in the counts.
        """
        # Embed the reference texts.
        reference_texts = self._build_texts(self.reference_df)
        reference_retriever = Retriever(reference_texts, checkpoint_path)
        reference_retriever.make_mean_embedding(batch_size=BATCH_SIZE)
        reference_embed = reference_retriever.embeddings

        # Embed the input texts (all ships at once; subset per ship below).
        input_texts = self._build_texts(self.input_df)
        input_retriever = Retriever(input_texts, checkpoint_path)
        input_retriever.make_mean_embedding(batch_size=BATCH_SIZE)
        input_embed = input_retriever.embeddings

        tp_total = 0
        tn_total = 0
        fp_total = 0
        fn_total = 0

        for ship_idx in self.ships_list:
            print(ship_idx)

            belongs_to_ship = self.input_df['ships_idx'] == ship_idx

            # Only rows already predicted as MDM are candidates for selection.
            ship_mask = belongs_to_ship & (self.input_df['p_MDM'])
            df_ship = self.input_df[ship_mask].reset_index(drop=True)

            # Group key: predicted thing and property joined by a space.
            df_ship['thing_property'] = (
                df_ship['p_thing'] + " " + df_ship['p_property']
            )

            # One (source mask, target mask) pair per distinct predicted
            # pattern: source over this ship's rows, target over the
            # reference rows.
            pattern_masks = [
                (
                    df_ship['thing_property'] == pattern,
                    self.reference_df['thing_property'] == pattern,
                )
                for pattern in set(df_ship['thing_property'])
            ]

            # Similarities between this ship's candidate rows and every
            # reference row.
            ship_embed = input_embed[ship_mask]
            cos_sim_matrix = cosine_similarity_chunked(
                ship_embed, reference_embed, chunk_size=8
            ).cpu().numpy()

            candidate_rows = []    # best row of every pattern with targets
            candidate_scores = []  # score of each candidate row
            accepted_rows = []     # candidates whose score beats THRESHOLD
            for source_mask, target_mask in tqdm(pattern_masks):
                # Patterns absent from the reference data cannot be scored.
                if sum(target_mask) == 0:
                    continue

                row_idx, score = self._select_best_source(
                    cos_sim_matrix, source_mask, target_mask
                )
                candidate_rows.append(row_idx)
                candidate_scores.append(score)

                print(score)
                if score > THRESHOLD:
                    accepted_rows.append(row_idx)

            # Tag the ship frame with the per-pattern winners and their scores.
            df_ship['selected'] = False
            df_ship.loc[candidate_rows, 'selected'] = True
            df_ship['ood'] = 0.0
            df_ship.loc[candidate_rows, 'ood'] = candidate_scores

            # Final positives are the accepted rows.  Everything else counts
            # as negative: unaccepted p_MDM rows plus all non-p_MDM rows.
            predicted_true = df_ship.loc[accepted_rows]
            rejected_rows = df_ship.index.difference(accepted_rows).to_list()
            predicted_false_p_mdm = df_ship.loc[rejected_rows]

            not_p_mdm_mask = belongs_to_ship & (~self.input_df['p_MDM'])
            df_not_p_mdm = self.input_df[not_p_mdm_mask].reset_index(drop=True)

            predicted_false = pd.concat(
                [predicted_false_p_mdm, df_not_p_mdm], axis=0
            )
            # Every row of the ship must land in exactly one of the two sets.
            assert (
                len(predicted_false) + len(predicted_true)
                == sum(belongs_to_ship)
            )

            # Re-use the p_MDM field to carry the final prediction.
            predicted_false['p_MDM'] = False
            predicted_true['p_MDM'] = True

            # Per-ship export is currently disabled:
            # df_return = pd.concat([predicted_false, predicted_true], axis=0)
            # df_return.to_parquet(f'exports/fold_{self.fold}/ship_{ship_idx}.parquet')

            # Confusion-matrix counts against the MDM column.
            true_positive = sum(predicted_true['MDM'])
            true_negative = sum(~predicted_false['MDM'])
            false_positive = sum(~predicted_true['MDM'])
            false_negative = sum(predicted_false['MDM'])

            tp_total += true_positive
            tn_total += true_negative
            fp_total += false_positive
            fn_total += false_negative

        # Ensure that all entries are accounted for.
        total_sum = tp_total + tn_total + fp_total + fn_total
        assert total_sum == len(self.input_df)

        return tp_total, tn_total, fp_total, fn_total