hipom_data_mapping/post_process/selection/selection.py

import pandas as pd
import numpy as np
from typing import List
from tqdm import tqdm

from utils import Retriever, cosine_similarity_chunked


class Selector:
    input_df: pd.DataFrame
    reference_df: pd.DataFrame
    ships_list: List[int]
    fold: int

    def __init__(self, input_df, reference_df, fold):
        self.ships_list = sorted(set(input_df['ships_idx']))
        self.input_df = input_df
        self.reference_df = reference_df
        self.fold = fold
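
    # a minimal usage sketch (illustrative; the checkpoint path is hypothetical):
    #   selector = Selector(input_df, reference_df, fold=0)
    #   tp, tn, fp, fn = selector.run_selection('checkpoints/fold_0')
    # input_df is expected to carry 'ships_idx', 'p_MDM', 'p_thing',
    # 'p_property', 'tag_description', and ground-truth 'MDM' columns;
    # reference_df needs 'thing_property' and 'tag_description'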
    def run_selection(self, checkpoint_path):

        def generate_input_list(df):
            input_list = []
            for _, row in df.iterrows():
                # only the tag description is embedded; the tag name was
                # deliberately dropped from the input format
                desc = f"<DESC>{row['tag_description']}<DESC>"
                input_list.append(desc)
            return input_list
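
        # e.g. a row whose tag_description is "MAIN ENGINE RPM" (hypothetical
        # value) is embedded as the string "<DESC>MAIN ENGINE RPM<DESC>"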
        # given the cosine-similarity matrix and boolean masks over its rows
        # (source) and columns (target), return the index of the source entry
        # with the highest match score, together with that score
        def selection(cos_sim_matrix, condition_source, condition_target):
            # np.ix_ subsets the 2D matrix on both axes (rows, columns)
            subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
            # get the indices of the top-k maximum values along axis 1
            top_k = 1
            top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:]
            top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
            # score each source entry by the mean of its top-k similarities
            y_scores = np.mean(top_k_values, axis=1)
            max_idx = np.argmax(y_scores)
            max_score = y_scores[max_idx]
            # map the argmax back from the subset to the full-matrix row index
            condition_indices = np.where(condition_source)[0]
            max_idx = condition_indices[max_idx]
            return max_idx, max_score
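
        # worked example of what selection() computes, assuming a toy 3x4
        # similarity matrix sim = np.arange(12).reshape(3, 4) / 12 with
        # condition_source = [True, False, True] and
        # condition_target = [False, True, False, True]:
        #   subset_matrix is [[1/12, 3/12], [9/12, 11/12]]; with top_k = 1 the
        #   per-row scores are [3/12, 11/12], so the local argmax is 1, which
        #   maps back through np.where(condition_source)[0] == [0, 2] to
        #   max_idx = 2 and max_score = 11/12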

        # prepare the reference (train) embeddings
        train_data = generate_input_list(self.reference_df)
        retriever_train = Retriever(train_data, checkpoint_path)
        retriever_train.make_mean_embedding(batch_size=64)
        train_embed = retriever_train.embeddings

        # embed the inputs that selection will be performed on
        test_data = generate_input_list(self.input_df)
        retriever_test = Retriever(test_data, checkpoint_path)
        retriever_test.make_mean_embedding(batch_size=64)
        test_embed = retriever_test.embeddings
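
        # at this point train_embed holds one mean-pooled vector per reference
        # row and test_embed one per input row; all selections below are driven
        # by cosine similarity between these two sets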

        # confusion-matrix accumulators across all ships
        tp_accumulate = 0
        tn_accumulate = 0
        fp_accumulate = 0
        fn_accumulate = 0
        THRESHOLD = 0.9

        for ship_idx in self.ships_list:
            print(ship_idx)
            # select one ship and keep only the entries whose prediction
            # exhibits the MDM pattern
            ship_mask = (self.input_df['ships_idx'] == ship_idx) & (self.input_df['p_MDM'])
            df_ship = self.input_df[ship_mask].reset_index(drop=True)
            # group the ship's entries by their predicted thing/property pair
            df_ship['thing_property'] = df_ship['p_thing'] + " " + df_ship['p_property']
            unique_patterns = list(set(df_ship['thing_property']))
            condition_list = []
            for pattern in unique_patterns:
                # boolean masks that subset the source (ship) and target
                # (reference) entries for this pattern
                condition_source = (df_ship['thing_property'] == pattern)
                condition_target = (self.reference_df['thing_property'] == pattern)
                condition_list.append({'condition_source': condition_source,
                                       'condition_target': condition_target})

            # take the rows of test_embed that belong to this ship
            test_embed_subset = test_embed[ship_mask]
            cos_sim_matrix = cosine_similarity_chunked(
                test_embed_subset, train_embed, chunk_size=8
            ).cpu().numpy()
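
            # cos_sim_matrix has shape (len(df_ship), len(self.reference_df)),
            # so the boolean masks built above index it directly on both axes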

            # for each pattern group, select the best candidate: the entry
            # whose description has the highest similarity with the reference
            all_idx_list = []
            selected_idx_list = []
            similarity_score = []
            for item in tqdm(condition_list):
                condition_source = item['condition_source']
                condition_target = item['condition_target']
                # if there is no equivalent data among the targets, skip
                if sum(condition_target) == 0:
                    continue
                # otherwise perform selection among the source entries by
                # top-k highest similarity with the targets; the returned idx
                # is with respect to the full cos_sim_matrix rows (df_ship)
                max_idx, max_score = selection(
                    cos_sim_matrix, condition_source, condition_target
                )
                all_idx_list.append(max_idx)
                similarity_score.append(max_score)
                # thresholding: keep only sufficiently confident matches
                if max_score > THRESHOLD:
                    selected_idx_list.append(max_idx)
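
            # all_idx_list holds every group's best match; selected_idx_list is
            # the subset of those that also cleared the similarity THRESHOLD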

            # tag df_ship with 'selected' flags and 'ood' similarity scores
            df_ship['selected'] = False
            df_ship.loc[all_idx_list, 'selected'] = True
            df_ship['ood'] = 0.0
            df_ship.loc[all_idx_list, 'ood'] = similarity_score

            # split the dataframe by p_MDM:
            # the ship's data was first separated into p_MDM and non-p_MDM
            # subsets; the final in-MDM predictions come only from the selected
            # entries of the p_MDM subset, while everything unselected, plus
            # the entire non-p_MDM subset, is predicted as not in MDM
            df_subset_predicted_true = df_ship.loc[selected_idx_list]
            # take the set difference between df_ship's index and the selection
            inverse_list = df_ship.index.difference(selected_idx_list).to_list()
            df_subset_predicted_false = df_ship.loc[inverse_list]

            # entries of this ship that never exhibited the MDM pattern
            not_p_mdm_mask = (self.input_df['ships_idx'] == ship_idx) & (~self.input_df['p_MDM'])
            df_not_p_mdm = self.input_df[not_p_mdm_mask].reset_index(drop=True)
            df_false = pd.concat([df_subset_predicted_false, df_not_p_mdm], axis=0)
            assert len(df_false) + len(df_subset_predicted_true) == sum(self.input_df['ships_idx'] == ship_idx)

            # a bit dirty, but we re-use the p_MDM field for the final prediction
            df_false['p_MDM'] = False
            df_subset_predicted_true['p_MDM'] = True
            # optionally save the ship-level result for later analysis
            # df_return = pd.concat([df_false, df_subset_predicted_true], axis=0)
            # df_return.to_parquet(f'exports/fold_{self.fold}/ship_{ship_idx}.parquet')

            # true positive: predicted in MDM, actually found in MDM
            true_positive = sum(df_subset_predicted_true['MDM'])
            # true negative: predicted not in MDM, not found in MDM
            true_negative = sum(~df_false['MDM'])
            # false positive: predicted in MDM, not found in MDM
            false_positive = sum(~df_subset_predicted_true['MDM'])
            # false negative: predicted not in MDM, found in MDM
            false_negative = sum(df_false['MDM'])
            tp_accumulate += true_positive
            tn_accumulate += true_negative
            fp_accumulate += false_positive
            fn_accumulate += false_negative

        total_sum = tp_accumulate + tn_accumulate + fp_accumulate + fn_accumulate
        # ensure that every entry of input_df is accounted for exactly once
        assert total_sum == len(self.input_df)
        return tp_accumulate, tn_accumulate, fp_accumulate, fn_accumulate
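
# precision and recall can be derived from the returned counts, for example:
#   tp, tn, fp, fn = selector.run_selection(checkpoint_path)
#   precision = tp / (tp + fp)
#   recall = tp / (tp + fn)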