hipom_data_mapping/post_process/selection/selection.py

import pandas as pd
import numpy as np
from typing import List
from tqdm import tqdm

from utils import Retriever, cosine_similarity_chunked


class Selector:
    input_df: pd.DataFrame
    reference_df: pd.DataFrame
    ships_list: List[int]
    fold: int

    def __init__(self, input_df, reference_df, fold):
        self.ships_list = sorted(set(input_df['ships_idx']))
        self.input_df = input_df
        self.reference_df = reference_df
        self.fold = fold
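
    # a minimal usage sketch (illustrative; the checkpoint path is hypothetical):
    #   selector = Selector(input_df, reference_df, fold=0)
    #   tp, tn, fp, fn = selector.run_selection('checkpoints/fold_0')
    # input_df is expected to carry 'ships_idx', 'p_MDM', 'p_thing',
    # 'p_property', 'tag_description', and ground-truth 'MDM' columns;
    # reference_df needs 'thing_property' and 'tag_description'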
    def run_selection(self, checkpoint_path):

        def generate_input_list(df):
            input_list = []
            for _, row in df.iterrows():
                # only the tag description is embedded; the tag name was
                # deliberately dropped from the input format
                desc = f"<DESC>{row['tag_description']}<DESC>"
                input_list.append(desc)
            return input_list
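
        # e.g. a row whose tag_description is "MAIN ENGINE RPM" (hypothetical
        # value) is embedded as the string "<DESC>MAIN ENGINE RPM<DESC>"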
        # given the cosine-similarity matrix and boolean masks over its rows
        # (source) and columns (target), return the index of the source entry
        # with the highest match score, together with that score
        def selection(cos_sim_matrix, condition_source, condition_target):
            # np.ix_ subsets the 2D matrix on both axes (rows, columns)
            subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
            # get the indices of the top-k maximum values along axis 1
            top_k = 1
            top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:]
            top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
            # score each source entry by the mean of its top-k similarities
            y_scores = np.mean(top_k_values, axis=1)
            max_idx = np.argmax(y_scores)
            max_score = y_scores[max_idx]
            # map the argmax back from the subset to the full-matrix row index
            condition_indices = np.where(condition_source)[0]
            max_idx = condition_indices[max_idx]
            return max_idx, max_score
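
        # worked example of what selection() computes, assuming a toy 3x4
        # similarity matrix sim = np.arange(12).reshape(3, 4) / 12 with
        # condition_source = [True, False, True] and
        # condition_target = [False, True, False, True]:
        #   subset_matrix is [[1/12, 3/12], [9/12, 11/12]]; with top_k = 1 the
        #   per-row scores are [3/12, 11/12], so the local argmax is 1, which
        #   maps back through np.where(condition_source)[0] == [0, 2] to
        #   max_idx = 2 and max_score = 11/12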

        # prepare the reference (train) embeddings
        train_data = generate_input_list(self.reference_df)
        retriever_train = Retriever(train_data, checkpoint_path)
        retriever_train.make_mean_embedding(batch_size=64)
        train_embed = retriever_train.embeddings

        # embed the inputs that selection will be performed on
        test_data = generate_input_list(self.input_df)
        retriever_test = Retriever(test_data, checkpoint_path)
        retriever_test.make_mean_embedding(batch_size=64)
        test_embed = retriever_test.embeddings
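
        # at this point train_embed holds one mean-pooled vector per reference
        # row and test_embed one per input row; all selections below are driven
        # by cosine similarity between these two sets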

        # confusion-matrix accumulators across all ships
        tp_accumulate = 0
        tn_accumulate = 0
        fp_accumulate = 0
        fn_accumulate = 0
        THRESHOLD = 0.9

        for ship_idx in self.ships_list:
            print(ship_idx)
            # select one ship and keep only the entries whose prediction
            # exhibits the MDM pattern
            ship_mask = (self.input_df['ships_idx'] == ship_idx) & (self.input_df['p_MDM'])
            df_ship = self.input_df[ship_mask].reset_index(drop=True)
            # group the ship's entries by their predicted thing/property pair
            df_ship['thing_property'] = df_ship['p_thing'] + " " + df_ship['p_property']
            unique_patterns = list(set(df_ship['thing_property']))
            condition_list = []
            for pattern in unique_patterns:
                # boolean masks that subset the source (ship) and target
                # (reference) entries for this pattern
                condition_source = (df_ship['thing_property'] == pattern)
                condition_target = (self.reference_df['thing_property'] == pattern)
                condition_list.append({'condition_source': condition_source,
                                       'condition_target': condition_target})

            # take the rows of test_embed that belong to this ship
            test_embed_subset = test_embed[ship_mask]
            cos_sim_matrix = cosine_similarity_chunked(
                test_embed_subset, train_embed, chunk_size=8
            ).cpu().numpy()
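
            # cos_sim_matrix has shape (len(df_ship), len(self.reference_df)),
            # so the boolean masks built above index it directly on both axes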

            # for each pattern group, select the best candidate: the entry
            # whose description has the highest similarity with the reference
            all_idx_list = []
            selected_idx_list = []
            similarity_score = []
            for item in tqdm(condition_list):
                condition_source = item['condition_source']
                condition_target = item['condition_target']
                # if there is no equivalent data among the targets, skip
                if sum(condition_target) == 0:
                    continue
                # otherwise perform selection among the source entries by
                # top-k highest similarity with the targets; the returned idx
                # is with respect to the full cos_sim_matrix rows (df_ship)
                max_idx, max_score = selection(
                    cos_sim_matrix, condition_source, condition_target
                )
                all_idx_list.append(max_idx)
                similarity_score.append(max_score)
                # thresholding: keep only sufficiently confident matches
                if max_score > THRESHOLD:
                    selected_idx_list.append(max_idx)
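
            # all_idx_list holds every group's best match; selected_idx_list is
            # the subset of those that also cleared the similarity THRESHOLD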

            # tag df_ship with 'selected' flags and 'ood' similarity scores
            df_ship['selected'] = False
            df_ship.loc[all_idx_list, 'selected'] = True
            df_ship['ood'] = 0.0
            df_ship.loc[all_idx_list, 'ood'] = similarity_score

            # split the dataframe by p_MDM:
            # the ship's data was first separated into p_MDM and non-p_MDM
            # subsets; the final in-MDM predictions come only from the selected
            # entries of the p_MDM subset, while everything unselected, plus
            # the entire non-p_MDM subset, is predicted as not in MDM
            df_subset_predicted_true = df_ship.loc[selected_idx_list]
            # take the set difference between df_ship's index and the selection
            inverse_list = df_ship.index.difference(selected_idx_list).to_list()
            df_subset_predicted_false = df_ship.loc[inverse_list]

            # entries of this ship that never exhibited the MDM pattern
            not_p_mdm_mask = (self.input_df['ships_idx'] == ship_idx) & (~self.input_df['p_MDM'])
            df_not_p_mdm = self.input_df[not_p_mdm_mask].reset_index(drop=True)
            df_false = pd.concat([df_subset_predicted_false, df_not_p_mdm], axis=0)
            assert len(df_false) + len(df_subset_predicted_true) == sum(self.input_df['ships_idx'] == ship_idx)

            # a bit dirty, but we re-use the p_MDM field for the final prediction
            df_false['p_MDM'] = False
            df_subset_predicted_true['p_MDM'] = True
            # optionally save the ship-level result for later analysis
            # df_return = pd.concat([df_false, df_subset_predicted_true], axis=0)
            # df_return.to_parquet(f'exports/fold_{self.fold}/ship_{ship_idx}.parquet')

            # true positive: predicted in MDM, actually found in MDM
            true_positive = sum(df_subset_predicted_true['MDM'])
            # true negative: predicted not in MDM, not found in MDM
            true_negative = sum(~df_false['MDM'])
            # false positive: predicted in MDM, not found in MDM
            false_positive = sum(~df_subset_predicted_true['MDM'])
            # false negative: predicted not in MDM, found in MDM
            false_negative = sum(df_false['MDM'])
            tp_accumulate += true_positive
            tn_accumulate += true_negative
            fp_accumulate += false_positive
            fn_accumulate += false_negative

        total_sum = tp_accumulate + tn_accumulate + fp_accumulate + fn_accumulate
        # ensure that every entry of input_df is accounted for exactly once
        assert total_sum == len(self.input_df)
        return tp_accumulate, tn_accumulate, fp_accumulate, fn_accumulate
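
# precision and recall can be derived from the returned counts, for example:
#   tp, tn, fp, fn = selector.run_selection(checkpoint_path)
#   precision = tp / (tp + fp)
#   recall = tp / (tp + fn)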