From bb3ddfaa2fc00a2bcf813035cb7667d08cc0f66d Mon Sep 17 00:00:00 2001
From: Richard Wong <richard@richardwong.io>
Date: Mon, 11 Nov 2024 02:18:57 +0900
Subject: [PATCH] Feat: include basic ood similarity analysis using bert

---
 analysis/bert/utils.py             |  21 ++-
 analysis/categories/label_print.py |  12 ++
 analysis/t5/utils.py               |  21 ++-
 post_process/ood/.gitignore        |   1 +
 post_process/ood/similarity.py     | 288 +++++++++++++++++++++++++++++
 post_process/ood/utils.py          |  81 ++++++++
 post_process/selection/utils.py    |  21 ++-
 7 files changed, 430 insertions(+), 15 deletions(-)
 create mode 100644 analysis/categories/label_print.py
 create mode 100644 post_process/ood/.gitignore
 create mode 100644 post_process/ood/similarity.py
 create mode 100644 post_process/ood/utils.py

diff --git a/analysis/bert/utils.py b/analysis/bert/utils.py
index 7392376..f618b67 100644
--- a/analysis/bert/utils.py
+++ b/analysis/bert/utils.py
@@ -49,23 +49,34 @@ class Retriever:
 
         self.embeddings = all_embeddings
 
-def cosine_similarity_chunked(batch1, batch2, chunk_size=16):
+def cosine_similarity_chunked(batch1, batch2, chunk_size=1024):
+    """Compute pairwise cosine similarity between the rows of two 2-D tensors.
+
+    batch1 is processed chunk_size rows at a time; the result has shape
+    (batch1.size(0), batch2.size(0)) and is on CUDA when available, else CPU.
+    """
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
     batch1_size = batch1.size(0)
     batch2_size = batch2.size(0)
+    # Tensor.to() is not in-place: rebind, or the data silently stays put
+    batch2 = batch2.to(device)
+    # Row norms of batch2 are loop-invariant, so compute them once
+    batch2_norms = batch2.norm(dim=1, keepdim=True)
     
     # Prepare an empty tensor to store results
-    cos_sim = torch.empty(batch1_size, batch2_size, device=batch1.device)
+    cos_sim = torch.empty(batch1_size, batch2_size, device=device)
 
     # Process batch1 in chunks
     for i in range(0, batch1_size, chunk_size):
         batch1_chunk = batch1[i:i + chunk_size]  # Get chunk of batch1
         
+        batch1_chunk = batch1_chunk.to(device)
         # Expand batch1 chunk and entire batch2 for comparison
-        batch1_chunk_exp = batch1_chunk.unsqueeze(1)  # Shape: (chunk_size, 1, seq_len)
-        batch2_exp = batch2.unsqueeze(0)  # Shape: (1, batch2_size, seq_len)
         
         # Compute cosine similarity for the chunk and store it in the final tensor
-        cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1)
+        # Normalised matrix product; 1e-8 avoids division by zero for zero-norm rows
+        sim_chunk = torch.mm(batch1_chunk, batch2.T) / (batch1_chunk.norm(dim=1, keepdim=True) * batch2_norms.T + 1e-8)
+        cos_sim[i:i + chunk_size] = sim_chunk
     
     return cos_sim
 
diff --git a/analysis/categories/label_print.py b/analysis/categories/label_print.py
new file mode 100644
index 0000000..7d649de
--- /dev/null
+++ b/analysis/categories/label_print.py
@@ -0,0 +1,12 @@
+# %%
+# Build mdm_list: the sorted unique 'pattern' values of the MDM-only export.
+import pandas as pd
+
+data_path = '../../data_import/exports/data_mapping_mdm.csv'
+full_df = pd.read_csv(data_path, skipinitialspace=True)
+# a set removes duplicates; sorted() already returns a list
+mdm_list = sorted(set(full_df['pattern']))
+
+# %%
+mdm_list
+# %%
diff --git a/analysis/t5/utils.py b/analysis/t5/utils.py
index 12a0ac5..745445c 100644
--- a/analysis/t5/utils.py
+++ b/analysis/t5/utils.py
@@ -53,23 +53,34 @@ class Retriever:
 
         self.embeddings = all_embeddings
 
-def cosine_similarity_chunked(batch1, batch2, chunk_size=16):
+def cosine_similarity_chunked(batch1, batch2, chunk_size=1024):
+    """Compute pairwise cosine similarity between the rows of two 2-D tensors.
+
+    batch1 is processed chunk_size rows at a time; the result has shape
+    (batch1.size(0), batch2.size(0)) and is on CUDA when available, else CPU.
+    """
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
     batch1_size = batch1.size(0)
     batch2_size = batch2.size(0)
+    # Tensor.to() is not in-place: rebind, or the data silently stays put
+    batch2 = batch2.to(device)
+    # Row norms of batch2 are loop-invariant, so compute them once
+    batch2_norms = batch2.norm(dim=1, keepdim=True)
     
     # Prepare an empty tensor to store results
-    cos_sim = torch.empty(batch1_size, batch2_size, device=batch1.device)
+    cos_sim = torch.empty(batch1_size, batch2_size, device=device)
 
     # Process batch1 in chunks
     for i in range(0, batch1_size, chunk_size):
         batch1_chunk = batch1[i:i + chunk_size]  # Get chunk of batch1
         
+        batch1_chunk = batch1_chunk.to(device)
         # Expand batch1 chunk and entire batch2 for comparison
-        batch1_chunk_exp = batch1_chunk.unsqueeze(1)  # Shape: (chunk_size, 1, seq_len)
-        batch2_exp = batch2.unsqueeze(0)  # Shape: (1, batch2_size, seq_len)
         
         # Compute cosine similarity for the chunk and store it in the final tensor
-        cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1)
+        # Normalised matrix product; 1e-8 avoids division by zero for zero-norm rows
+        sim_chunk = torch.mm(batch1_chunk, batch2.T) / (batch1_chunk.norm(dim=1, keepdim=True) * batch2_norms.T + 1e-8)
+        cos_sim[i:i + chunk_size] = sim_chunk
     
     return cos_sim
 
diff --git a/post_process/ood/.gitignore b/post_process/ood/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/post_process/ood/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/post_process/ood/similarity.py b/post_process/ood/similarity.py
new file mode 100644
index 0000000..a11a16b
--- /dev/null
+++ b/post_process/ood/similarity.py
@@ -0,0 +1,288 @@
+# %%
+import pandas as pd
+from utils import Retriever, cosine_similarity_chunked
+import os
+import glob
+import numpy as np
+from tqdm import tqdm
+
+# %%
+fold = 1  # selects result_group_{fold}.csv here and the group_{fold} / checkpoint_fold_{fold} paths further down
+data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
+test_df = pd.read_csv(data_path, skipinitialspace=True)
+
+# %%
+class Embedder():
+    """Builds embeddings for the rows of a dataframe with a Retriever."""
+    input_df: pd.DataFrame
+    fold: int
+
+    def __init__(self, input_df):
+        self.input_df = input_df
+
+    def make_embedding(self, checkpoint_path):
+        """Embed every row of input_df and return the embeddings on the CPU.
+
+        Each row becomes the text "<DESC>{tag_description}<DESC><UNIT>{unit}<UNIT>",
+        which a Retriever loaded from checkpoint_path embeds in batches of 64.
+        """
+
+        def row_to_text(row):
+            desc = f"<DESC>{row['tag_description']}<DESC>"
+            unit = f"<UNIT>{row['unit']}<UNIT>"
+            return f"{desc}{unit}"
+
+        # one input string per dataframe row, in row order
+        texts = [row_to_text(row) for _, row in self.input_df.iterrows()]
+        retriever = Retriever(texts, checkpoint_path)
+        retriever.make_embedding(batch_size=64)
+        return retriever.embeddings.to('cpu')
+
+
+
+# %%
+data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+checkpoint_directory = "../../train/classification_bert"
+directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+# Use glob to find the checkpoint saved for this fold
+# path is usually checkpoint_fold_<fold>/checkpoint-<step number>
+# training is expected to save exactly one checkpoint; [0] raises IndexError if none exists
+pattern = 'checkpoint-*'
+checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+# the same checkpoint embeds train and test rows
+train_embedder = Embedder(input_df=train_df)
+train_embeds = train_embedder.make_embedding(checkpoint_path)
+
+test_embedder = Embedder(input_df=test_df)
+test_embeds = test_embedder.make_embedding(checkpoint_path)
+
+
+
+# %%
+# rows are test embeddings and columns are train embeddings: every test row is scored against all of train
+cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=1024).cpu().numpy()
+
+
+# %%
+# Find, for every selected source row, the top_k most similar target columns.
+# condition_source: boolean selectors of the source embedding (matrix rows)
+# condition_target: boolean selectors of the target embedding (matrix columns)
+def find_closest(cos_sim_matrix, condition_source, condition_target, top_k=3):
+    """Return (top_k_indices, top_k_values) for the selected sub-matrix.
+
+    Both arrays have one row per selected source and are ordered from most to
+    least similar. Indices are positions within the selected target columns,
+    so they equal positions in the full matrix only when every target is kept.
+    """
+    # np.ix_ builds the open mesh needed to subset rows and columns at once
+    subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
+    # argsort is ascending: keep the last top_k columns, then reverse every row
+    # (previously only row 0 was reversed, leaving other rows in ascending order)
+    top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:][:, ::-1]
+
+    # Gather the similarity scores that correspond to those indices
+    top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
+
+    return top_k_indices, top_k_values
+
+
+####################################################
+# special find-back code
+# %%
+def find_back_element_with_print(select_idx):
+    """Print the nearest training rows for one test row; return the find-back flag.
+
+    Returns True when the test row's pattern occurs in the pattern of any of its
+    top-k most similar training rows (per cos_sim_matrix), otherwise False.
+    """
+    # Select exactly the requested row. Matching on tag_description instead would
+    # also select other rows sharing the description, and result row 0 could then
+    # belong to one of those rows (the embedding also depends on the unit).
+    condition_source = test_df.index == select_idx
+    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
+
+    top_k_indices, top_k_values = find_closest(
+        cos_sim_matrix=cos_sim_matrix,
+        condition_source=condition_source,
+        condition_target=condition_target)
+
+    # every target is kept, so the indices are row positions in train_df
+    nearest_train = train_df.iloc[top_k_indices[0]]
+    training_data_pattern_list = nearest_train['pattern'].to_list()
+    test_row = test_df[condition_source]
+    test_data_pattern_list = test_row['pattern'].to_list()
+
+    print("*" * 80)
+    print("idx:", select_idx)
+    print("train desc", nearest_train['tag_description'].to_list())
+    print("train thing+property", training_data_pattern_list)
+    print("test desc", test_row['tag_description'].to_list())
+    print("test thing+property", test_data_pattern_list)
+    print("predicted thing+property", (test_row['p_thing'] + ' ' + test_row['p_property']).to_list()[0])
+    print("ships idx", test_row['ships_idx'].to_list()[0])
+    print("score:", top_k_values[0])
+
+    test_pattern = test_data_pattern_list[0]
+    # NOTE(review): `in` is a substring test on the pattern strings - confirm intended
+    return any(test_pattern in pattern for pattern in training_data_pattern_list)
+
+
+# %%
+def find_back_element(select_idx):
+    """Check whether the pattern of one test row can be found back in train.
+
+    Returns (in_train_flag, top_sim_value): the flag is True when the test row's
+    pattern occurs in the pattern of any of its top-k most similar training rows;
+    top_sim_value is the cosine similarity to the single closest training row.
+    """
+    # Select exactly the requested row. Matching on tag_description instead would
+    # also select other rows sharing the description, and result row 0 could then
+    # belong to one of those rows (the embedding also depends on the unit).
+    condition_source = test_df.index == select_idx
+    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
+
+    top_k_indices, top_k_values = find_closest(
+        cos_sim_matrix=cos_sim_matrix,
+        condition_source=condition_source,
+        condition_target=condition_target)
+
+    # every target is kept, so the indices are row positions in train_df
+    training_data_pattern_list = train_df.iloc[top_k_indices[0]]['pattern'].to_list()
+    # first (and, for a unique index, only) pattern of the selected test row
+    test_pattern = test_df[condition_source]['pattern'].to_list()[0]
+
+    # NOTE(review): `in` is a substring test on the pattern strings - confirm intended
+    in_train_flag = any(test_pattern in pattern for pattern in training_data_pattern_list)
+
+    return in_train_flag, top_k_values[0][0]
+
+# %%
+# Run the find-back check once per test row (tqdm only adds a progress bar).
+# in_train_list[i] is the find-back flag of the i-th test row and
+# sim_list[i] is its similarity to the closest training row.
+find_back_results = [find_back_element(select_idx) for select_idx in tqdm(test_df.index)]
+in_train_list = [in_train_flag for in_train_flag, _ in find_back_results]
+sim_list = [top_sim_value for _, top_sim_value in find_back_results]
+
+# analysis 1: how well does thresholding the top similarity predict find-back success?
+# %%
+threshold = 0.9
+predict_list = [ elem > threshold for elem in sim_list ]
+
+# %%
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
+y_true = in_train_list
+y_pred = predict_list
+
+# Compute metrics: y_true is the find-back flag, y_pred the thresholded similarity (macro-averaged)
+accuracy = accuracy_score(y_true, y_pred)
+f1 = f1_score(y_true, y_pred, average='macro')
+precision = precision_score(y_true, y_pred, average='macro')
+recall = recall_score(y_true, y_pred, average='macro')
+
+# Print the results
+print(f'Accuracy: {accuracy:.5f}')
+print(f'F1 Score: {f1:.5f}')
+print(f'Precision: {precision:.5f}')
+print(f'Recall: {recall:.5f}')
+
+# analysis 2: compare the top-similarity distributions of rows found back in train vs not
+# %%
+sim_list_true = []
+sim_list_false = []
+for idx, elem in enumerate(in_train_list):
+    # found back in train
+    if elem:
+        sim_list_true.append(sim_list[idx])
+    else:
+        sim_list_false.append(sim_list[idx])
+
+# %%
+import matplotlib.pyplot as plt
+
+# list1: similarities of found-back rows ('List 1'), list2: similarities of the rest ('List 2')
+list1 = sim_list_true
+list2 = sim_list_false
+
+# Plot overlaid histograms of raw counts
+bins = 50
+plt.hist(list1, bins=bins, alpha=0.5, label='List 1', density=False)
+plt.hist(list2, bins=bins, alpha=0.5, label='List 2', density=False)
+
+# Labels and legend
+plt.xlabel('Value')
+plt.ylabel('Frequency')
+plt.legend(loc='upper right')
+plt.title('Histograms of in-dist and out-dist similarities')
+
+# Show plot
+plt.show()
+
+# analysis 3
+# MDM result
+# The 'MDM' column is not an accurate target because the training and test
+# distributions are inconsistent,
+# e.g. training is a subset of MDM data, but test could contain MDM data not
+# found in train, therefore we cannot possibly achieve perfect prediction of
+# 'MDM' data
+
+# it is more accurate to use the result obtained from the find-back search
+# %%
+# number of test rows flagged MDM (2183 in the recorded run)
+sum(test_df['MDM'])
+# %%
+# number of test rows above the similarity threshold (3079 in the recorded run)
+sum(predict_list)
+# %%
+# number of test rows actually found back in train (2051 in the recorded run),
+# i.e. similar enough to the training distribution to find answers during find-back
+sum(in_train_list)
+# %%
+# out of predicted, 1947 are mdm
+# by setting a threshold, we are able to get 95% of 2051
+sum(test_df[predict_list]['MDM'])
+
+# %%
+# out of find-back labels, 2051 are mdm
+# this represents the limit of the data distributional differences
+sum(test_df[in_train_list]['MDM'])
+
+# analysis 4
+# check if similarity is different between mdm and non-mdm
+# this also checks the validity of the selection approach
+# %%
+sim_list_true = []
+sim_list_false = []
+in_mdm_list = test_df['MDM'].to_list()
+for idx, elem in enumerate(in_mdm_list):
+    # row is flagged MDM
+    if elem:
+        sim_list_true.append(sim_list[idx])
+    else:
+        sim_list_false.append(sim_list[idx])
+
+# %%
+import matplotlib.pyplot as plt
+
+# list1: similarities of MDM rows ('List 1'), list2: similarities of non-MDM rows ('List 2')
+list1 = sim_list_true
+list2 = sim_list_false
+
+# Plot overlaid histograms of raw counts
+bins = 50
+plt.hist(list1, bins=bins, alpha=0.5, label='List 1', density=False)
+plt.hist(list2, bins=bins, alpha=0.5, label='List 2', density=False)
+# NOTE(review): the title below is reused from analysis 2; here the split is MDM vs non-MDM - confirm
+# Labels and legend
+plt.xlabel('Value')
+plt.ylabel('Frequency')
+plt.legend(loc='upper right')
+plt.title('Histograms of in-dist and out-dist similarities')
+
+# Show plot
+plt.show()
+
+
+# %%
diff --git a/post_process/ood/utils.py b/post_process/ood/utils.py
new file mode 100644
index 0000000..98749be
--- /dev/null
+++ b/post_process/ood/utils.py
@@ -0,0 +1,81 @@
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding,
+)
+import torch.nn.functional as F
+
+
+
+class Retriever:
+    """Embeds a list of input strings with a sequence-classification checkpoint."""
+
+    def __init__(self, input_texts, model_checkpoint):
+        """Load tokenizer and model from model_checkpoint (CUDA if available), in eval mode."""
+        self.embeddings = []
+        self.inputs = input_texts
+        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+
+        model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model.to(self.device)
+        self.model = model.eval()
+
+    def make_embedding(self, batch_size=64):
+        """Embed self.inputs in batches and store the result in self.embeddings.
+
+        The embedding of a text is the last hidden state at token position 0;
+        self.embeddings becomes one tensor with a row per input text.
+        """
+        # Collect into a fresh list: self.embeddings is replaced by a tensor below,
+        # so appending to it (as before) would fail on a second call.
+        all_embeddings = []
+        input_texts = self.inputs
+
+        for i in range(0, len(input_texts), batch_size):
+            batch_texts = input_texts[i:i+batch_size]
+            # Tokenize the input text (padded per batch, truncated to 64 tokens)
+            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64)
+            input_ids = inputs.input_ids.to(self.device)
+            attention_mask = inputs.attention_mask.to(self.device)
+
+            # Pass the input through the encoder and retrieve the embeddings
+            with torch.no_grad():
+                encoder_outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
+                # hidden_states[-1] is the last layer; [:, 0, :] keeps the first token
+                all_embeddings.append(encoder_outputs.hidden_states[-1][:, 0, :])
+
+        # dim=0 joins the per-batch tensors row-wise into a single large tensor
+        self.embeddings = torch.cat(all_embeddings, dim=0)
+
+def cosine_similarity_chunked(batch1, batch2, chunk_size=1024):
+    """Compute pairwise cosine similarity between the rows of two 2-D tensors.
+
+    Args:
+        batch1: tensor of shape (n1, dim); processed chunk_size rows at a time.
+        batch2: tensor of shape (n2, dim); compared against every chunk in full.
+        chunk_size: number of batch1 rows multiplied against batch2 per step.
+
+    Returns:
+        Tensor of shape (n1, n2) on CUDA when available, otherwise on CPU.
+    """
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    batch1_size = batch1.size(0)
+    batch2_size = batch2.size(0)
+    # Tensor.to() is not in-place: rebind, or the data silently stays put
+    batch2 = batch2.to(device)
+    # Row norms of batch2 are loop-invariant, so compute them once
+    batch2_norms = batch2.norm(dim=1, keepdim=True)
+
+    # Prepare an empty tensor to store results
+    cos_sim = torch.empty(batch1_size, batch2_size, device=device)
+
+    # Process batch1 in chunks
+    for i in range(0, batch1_size, chunk_size):
+        batch1_chunk = batch1[i:i + chunk_size].to(device)
+        # Normalised matrix product; 1e-8 avoids division by zero for zero-norm rows
+        sim_chunk = torch.mm(batch1_chunk, batch2.T) / (batch1_chunk.norm(dim=1, keepdim=True) * batch2_norms.T + 1e-8)
+        cos_sim[i:i + chunk_size] = sim_chunk
+
+    return cos_sim
\ No newline at end of file
diff --git a/post_process/selection/utils.py b/post_process/selection/utils.py
index b2f2116..a59e8f2 100644
--- a/post_process/selection/utils.py
+++ b/post_process/selection/utils.py
@@ -54,23 +54,34 @@ class Retriever:
 
         self.embeddings = all_embeddings
 
-def cosine_similarity_chunked(batch1, batch2, chunk_size=16):
+def cosine_similarity_chunked(batch1, batch2, chunk_size=1024):
+    """Compute pairwise cosine similarity between the rows of two 2-D tensors.
+
+    batch1 is processed chunk_size rows at a time; the result has shape
+    (batch1.size(0), batch2.size(0)) and is on CUDA when available, else CPU.
+    """
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
     batch1_size = batch1.size(0)
     batch2_size = batch2.size(0)
+    # Tensor.to() is not in-place: rebind, or the data silently stays put
+    batch2 = batch2.to(device)
+    # Row norms of batch2 are loop-invariant, so compute them once
+    batch2_norms = batch2.norm(dim=1, keepdim=True)
     
     # Prepare an empty tensor to store results
-    cos_sim = torch.empty(batch1_size, batch2_size, device=batch1.device)
+    cos_sim = torch.empty(batch1_size, batch2_size, device=device)
 
     # Process batch1 in chunks
     for i in range(0, batch1_size, chunk_size):
         batch1_chunk = batch1[i:i + chunk_size]  # Get chunk of batch1
         
+        batch1_chunk = batch1_chunk.to(device)
         # Expand batch1 chunk and entire batch2 for comparison
-        batch1_chunk_exp = batch1_chunk.unsqueeze(1)  # Shape: (chunk_size, 1, seq_len)
-        batch2_exp = batch2.unsqueeze(0)  # Shape: (1, batch2_size, seq_len)
         
         # Compute cosine similarity for the chunk and store it in the final tensor
-        cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1)
+        # Normalised matrix product; 1e-8 avoids division by zero for zero-norm rows
+        sim_chunk = torch.mm(batch1_chunk, batch2.T) / (batch1_chunk.norm(dim=1, keepdim=True) * batch2_norms.T + 1e-8)
+        cos_sim[i:i + chunk_size] = sim_chunk
     
     return cos_sim