From 481bcf88b75ac9b0b6854e3aebd75da1aad96e51 Mon Sep 17 00:00:00 2001
From: Richard Wong
Date: Thu, 12 Dec 2024 22:06:26 +0900
Subject: [PATCH] Feat: added embedding plot for coarse and fine-grained labels

---
 interpretation/fold_analysis_bert_complete.py |  53 ++++++-
 interpretation/fold_analysis_bert_pattern.py  |  50 ++++++-
 interpretation/fold_analysis_t5.py            | 140 +++++++++++++++++-
 interpretation/inference.py                   | 131 +++++++++++++++-
 4 files changed, 358 insertions(+), 16 deletions(-)

diff --git a/interpretation/fold_analysis_bert_complete.py b/interpretation/fold_analysis_bert_complete.py
index 29431cb..6360cea 100644
--- a/interpretation/fold_analysis_bert_complete.py
+++ b/interpretation/fold_analysis_bert_complete.py
@@ -45,6 +45,27 @@ def generate_labels(df, mdm_list):
 
 df['labels'] = generate_labels(df, mdm_list)
 
+# pattern labels
+patterns = df['pattern'].to_list()
+mdm_pattern_list = sorted(list(set(patterns)))
+
+def generate_pattern_labels(df, mdm_pattern_list):
+    output_list = []
+    for _, row in df.iterrows():
+        pattern = f"{row['pattern']}"
+        try:
+            index = mdm_pattern_list.index(pattern)
+        except ValueError:
+            print("Error: pattern not found in pattern list")
+            index = -1
+        output_list.append(index)
+
+    return output_list
+
+df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)
+
+
+
 # rank labels by counts
 top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
 
@@ -52,8 +73,6 @@ indices = df[df['labels'].isin(top_10_labels)].index.to_list()
 
 input_df = df.iloc[indices].reset_index(drop=True)
 
-# %%
-input_df
 
 # %%
 def run(step):
@@ -66,12 +85,13 @@ def run(step):
 
 # %%
 embeddings = (run(step=1200))
-labels = input_df['labels']
-
 # Reducing dimensions with t-SNE
 tsne = TSNE(n_components=2, random_state=0, perplexity=5)
 embeddings_2d = tsne.fit_transform(embeddings)
 
+# t-SNE plot with fine-grained (complete) labels
+labels = input_df['labels']
+
 # Create a color map from labels to colors
 unique_labels = np.unique(labels)
 colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
@@ -83,10 +103,33 @@ for label in unique_labels:
     idx = (labels == label)
     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
 
-plt.title('2D t-SNE Visualization of Embeddings')
+plt.title('2D t-SNE Visualization of Embeddings (fine-grained labels)')
 plt.xlabel('Component 1')
 plt.ylabel('Component 2')
 plt.legend(title='Group')
 plt.show()
 
+# %%
+# t-SNE plot with coarse-grained (pattern) labels
+labels = input_df['pattern_labels']
+
+# Create a color map from labels to colors
+unique_labels = np.unique(labels)
+colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
+label_to_color = dict(zip(unique_labels, colors))
+
+# Plotting
+plt.figure(figsize=(8, 6))
+for label in unique_labels:
+    idx = (labels == label)
+    plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
+
+plt.title('2D t-SNE Visualization of Embeddings (coarse-grained labels)')
+plt.xlabel('Component 1')
+plt.ylabel('Component 2')
+plt.legend(title='Group')
+plt.show()
+
+
 # %%
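The two plotting cells added above differ only in the label column and the title, and the same block is repeated in fold_analysis_bert_pattern.py and fold_analysis_t5.py below. A minimal helper along these lines (a sketch with a hypothetical plot_tsne function, not part of this patch; it reuses the variables the scripts already define) would capture the shared logic:

    import numpy as np
    import matplotlib.pyplot as plt

    def plot_tsne(embeddings_2d, labels, title):
        # one scatter call per label so each group gets its own color and legend entry
        unique_labels = np.unique(labels)
        colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
        plt.figure(figsize=(8, 6))
        for label, color in zip(unique_labels, colors):
            idx = (labels == label)
            plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1],
                        color=color, label=label, alpha=0.7)
        plt.title(title)
        plt.xlabel('Component 1')
        plt.ylabel('Component 2')
        plt.legend(title='Group')
        plt.show()

    plot_tsne(embeddings_2d, input_df['labels'],
              '2D t-SNE Visualization of Embeddings (fine-grained labels)')
    plot_tsne(embeddings_2d, input_df['pattern_labels'],
              '2D t-SNE Visualization of Embeddings (coarse-grained labels)')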
diff --git a/interpretation/fold_analysis_bert_pattern.py b/interpretation/fold_analysis_bert_pattern.py
index e15953b..8e3d1f0 100644
--- a/interpretation/fold_analysis_bert_pattern.py
+++ b/interpretation/fold_analysis_bert_pattern.py
@@ -45,6 +45,26 @@ def generate_labels(df, mdm_list):
 
 df['labels'] = generate_labels(df, mdm_list)
 
+# pattern labels
+patterns = df['pattern'].to_list()
+mdm_pattern_list = sorted(list(set(patterns)))
+
+def generate_pattern_labels(df, mdm_pattern_list):
+    output_list = []
+    for _, row in df.iterrows():
+        pattern = f"{row['pattern']}"
+        try:
+            index = mdm_pattern_list.index(pattern)
+        except ValueError:
+            print("Error: pattern not found in pattern list")
+            index = -1
+        output_list.append(index)
+
+    return output_list
+
+df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)
+
+
 # rank labels by counts
 top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
 
@@ -52,8 +72,6 @@ indices = df[df['labels'].isin(top_10_labels)].index.to_list()
 
 input_df = df.iloc[indices].reset_index(drop=True)
 
-# %%
-input_df
 
 # %%
 def run(step):
@@ -66,12 +84,13 @@ def run(step):
 
 # %%
 embeddings = (run(step=1200))
-labels = input_df['labels']
-
 # Reducing dimensions with t-SNE
 tsne = TSNE(n_components=2, random_state=0, perplexity=5)
 embeddings_2d = tsne.fit_transform(embeddings)
 
+# t-SNE plot with fine-grained (complete) labels
+labels = input_df['labels']
+
 # Create a color map from labels to colors
 unique_labels = np.unique(labels)
 colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
@@ -83,7 +102,28 @@ for label in unique_labels:
     idx = (labels == label)
     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
 
-plt.title('2D t-SNE Visualization of Embeddings')
+plt.title('2D t-SNE Visualization of Embeddings (fine-grained labels)')
+plt.xlabel('Component 1')
+plt.ylabel('Component 2')
+plt.legend(title='Group')
+plt.show()
+
+# %%
+# t-SNE plot with coarse-grained (pattern) labels
+labels = input_df['pattern_labels']
+
+# Create a color map from labels to colors
+unique_labels = np.unique(labels)
+colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
+label_to_color = dict(zip(unique_labels, colors))
+
+# Plotting
+plt.figure(figsize=(8, 6))
+for label in unique_labels:
+    idx = (labels == label)
+    plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
+
+plt.title('2D t-SNE Visualization of Embeddings (coarse-grained labels)')
 plt.xlabel('Component 1')
 plt.ylabel('Component 2')
 plt.legend(title='Group')
 plt.show()
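generate_pattern_labels resolves every row with a linear list.index lookup, so labeling costs O(rows x patterns). A vectorized sketch of the same mapping (not part of the patch; it assumes df and mdm_pattern_list as defined above, and silently maps missing patterns to -1 instead of printing an error):

    # build the pattern -> index mapping once, then apply it in a single pass
    pattern_to_index = {p: i for i, p in enumerate(mdm_pattern_list)}
    df['pattern_labels'] = df['pattern'].map(pattern_to_index).fillna(-1).astype(int)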
diff --git a/interpretation/fold_analysis_t5.py b/interpretation/fold_analysis_t5.py
index 40dad52..9144972 100644
--- a/interpretation/fold_analysis_t5.py
+++ b/interpretation/fold_analysis_t5.py
@@ -2,7 +2,7 @@
 # %%
 import pandas as pd
 import os
-from inference import Embedder_t5
+from inference import Inference, Embedder_t5_encoder, Embedder_t5_decoder
 import numpy as np
 from sklearn.manifold import TSNE
 import matplotlib.pyplot as plt
@@ -19,6 +19,7 @@ data_path = f"../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
 df = pd.read_csv(data_path, skipinitialspace=True)
 df = df[df['MDM']].reset_index(drop=True)
 
+# %%
 # get target data
 data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
 train_df = pd.read_csv(data_path, skipinitialspace=True)
@@ -45,6 +46,25 @@ def generate_labels(df, mdm_list):
 
 df['labels'] = generate_labels(df, mdm_list)
 
+# pattern labels
+patterns = df['pattern'].to_list()
+mdm_pattern_list = sorted(list(set(patterns)))
+
+def generate_pattern_labels(df, mdm_pattern_list):
+    output_list = []
+    for _, row in df.iterrows():
+        pattern = f"{row['pattern']}"
+        try:
+            index = mdm_pattern_list.index(pattern)
+        except ValueError:
+            print("Error: pattern not found in pattern list")
+            index = -1
+        output_list.append(index)
+
+    return output_list
+
+df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)
+
 # rank labels by counts
 top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
 
@@ -52,10 +72,11 @@ indices = df[df['labels'].isin(top_10_labels)].index.to_list()
 
 input_df = df.iloc[indices].reset_index(drop=True)
 
+
 # %%
 def run(step):
     checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
-    embedder = Embedder_t5(checkpoint_path)
+    embedder = Embedder_t5_encoder(checkpoint_path)
     embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
     embedder.create_embedding()
     embeddings = embedder.embeddings
@@ -63,12 +84,13 @@ def run(step):
 
 # %%
 embeddings = (run(step=1200))
-labels = input_df['labels']
-
 # Reducing dimensions with t-SNE
 tsne = TSNE(n_components=2, random_state=0, perplexity=5)
 embeddings_2d = tsne.fit_transform(embeddings)
 
+# t-SNE plot with fine-grained (complete) labels
+labels = input_df['labels']
+
 # Create a color map from labels to colors
 unique_labels = np.unique(labels)
 colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
@@ -80,10 +102,118 @@ for label in unique_labels:
     idx = (labels == label)
     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
 
-plt.title('2D t-SNE Visualization of Embeddings')
+plt.title('2D t-SNE Visualization of Embeddings (fine-grained labels)')
 plt.xlabel('Component 1')
 plt.ylabel('Component 2')
 plt.legend(title='Group')
 plt.show()
 
+# %%
+# t-SNE plot with coarse-grained (pattern) labels
+labels = input_df['pattern_labels']
+
+# Create a color map from labels to colors
+unique_labels = np.unique(labels)
+colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
+label_to_color = dict(zip(unique_labels, colors))
+
+# Plotting
+plt.figure(figsize=(8, 6))
+for label in unique_labels:
+    idx = (labels == label)
+    plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
+
+plt.title('2D t-SNE Visualization of Embeddings (coarse-grained labels)')
+plt.xlabel('Component 1')
+plt.ylabel('Component 2')
+plt.legend(title='Group')
+plt.show()
+
+##############################################
+# %%
+# demonstrate decoding to correct output
+step = 1200
+checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
+infer = Inference(checkpoint_path)
+infer.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
+thing_prediction_list, property_prediction_list = infer.generate()
+
+# add labels too
+# thing_actual_list, property_actual_list = decode_preds(pred_labels)
+# Convert the list to a Pandas DataFrame
+df_out = pd.DataFrame({
+    'p_thing': thing_prediction_list,
+    'p_property': property_prediction_list
+})
+# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
+input_df = pd.concat([input_df, df_out], axis=1)
+
+condition_correct_thing = input_df['p_thing'] == input_df['thing']
+condition_correct_property = input_df['p_property'] == input_df['property']
+prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property)
+pred_correct_proportion = prediction_mdm_correct/len(input_df)
+print(pred_correct_proportion)
+
+# %%
+input_df[['thing', 'p_thing', 'property', 'p_property']]
+
+
+# %%
+# def run(step):
+#     checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
+#     embedder = Embedder_t5_decoder(checkpoint_path)
+#     embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
+#     embedder.create_embedding()
+#     embeddings = embedder.embeddings
+#     return embeddings
+#
+# # %%
+# embeddings = (run(step=1200))
+# # Reducing dimensions with t-SNE
+# tsne = TSNE(n_components=2, random_state=0, perplexity=5)
+# embeddings_2d = tsne.fit_transform(embeddings)
+#
+# # t-SNE plot with fine-grained (complete) labels
+# labels = input_df['labels']
+#
+# # Create a color map from labels to colors
+# unique_labels = np.unique(labels)
+# colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
+# label_to_color = dict(zip(unique_labels, colors))
+#
+# # Plotting
+# plt.figure(figsize=(8, 6))
+# for label in unique_labels:
+#     idx = (labels == label)
+#     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
+#
+# plt.title('2D t-SNE Visualization of Embeddings (fine-grained labels)')
+# plt.xlabel('Component 1')
+# plt.ylabel('Component 2')
+# plt.legend(title='Group')
+# plt.show()
+#
+# # %%
+# # t-SNE plot with coarse-grained (pattern) labels
+# labels = input_df['pattern_labels']
+#
+# # Create a color map from labels to colors
+# unique_labels = np.unique(labels)
+# colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
+# label_to_color = dict(zip(unique_labels, colors))
+#
+# # Plotting
+# plt.figure(figsize=(8, 6))
+# for label in unique_labels:
+#     idx = (labels == label)
+#     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
+#
+# plt.title('2D t-SNE Visualization of Embeddings (coarse-grained labels)')
+# plt.xlabel('Component 1')
+# plt.ylabel('Component 2')
+# plt.legend(title='Group')
+# plt.show()
+
+
 # %%
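Once input_df carries the predictions alongside the coarse-grained pattern column, the overall exact-match proportion computed above can also be broken down per pattern; a sketch (not part of the patch, reusing the columns the demo section just created):

    # exact match requires both the thing and the property to be predicted correctly
    correct = ((input_df['p_thing'] == input_df['thing'])
               & (input_df['p_property'] == input_df['property']))
    # mean of a boolean series per pattern group = per-pattern accuracy
    print(correct.groupby(input_df['pattern']).mean().sort_values())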
diff --git a/interpretation/inference.py b/interpretation/inference.py
index 3983a22..d0cb340 100644
--- a/interpretation/inference.py
+++ b/interpretation/inference.py
@@ -14,6 +14,7 @@ import numpy as np
 
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 
+torch.set_float32_matmul_precision('high')
 
 class Inference():
     tokenizer: T5TokenizerFast
@@ -169,8 +170,136 @@ class Inference():
         thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
         return thing_prediction_list, property_prediction_list
 
+class Embedder_t5_decoder():
+    tokenizer: T5TokenizerFast
+    model: torch.nn.Module
+    dataloader: DataLoader
+    embeddings: list
 
-class Embedder_t5():
+    def __init__(self, checkpoint_path):
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+        self.embeddings = []
+
+    def _create_tokenizer(self):
+        # load tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # Define additional special tokens
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<DESC>", "<UNIT>", "SIG", "UNIT", "DATA_TYPE"]
+        # Add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        # load model
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # set model to eval
+        self.model = model.eval()
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output
+        """
+        print("preparing dataloader")
+        # convert each dataframe row into a dictionary
+        # outputs a list of dictionaries
+        def _process_df(df):
+            output_list = []
+            for _, row in df.iterrows():
+                desc = f"<DESC>{row['tag_description']}<DESC>"
+                unit = f"<UNIT>{row['unit']}<UNIT>"
f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + padding='max_length', + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, + num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def create_embedding(self): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + pred_labels = [] + + for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + # Manually create the decoder input (start token) + decoder_input_ids = self.tokenizer("", return_tensors="pt").input_ids + decoder_input_ids = torch.full((input_ids.size(0), len(decoder_input_ids)), self.model.config.decoder_start_token_id, dtype=torch.long).to(input_ids.device) + + attention_mask = attention_mask.to(device) + self.model.to(device) + + # Perform inference + with torch.no_grad(): + # encoder_outputs = self.model.encoder( + # input_ids, + # attention_mask=attention_mask) + # # Use the hidden state of the first token as the sequence representation + # # pooled_output = encoder_outputs.last_hidden_state[:, 0, :] # Shape: (batch_size, hidden_size) + # outputs = self.model.decoder( + # input_ids=decoder_input_ids, + # encoder_hidden_states=encoder_outputs.last_hidden_state) + outputs = self.model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + first_token_logits = outputs.decoder_hidden_states[-1][:,-1,:] + self.embeddings.append(first_token_logits.to('cpu')) + + + self.embeddings = torch.cat(self.embeddings, dim=0) + + + +class Embedder_t5_encoder(): tokenizer: T5TokenizerFast model: torch.nn.Module dataloader: DataLoader