Feat: added embedding plot for coarse and fine-grained labels

2024-12-12 22:06:26 +09:00 · 2024-12-12 22:06:26 +09:00 · 481bcf88b7
parent c64e4bccfc
commit 481bcf88b7
4 changed files with 358 additions and 16 deletions
--- a/interpretation/fold_analysis_bert_complete.py
+++ b/interpretation/fold_analysis_bert_complete.py
@ -45,6 +45,27 @@ def generate_labels(df, mdm_list):
 df['labels'] = generate_labels(df, mdm_list)
 # pattern labels
 patterns = df['pattern'].to_list()
 mdm_pattern_list = sorted(list(set(patterns)))
 def generate_pattern_labels(df, mdm_pattern_list):
    """Map every row's 'pattern' value to its position in mdm_pattern_list.

    *arguments*
    - df: dataframe with a 'pattern' column
    - mdm_pattern_list: list of known pattern values; the position of a
      value in this list is its label

    *returns*
    A list with one integer per row of df, in row order. A row whose
    pattern (formatted as a string) is not in mdm_pattern_list gets -1 and
    an error line is printed for it.
    """
    # Build the lookup once instead of scanning the list for every row.
    # setdefault keeps the first position of a repeated value, which is what
    # list.index would have returned.
    index_by_pattern = {}
    for position, known_pattern in enumerate(mdm_pattern_list):
        index_by_pattern.setdefault(known_pattern, position)

    output_list = []
    # Iterating the column directly avoids building a Series per row.
    for value in df['pattern']:
        index = index_by_pattern.get(f"{value}", -1)
        if index == -1:
            print("Error: value not found in MDM list")
        output_list.append(index)
    return output_list
 df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)
 # rank labels by counts
 top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
@ -52,8 +73,6 @@ indices = df[df['labels'].isin(top_10_labels)].index.to_list()
 input_df = df.iloc[indices].reset_index(drop=True)
 # %%
 input_df
 # %%
 def run(step):
@ -66,12 +85,13 @@ def run(step):
 # %%
 embeddings = (run(step=1200))
 labels = input_df['labels']
 # Reducing dimensions with t-SNE
 tsne = TSNE(n_components=2, random_state=0, perplexity=5)
 embeddings_2d = tsne.fit_transform(embeddings)
 # t-sne plot with complete labels
 labels = input_df['labels']
 # Create a color map from labels to colors
 unique_labels = np.unique(labels)
 colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
@ -83,10 +103,33 @@ for label in unique_labels:
    idx = (labels == label)
    plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
-plt.title('2D t-SNE Visualization of Embeddings')
+plt.title('2D t-SNE Visualization of Embeddings of fine-grained labels')
 plt.xlabel('Component 1')
 plt.ylabel('Component 2')
 plt.legend(title='Group')
 plt.show()
 # %%
 # t-sne plot with pattern labels
 # Re-colors the existing 2-D projection (embeddings_2d) by the
 # 'pattern_labels' column; t-SNE is not re-fit in this cell, so point
 # positions are the same as in the fine-grained plot and only colors differ.
 labels = input_df['pattern_labels']
 # Create a color map from labels to colors
 unique_labels = np.unique(labels)
 # one evenly spaced color from the jet colormap per distinct label
 colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
 label_to_color = dict(zip(unique_labels, colors))
 # Plotting
 plt.figure(figsize=(8, 6))
 for label in unique_labels:
    # boolean mask selecting the rows that carry this label
    idx = (labels == label)
    plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
 plt.title('2D t-SNE Visualization of Embeddings of coarse-grained labels')
 plt.xlabel('Component 1')
 plt.ylabel('Component 2')
 plt.legend(title='Group')
 plt.show()
 # %%
--- a/interpretation/fold_analysis_bert_pattern.py
+++ b/interpretation/fold_analysis_bert_pattern.py
@ -45,6 +45,26 @@ def generate_labels(df, mdm_list):
 df['labels'] = generate_labels(df, mdm_list)
 # pattern labels
 patterns = df['pattern'].to_list()
 mdm_pattern_list = sorted(list(set(patterns)))
 def generate_pattern_labels(df, mdm_pattern_list):
    """Map every row's 'pattern' value to its position in mdm_pattern_list.

    *arguments*
    - df: dataframe with a 'pattern' column
    - mdm_pattern_list: list of known pattern values; the position of a
      value in this list is its label

    *returns*
    A list with one integer per row of df, in row order. A row whose
    pattern (formatted as a string) is not in mdm_pattern_list gets -1 and
    an error line is printed for it.
    """
    # Build the lookup once instead of scanning the list for every row.
    # setdefault keeps the first position of a repeated value, which is what
    # list.index would have returned.
    index_by_pattern = {}
    for position, known_pattern in enumerate(mdm_pattern_list):
        index_by_pattern.setdefault(known_pattern, position)

    output_list = []
    # Iterating the column directly avoids building a Series per row.
    for value in df['pattern']:
        index = index_by_pattern.get(f"{value}", -1)
        if index == -1:
            print("Error: value not found in MDM list")
        output_list.append(index)
    return output_list
 df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)
 # rank labels by counts
 top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
@ -52,8 +72,6 @@ indices = df[df['labels'].isin(top_10_labels)].index.to_list()
 input_df = df.iloc[indices].reset_index(drop=True)
 # %%
 input_df
 # %%
 def run(step):
@ -66,12 +84,13 @@ def run(step):
 # %%
 embeddings = (run(step=1200))
 labels = input_df['labels']
 # Reducing dimensions with t-SNE
 tsne = TSNE(n_components=2, random_state=0, perplexity=5)
 embeddings_2d = tsne.fit_transform(embeddings)
 # t-sne plot with complete labels
 labels = input_df['labels']
 # Create a color map from labels to colors
 unique_labels = np.unique(labels)
 colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
@ -83,7 +102,28 @@ for label in unique_labels:
    idx = (labels == label)
    plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
-plt.title('2D t-SNE Visualization of Embeddings')
+plt.title('2D t-SNE Visualization of Embeddings of fine-grained labels')
 plt.xlabel('Component 1')
 plt.ylabel('Component 2')
 plt.legend(title='Group')
 plt.show()
 # %%
 # t-sne plot with pattern labels
 # Re-colors the existing 2-D projection (embeddings_2d) by the
 # 'pattern_labels' column; t-SNE is not re-fit in this cell, so point
 # positions are the same as in the fine-grained plot and only colors differ.
 labels = input_df['pattern_labels']
 # Create a color map from labels to colors
 unique_labels = np.unique(labels)
 # one evenly spaced color from the jet colormap per distinct label
 colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
 label_to_color = dict(zip(unique_labels, colors))
 # Plotting
 plt.figure(figsize=(8, 6))
 for label in unique_labels:
    # boolean mask selecting the rows that carry this label
    idx = (labels == label)
    plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
 plt.title('2D t-SNE Visualization of Embeddings of coarse-grained labels')
 plt.xlabel('Component 1')
 plt.ylabel('Component 2')
 plt.legend(title='Group')
--- a/interpretation/fold_analysis_t5.py
+++ b/interpretation/fold_analysis_t5.py
@ -2,7 +2,7 @@
 # %%
 import pandas as pd
 import os
-from inference import Embedder_t5
+from inference import Inference, Embedder_t5_encoder, Embedder_t5_decoder
 import numpy as np
 from sklearn.manifold import TSNE
 import matplotlib.pyplot as plt
@ -19,6 +19,7 @@ data_path = f"../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
 df = pd.read_csv(data_path, skipinitialspace=True)
 df = df[df['MDM']].reset_index(drop=True)
 # %%
 # get target data
 data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
 train_df = pd.read_csv(data_path, skipinitialspace=True)
@ -45,6 +46,25 @@ def generate_labels(df, mdm_list):
 df['labels'] = generate_labels(df, mdm_list)
 # pattern labels
 patterns = df['pattern'].to_list()
 mdm_pattern_list = sorted(list(set(patterns)))
 def generate_pattern_labels(df, mdm_pattern_list):
    """Map every row's 'pattern' value to its position in mdm_pattern_list.

    *arguments*
    - df: dataframe with a 'pattern' column
    - mdm_pattern_list: list of known pattern values; the position of a
      value in this list is its label

    *returns*
    A list with one integer per row of df, in row order. A row whose
    pattern (formatted as a string) is not in mdm_pattern_list gets -1 and
    an error line is printed for it.
    """
    # Build the lookup once instead of scanning the list for every row.
    # setdefault keeps the first position of a repeated value, which is what
    # list.index would have returned.
    index_by_pattern = {}
    for position, known_pattern in enumerate(mdm_pattern_list):
        index_by_pattern.setdefault(known_pattern, position)

    output_list = []
    # Iterating the column directly avoids building a Series per row.
    for value in df['pattern']:
        index = index_by_pattern.get(f"{value}", -1)
        if index == -1:
            print("Error: value not found in MDM list")
        output_list.append(index)
    return output_list
 df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)
 # rank labels by counts
 top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
@ -52,10 +72,11 @@ indices = df[df['labels'].isin(top_10_labels)].index.to_list()
 input_df = df.iloc[indices].reset_index(drop=True)
 # %%
 def run(step):
    checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
-    embedder = Embedder_t5(checkpoint_path)
+    embedder = Embedder_t5_encoder(checkpoint_path)
    embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
    embedder.create_embedding()
    embeddings = embedder.embeddings
@ -63,12 +84,13 @@ def run(step):
 # %%
 embeddings = (run(step=1200))
 labels = input_df['labels']
 # Reducing dimensions with t-SNE
 tsne = TSNE(n_components=2, random_state=0, perplexity=5)
 embeddings_2d = tsne.fit_transform(embeddings)
 # t-sne plot with complete labels
 labels = input_df['labels']
 # Create a color map from labels to colors
 unique_labels = np.unique(labels)
 colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
@ -80,10 +102,118 @@ for label in unique_labels:
    idx = (labels == label)
    plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
-plt.title('2D t-SNE Visualization of Embeddings')
+plt.title('2D t-SNE Visualization of Embeddings of fine-grained labels')
 plt.xlabel('Component 1')
 plt.ylabel('Component 2')
 plt.legend(title='Group')
 plt.show()
 # %%
 # t-sne plot with pattern labels
 # Re-colors the existing 2-D projection (embeddings_2d) by the
 # 'pattern_labels' column; t-SNE is not re-fit in this cell, so point
 # positions are the same as in the fine-grained plot and only colors differ.
 labels = input_df['pattern_labels']
 # Create a color map from labels to colors
 unique_labels = np.unique(labels)
 # one evenly spaced color from the jet colormap per distinct label
 colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
 label_to_color = dict(zip(unique_labels, colors))
 # Plotting
 plt.figure(figsize=(8, 6))
 for label in unique_labels:
    # boolean mask selecting the rows that carry this label
    idx = (labels == label)
    plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
 plt.title('2D t-SNE Visualization of Embeddings of coarse-grained labels')
 plt.xlabel('Component 1')
 plt.ylabel('Component 2')
 plt.legend(title='Group')
 plt.show()
 ##############################################
 # %%
 # demonstrate decoding to correct output
 # Runs generation with the checkpoint at `step` over input_df and prints the
 # fraction of rows where both the predicted thing and the predicted property
 # equal the ground-truth columns.
 step = 1200
 checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
 infer = Inference(checkpoint_path)
 infer.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
 thing_prediction_list, property_prediction_list = infer.generate()
 # add labels too
 # thing_actual_list, property_actual_list = decode_preds(pred_labels)
 # Convert the list to a Pandas DataFrame
 df_out = pd.DataFrame({
    'p_thing': thing_prediction_list, 
    'p_property': property_prediction_list
 })
 # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
 # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
 # Column-wise concat: predictions are attached to input_df by index.
 # NOTE(review): input_df is rebound here, so re-running this cell appends a
 # second pair of 'p_thing'/'p_property' columns — re-create input_df first.
 input_df = pd.concat([input_df, df_out], axis=1)
 condition_correct_thing = input_df['p_thing'] == input_df['thing']
 condition_correct_property = input_df['p_property'] == input_df['property']
 # a row counts as correct only when both fields match
 prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property)
 pred_correct_proportion = prediction_mdm_correct/len(input_df)
 print(pred_correct_proportion)
 # %%
 input_df[['thing', 'p_thing', 'property', 'p_property']]
 # %%
 # def run(step):
 #     checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
 #     embedder = Embedder_t5_decoder(checkpoint_path)
 #     embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
 #     embedder.create_embedding()
 #     embeddings = embedder.embeddings
 #     return embeddings
 # 
 # # %%
 # embeddings = (run(step=1200))
 # # Reducing dimensions with t-SNE
 # tsne = TSNE(n_components=2, random_state=0, perplexity=5)
 # embeddings_2d = tsne.fit_transform(embeddings)
 # 
 # # t-sne plot with complete labels
 # labels = input_df['labels']
 # 
 # # Create a color map from labels to colors
 # unique_labels = np.unique(labels)
 # colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
 # label_to_color = dict(zip(unique_labels, colors))
 # 
 # # Plotting
 # plt.figure(figsize=(8, 6))
 # for label in unique_labels:
 #     idx = (labels == label)
 #     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
 # 
 # plt.title('2D t-SNE Visualization of Embeddings of fine-grained labels')
 # plt.xlabel('Component 1')
 # plt.ylabel('Component 2')
 # plt.legend(title='Group')
 # plt.show()
 # 
 # # %%
 # # t-sne plot with pattern labels
 # labels = input_df['pattern_labels']
 # 
 # # Create a color map from labels to colors
 # unique_labels = np.unique(labels)
 # colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
 # label_to_color = dict(zip(unique_labels, colors))
 # 
 # # Plotting
 # plt.figure(figsize=(8, 6))
 # for label in unique_labels:
 #     idx = (labels == label)
 #     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
 # 
 # plt.title('2D t-SNE Visualization of Embeddings of coarse-grained labels')
 # plt.xlabel('Component 1')
 # plt.ylabel('Component 2')
 # plt.legend(title='Group')
 # plt.show()
 # %%
--- a/interpretation/inference.py
+++ b/interpretation/inference.py
@ -14,6 +14,7 @@ import numpy as np
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 torch.set_float32_matmul_precision('high')
 class Inference():
    tokenizer: T5TokenizerFast
@ -169,8 +170,136 @@ class Inference():
        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
        return thing_prediction_list, property_prediction_list
 class Embedder_t5_decoder():
    tokenizer: T5TokenizerFast
    model: torch.nn.Module
    dataloader: DataLoader
    embeddings: list
-class Embedder_t5():
+    def __init__(self, checkpoint_path):
        self._create_tokenizer()
        self._load_model(checkpoint_path)
        self.embeddings = []
    def _create_tokenizer(self):
        """Load the "t5-small" fast tokenizer and register the extra marker tokens.

        The result is stored on ``self.tokenizer``.
        """
        marker_tokens = [
            "<THING_START>",
            "<THING_END>",
            "<PROPERTY_START>",
            "<PROPERTY_END>",
            "<NAME>",
            "<DESC>",
            # NOTE(review): these three are registered without angle brackets,
            # while prepare_dataloader wraps the unit as "<UNIT>...<UNIT>".
            # Presumably this mirrors the training-time tokenizer — confirm
            # before changing, since it affects the vocabulary.
            "SIG",
            "UNIT",
            "DATA_TYPE",
        ]
        self.tokenizer = T5TokenizerFast.from_pretrained(
            "t5-small",
            return_tensors="pt",
            clean_up_tokenization_spaces=True,
        )
        self.tokenizer.add_special_tokens({"additional_special_tokens": marker_tokens})
    def _load_model(self, checkpoint_path: str):
        """Load a seq2seq checkpoint, compile it and store it in eval mode.

        *arguments*
        - checkpoint_path: location handed to AutoModelForSeq2SeqLM.from_pretrained
        """
        pretrained = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
        # compile first, then switch to eval; the eval() return value is what is stored
        self.model = torch.compile(pretrained).eval()
    def prepare_dataloader(self, input_df, batch_size, max_length):
        """
        Tokenize the rows of input_df and expose them as self.dataloader.

        *arguments*
        - input_df: input dataframe containing fields 'tag_description', 'unit', 'thing', 'property'
        - batch_size: the batch size of dataloader output
        - max_length: length of tokenizer output
        """
        print("preparing dataloader")

        def _row_to_example(row):
            # one record per dataframe row: tagged source text and tagged target text
            desc = f"<DESC>{row['tag_description']}<DESC>"
            unit = f"<UNIT>{row['unit']}<UNIT>"
            return {
                'input' : f"{desc}{unit}",
                'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
            }

        def _tokenize_batch(batch):
            # passing text_target lets the tokenizer produce the labels together
            # with the inputs, so no separate 'labels' step is needed
            return self.tokenizer(
                batch['input'],
                text_target=batch['output'],
                max_length=max_length,
                return_tensors="pt",
                padding='max_length',
                truncation=True,
            )

        records = [_row_to_example(row) for _, row in input_df.iterrows()]
        raw_dataset = Dataset.from_list(records)

        # map applies the tokenizer batch-wise and drops the raw text columns
        tokenized = raw_dataset.map(
            _tokenize_batch,
            batched=True,
            num_proc=1,
            remove_columns=raw_dataset.column_names,
        )
        tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

        self.dataloader = DataLoader(tokenized, batch_size=batch_size)
    def create_embedding(self):
        """Embed every batch of self.dataloader with the decoder's first-step hidden state.

        For each batch the model is run with a single decoder start token per
        sequence, and the last decoder layer's hidden state at that position
        is collected. The per-batch results are concatenated along dim 0 and
        stored in ``self.embeddings`` as a CPU tensor.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # moving the model does not depend on the batch, so do it once up front
        self.model.to(device)
        start_token_id = self.model.config.decoder_start_token_id

        # Collect into a local list so that calling this method a second time
        # does not try to append to the tensor stored by a previous call.
        batch_embeddings = []
        for batch in tqdm(self.dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # one decoder start token per sequence: shape (batch, 1)
            decoder_input_ids = torch.full(
                (input_ids.size(0), 1),
                start_token_id,
                dtype=torch.long,
                device=device,
            )

            with torch.no_grad():
                outputs = self.model(
                    input_ids=input_ids,
                    # inputs are padded to max_length by prepare_dataloader;
                    # without the mask the encoder would attend to the padding
                    attention_mask=attention_mask,
                    decoder_input_ids=decoder_input_ids,
                    output_hidden_states=True,
                )

            # last decoder layer, last (and only) decoder position
            first_step_hidden = outputs.decoder_hidden_states[-1][:, -1, :]
            batch_embeddings.append(first_step_hidden.to('cpu'))

        self.embeddings = torch.cat(batch_embeddings, dim=0)
 class Embedder_t5_encoder():
    tokenizer: T5TokenizerFast
    model: torch.nn.Module
    dataloader: DataLoader