From 481bcf88b75ac9b0b6854e3aebd75da1aad96e51 Mon Sep 17 00:00:00 2001
From: Richard Wong
Date: Thu, 12 Dec 2024 22:06:26 +0900
Subject: [PATCH] Feat: added embedding plot for coarse and fine-grained labels

---
 interpretation/fold_analysis_bert_complete.py |  53 ++++++-
 interpretation/fold_analysis_bert_pattern.py  |  50 ++++++-
 interpretation/fold_analysis_t5.py            | 140 +++++++++++++++++-
 interpretation/inference.py                   | 131 +++++++++++++++-
 4 files changed, 358 insertions(+), 16 deletions(-)

diff --git a/interpretation/fold_analysis_bert_complete.py b/interpretation/fold_analysis_bert_complete.py
index 29431cb..6360cea 100644
--- a/interpretation/fold_analysis_bert_complete.py
+++ b/interpretation/fold_analysis_bert_complete.py
@@ -45,6 +45,27 @@ def generate_labels(df, mdm_list):
 
 df['labels'] = generate_labels(df, mdm_list)
 
+# pattern labels
+patterns = df['pattern'].to_list()
+mdm_pattern_list = sorted(list(set(patterns)))
+
+def generate_pattern_labels(df, mdm_pattern_list):
+    output_list = []
+    for _, row in df.iterrows():
+        pattern = f"{row['pattern']}"
+        try:
+            index = mdm_pattern_list.index(pattern)
+        except ValueError:
+            print("Error: pattern not found in pattern list")
+            index = -1
+        output_list.append(index)
+
+    return output_list
+
+df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)
+
+
+
 # rank labels by counts
 top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
 
@@ -52,8 +73,6 @@ indices = df[df['labels'].isin(top_10_labels)].index.to_list()
 
 input_df = df.iloc[indices].reset_index(drop=True)
 
-# %%
-input_df
 
 # %%
 def run(step):
@@ -66,12 +85,13 @@ def run(step):
 
 # %%
 embeddings = (run(step=1200))
-labels = input_df['labels']
-
 # Reducing dimensions with t-SNE
 tsne = TSNE(n_components=2, random_state=0, perplexity=5)
 embeddings_2d = tsne.fit_transform(embeddings)
 
+# t-SNE plot with fine-grained (complete) labels
+labels = input_df['labels']
+
 # Create a color map from labels to colors
 unique_labels = np.unique(labels)
 colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
@@ -83,10 +103,33 @@ for label in unique_labels:
     idx = (labels == label)
     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
 
-plt.title('2D t-SNE Visualization of Embeddings')
+plt.title('2D t-SNE Visualization of Embeddings (fine-grained labels)')
 plt.xlabel('Component 1')
 plt.ylabel('Component 2')
 plt.legend(title='Group')
 plt.show()
 
+# %%
+# t-SNE plot with coarse-grained (pattern) labels
+labels = input_df['pattern_labels']
+
+# Create a color map from labels to colors
+unique_labels = np.unique(labels)
+colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
+label_to_color = dict(zip(unique_labels, colors))
+
+# Plotting
+plt.figure(figsize=(8, 6))
+for label in unique_labels:
+    idx = (labels == label)
+    plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
+
+plt.title('2D t-SNE Visualization of Embeddings (coarse-grained labels)')
+plt.xlabel('Component 1')
+plt.ylabel('Component 2')
+plt.legend(title='Group')
+plt.show()
+
+
 # %%
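The two plotting cells added above differ only in the label column and the title, and the same block is repeated in fold_analysis_bert_pattern.py and fold_analysis_t5.py below. A minimal helper along these lines (a sketch with a hypothetical plot_tsne function, not part of this patch; it reuses the variables the scripts already define) would capture the shared logic:

    import numpy as np
    import matplotlib.pyplot as plt

    def plot_tsne(embeddings_2d, labels, title):
        # one scatter call per label so each group gets its own color and legend entry
        unique_labels = np.unique(labels)
        colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
        plt.figure(figsize=(8, 6))
        for label, color in zip(unique_labels, colors):
            idx = (labels == label)
            plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1],
                        color=color, label=label, alpha=0.7)
        plt.title(title)
        plt.xlabel('Component 1')
        plt.ylabel('Component 2')
        plt.legend(title='Group')
        plt.show()

    plot_tsne(embeddings_2d, input_df['labels'],
              '2D t-SNE Visualization of Embeddings (fine-grained labels)')
    plot_tsne(embeddings_2d, input_df['pattern_labels'],
              '2D t-SNE Visualization of Embeddings (coarse-grained labels)')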
diff --git a/interpretation/fold_analysis_bert_pattern.py b/interpretation/fold_analysis_bert_pattern.py
index e15953b..8e3d1f0 100644
--- a/interpretation/fold_analysis_bert_pattern.py
+++ b/interpretation/fold_analysis_bert_pattern.py
@@ -45,6 +45,26 @@ def generate_labels(df, mdm_list):
 
 df['labels'] = generate_labels(df, mdm_list)
 
+# pattern labels
+patterns = df['pattern'].to_list()
+mdm_pattern_list = sorted(list(set(patterns)))
+
+def generate_pattern_labels(df, mdm_pattern_list):
+    output_list = []
+    for _, row in df.iterrows():
+        pattern = f"{row['pattern']}"
+        try:
+            index = mdm_pattern_list.index(pattern)
+        except ValueError:
+            print("Error: pattern not found in pattern list")
+            index = -1
+        output_list.append(index)
+
+    return output_list
+
+df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)
+
+
 # rank labels by counts
 top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
 
@@ -52,8 +72,6 @@ indices = df[df['labels'].isin(top_10_labels)].index.to_list()
 
 input_df = df.iloc[indices].reset_index(drop=True)
 
-# %%
-input_df
 
 # %%
 def run(step):
@@ -66,12 +84,13 @@ def run(step):
 
 # %%
 embeddings = (run(step=1200))
-labels = input_df['labels']
-
 # Reducing dimensions with t-SNE
 tsne = TSNE(n_components=2, random_state=0, perplexity=5)
 embeddings_2d = tsne.fit_transform(embeddings)
 
+# t-SNE plot with fine-grained (complete) labels
+labels = input_df['labels']
+
 # Create a color map from labels to colors
 unique_labels = np.unique(labels)
 colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
@@ -83,7 +102,28 @@ for label in unique_labels:
     idx = (labels == label)
     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
 
-plt.title('2D t-SNE Visualization of Embeddings')
+plt.title('2D t-SNE Visualization of Embeddings (fine-grained labels)')
+plt.xlabel('Component 1')
+plt.ylabel('Component 2')
+plt.legend(title='Group')
+plt.show()
+
+# %%
+# t-SNE plot with coarse-grained (pattern) labels
+labels = input_df['pattern_labels']
+
+# Create a color map from labels to colors
+unique_labels = np.unique(labels)
+colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
+label_to_color = dict(zip(unique_labels, colors))
+
+# Plotting
+plt.figure(figsize=(8, 6))
+for label in unique_labels:
+    idx = (labels == label)
+    plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
+
+plt.title('2D t-SNE Visualization of Embeddings (coarse-grained labels)')
 plt.xlabel('Component 1')
 plt.ylabel('Component 2')
 plt.legend(title='Group')
 plt.show()
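generate_pattern_labels resolves every row with a linear list.index lookup, so labeling costs O(rows x patterns). A vectorized sketch of the same mapping (not part of the patch; it assumes df and mdm_pattern_list as defined above, and silently maps missing patterns to -1 instead of printing an error):

    # build the pattern -> index mapping once, then apply it in a single pass
    pattern_to_index = {p: i for i, p in enumerate(mdm_pattern_list)}
    df['pattern_labels'] = df['pattern'].map(pattern_to_index).fillna(-1).astype(int)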
diff --git a/interpretation/fold_analysis_t5.py b/interpretation/fold_analysis_t5.py
index 40dad52..9144972 100644
--- a/interpretation/fold_analysis_t5.py
+++ b/interpretation/fold_analysis_t5.py
@@ -2,7 +2,7 @@
 # %%
 import pandas as pd
 import os
-from inference import Embedder_t5
+from inference import Inference, Embedder_t5_encoder, Embedder_t5_decoder
 import numpy as np
 from sklearn.manifold import TSNE
 import matplotlib.pyplot as plt
@@ -19,6 +19,7 @@ data_path = f"../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
 df = pd.read_csv(data_path, skipinitialspace=True)
 df = df[df['MDM']].reset_index(drop=True)
 
+# %%
 # get target data
 data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
 train_df = pd.read_csv(data_path, skipinitialspace=True)
@@ -45,6 +46,25 @@ def generate_labels(df, mdm_list):
 
 df['labels'] = generate_labels(df, mdm_list)
 
+# pattern labels
+patterns = df['pattern'].to_list()
+mdm_pattern_list = sorted(list(set(patterns)))
+
+def generate_pattern_labels(df, mdm_pattern_list):
+    output_list = []
+    for _, row in df.iterrows():
+        pattern = f"{row['pattern']}"
+        try:
+            index = mdm_pattern_list.index(pattern)
+        except ValueError:
+            print("Error: pattern not found in pattern list")
+            index = -1
+        output_list.append(index)
+
+    return output_list
+
+df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)
+
 # rank labels by counts
 top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
 
@@ -52,10 +72,11 @@ indices = df[df['labels'].isin(top_10_labels)].index.to_list()
 
 input_df = df.iloc[indices].reset_index(drop=True)
 
+
 # %%
 def run(step):
     checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
-    embedder = Embedder_t5(checkpoint_path)
+    embedder = Embedder_t5_encoder(checkpoint_path)
     embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
     embedder.create_embedding()
     embeddings = embedder.embeddings
@@ -63,12 +84,13 @@ def run(step):
 
 # %%
 embeddings = (run(step=1200))
-labels = input_df['labels']
-
 # Reducing dimensions with t-SNE
 tsne = TSNE(n_components=2, random_state=0, perplexity=5)
 embeddings_2d = tsne.fit_transform(embeddings)
 
+# t-SNE plot with fine-grained (complete) labels
+labels = input_df['labels']
+
 # Create a color map from labels to colors
 unique_labels = np.unique(labels)
 colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
@@ -80,10 +102,118 @@ for label in unique_labels:
     idx = (labels == label)
     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
 
-plt.title('2D t-SNE Visualization of Embeddings')
+plt.title('2D t-SNE Visualization of Embeddings (fine-grained labels)')
 plt.xlabel('Component 1')
 plt.ylabel('Component 2')
 plt.legend(title='Group')
 plt.show()
 
+# %%
+# t-SNE plot with coarse-grained (pattern) labels
+labels = input_df['pattern_labels']
+
+# Create a color map from labels to colors
+unique_labels = np.unique(labels)
+colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
+label_to_color = dict(zip(unique_labels, colors))
+
+# Plotting
+plt.figure(figsize=(8, 6))
+for label in unique_labels:
+    idx = (labels == label)
+    plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
+
+plt.title('2D t-SNE Visualization of Embeddings (coarse-grained labels)')
+plt.xlabel('Component 1')
+plt.ylabel('Component 2')
+plt.legend(title='Group')
+plt.show()
+
+##############################################
+# %%
+# demonstrate decoding to correct output
+step = 1200
+checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
+infer = Inference(checkpoint_path)
+infer.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
+thing_prediction_list, property_prediction_list = infer.generate()
+
+# add labels too
+# thing_actual_list, property_actual_list = decode_preds(pred_labels)
+# Convert the list to a Pandas DataFrame
+df_out = pd.DataFrame({
+    'p_thing': thing_prediction_list,
+    'p_property': property_prediction_list
+})
+# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
+input_df = pd.concat([input_df, df_out], axis=1)
+
+condition_correct_thing = input_df['p_thing'] == input_df['thing']
+condition_correct_property = input_df['p_property'] == input_df['property']
+prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property)
+pred_correct_proportion = prediction_mdm_correct/len(input_df)
+print(pred_correct_proportion)
+
+# %%
+input_df[['thing', 'p_thing', 'property', 'p_property']]
+
+
+# %%
+# def run(step):
+#     checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
+#     embedder = Embedder_t5_decoder(checkpoint_path)
+#     embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
+#     embedder.create_embedding()
+#     embeddings = embedder.embeddings
+#     return embeddings
+#
+# # %%
+# embeddings = (run(step=1200))
+# # Reducing dimensions with t-SNE
+# tsne = TSNE(n_components=2, random_state=0, perplexity=5)
+# embeddings_2d = tsne.fit_transform(embeddings)
+#
+# # t-SNE plot with fine-grained (complete) labels
+# labels = input_df['labels']
+#
+# # Create a color map from labels to colors
+# unique_labels = np.unique(labels)
+# colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
+# label_to_color = dict(zip(unique_labels, colors))
+#
+# # Plotting
+# plt.figure(figsize=(8, 6))
+# for label in unique_labels:
+#     idx = (labels == label)
+#     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
+#
+# plt.title('2D t-SNE Visualization of Embeddings (fine-grained labels)')
+# plt.xlabel('Component 1')
+# plt.ylabel('Component 2')
+# plt.legend(title='Group')
+# plt.show()
+#
+# # %%
+# # t-SNE plot with coarse-grained (pattern) labels
+# labels = input_df['pattern_labels']
+#
+# # Create a color map from labels to colors
+# unique_labels = np.unique(labels)
+# colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
+# label_to_color = dict(zip(unique_labels, colors))
+#
+# # Plotting
+# plt.figure(figsize=(8, 6))
+# for label in unique_labels:
+#     idx = (labels == label)
+#     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
+#
+# plt.title('2D t-SNE Visualization of Embeddings (coarse-grained labels)')
+# plt.xlabel('Component 1')
+# plt.ylabel('Component 2')
+# plt.legend(title='Group')
+# plt.show()
+
+
 # %%
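Once input_df carries the predictions alongside the coarse-grained pattern column, the overall exact-match proportion computed above can also be broken down per pattern; a sketch (not part of the patch, reusing the columns the demo section just created):

    # exact match requires both the thing and the property to be predicted correctly
    correct = ((input_df['p_thing'] == input_df['thing'])
               & (input_df['p_property'] == input_df['property']))
    # mean of a boolean series per pattern group = per-pattern accuracy
    print(correct.groupby(input_df['pattern']).mean().sort_values())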
diff --git a/interpretation/inference.py b/interpretation/inference.py
index 3983a22..d0cb340 100644
--- a/interpretation/inference.py
+++ b/interpretation/inference.py
@@ -14,6 +14,7 @@ import numpy as np
 
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 
+torch.set_float32_matmul_precision('high')
 
 class Inference():
     tokenizer: T5TokenizerFast
@@ -169,8 +170,136 @@ class Inference():
         thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
         return thing_prediction_list, property_prediction_list
 
+class Embedder_t5_decoder():
+    tokenizer: T5TokenizerFast
+    model: torch.nn.Module
+    dataloader: DataLoader
+    embeddings: list
 
-class Embedder_t5():
+    def __init__(self, checkpoint_path):
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+        self.embeddings = []
+
+    def _create_tokenizer(self):
+        # load tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # Define additional special tokens
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<DESC>", "<UNIT>", "SIG", "UNIT", "DATA_TYPE"]
+        # Add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        # load model
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # set model to eval
+        self.model = model.eval()
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output
+        """
+        print("preparing dataloader")
+        # convert each dataframe row into a dictionary
+        # outputs a list of dictionaries
+        def _process_df(df):
+            output_list = []
+            for _, row in df.iterrows():
+                desc = f"<DESC>{row['tag_description']}<DESC>"
+                unit = f"<UNIT>{row['unit']}<UNIT>"
f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + padding='max_length', + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, + num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def create_embedding(self): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + pred_labels = [] + + for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + # Manually create the decoder input (start token) + decoder_input_ids = self.tokenizer("", return_tensors="pt").input_ids + decoder_input_ids = torch.full((input_ids.size(0), len(decoder_input_ids)), self.model.config.decoder_start_token_id, dtype=torch.long).to(input_ids.device) + + attention_mask = attention_mask.to(device) + self.model.to(device) + + # Perform inference + with torch.no_grad(): + # encoder_outputs = self.model.encoder( + # input_ids, + # attention_mask=attention_mask) + # # Use the hidden state of the first token as the sequence representation + # # pooled_output = encoder_outputs.last_hidden_state[:, 0, :] # Shape: (batch_size, hidden_size) + # outputs = self.model.decoder( + # input_ids=decoder_input_ids, + # encoder_hidden_states=encoder_outputs.last_hidden_state) + outputs = self.model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + first_token_logits = outputs.decoder_hidden_states[-1][:,-1,:] + self.embeddings.append(first_token_logits.to('cpu')) + + + self.embeddings = torch.cat(self.embeddings, dim=0) + + + +class Embedder_t5_encoder(): tokenizer: T5TokenizerFast model: torch.nn.Module dataloader: DataLoader