From c64e4bccfc356fd68516eb2f7ab3c9d5c4412f75 Mon Sep 17 00:00:00 2001 From: Richard Wong Date: Thu, 12 Dec 2024 16:13:47 +0900 Subject: [PATCH] Feat: added embedding plots viewer for different models --- interpretation/.gitignore | 1 + .../.gitignore | 2 + .../train.py | 237 ++++++++++ .../.gitignore | 2 + .../train.py | 228 ++++++++++ interpretation/fold_analysis_bert_complete.py | 92 ++++ interpretation/fold_analysis_bert_multiple.py | 133 ++++++ interpretation/fold_analysis_bert_pattern.py | 92 ++++ interpretation/fold_analysis_t5.py | 89 ++++ interpretation/inference.py | 407 ++++++++++++++++++ .../mapping_t5_complete_desc_unit/.gitignore | 2 + .../mapping_t5_complete_desc_unit/train.py | 216 ++++++++++ 12 files changed, 1501 insertions(+) create mode 100644 interpretation/.gitignore create mode 100644 interpretation/classification_bert_complete_desc_unit/.gitignore create mode 100644 interpretation/classification_bert_complete_desc_unit/train.py create mode 100644 interpretation/classification_bert_pattern_desc_unit/.gitignore create mode 100644 interpretation/classification_bert_pattern_desc_unit/train.py create mode 100644 interpretation/fold_analysis_bert_complete.py create mode 100644 interpretation/fold_analysis_bert_multiple.py create mode 100644 interpretation/fold_analysis_bert_pattern.py create mode 100644 interpretation/fold_analysis_t5.py create mode 100644 interpretation/inference.py create mode 100644 interpretation/mapping_t5_complete_desc_unit/.gitignore create mode 100644 interpretation/mapping_t5_complete_desc_unit/train.py diff --git a/interpretation/.gitignore b/interpretation/.gitignore new file mode 100644 index 0000000..264daca --- /dev/null +++ b/interpretation/.gitignore @@ -0,0 +1 @@ +*__pycache__ \ No newline at end of file diff --git a/interpretation/classification_bert_complete_desc_unit/.gitignore b/interpretation/classification_bert_complete_desc_unit/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ 
b/interpretation/classification_bert_complete_desc_unit/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/interpretation/classification_bert_complete_desc_unit/train.py b/interpretation/classification_bert_complete_desc_unit/train.py new file mode 100644 index 0000000..c61db8d --- /dev/null +++ b/interpretation/classification_bert_complete_desc_unit/train.py @@ -0,0 +1,237 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments, + TrainerCallback +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% +class SaveModelCallback(TrainerCallback): + """Custom callback to save model weights at specific intervals during training.""" + def __init__(self, save_interval): + super().__init__() + self.save_interval = save_interval # save every 'save_interval' steps + + def on_step_end(self, args, state, control, **kwargs): + """This method is called at the end of each training step.""" + # Check if it's time to save (based on global_step and save_interval) + if state.global_step % self.save_interval == 0 and state.global_step > 0: + # Path where the model should be saved + output_dir = f"{args.output_dir}/checkpoint_{state.global_step}" + model = kwargs['model'] + model.save_pretrained(output_dir) + print(f"Model saved to {output_dir} at step {state.global_step}") + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +# 
rather than use pattern, we use the real thing and property +# mdm_list = sorted(list((set(full_df['pattern'])))) +thing_property = full_df['thing'] + full_df['property'] +thing_property = thing_property.to_list() +mdm_list = sorted(list(set(thing_property))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(mdm_list): + id2label[idx] = val + label2id[val] = idx + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + pattern = f"{row['thing'] + row['property']}" + try: + index = mdm_list.index(pattern) + except ValueError: + print("Error: value not found in MDM list") + index = -1 + element = { + 'text' : f"{desc}{unit}", + 'label': index, + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold, mdm_list): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)), + }) + return combined_data + + +# %% + +# function to perform training for a given fold +def train(fold): + + save_path = 'checkpoint' + split_datasets = create_split_dataset(fold, mdm_list) + + # prepare tokenizer + + # model_checkpoint = "distilbert/distilbert-base-uncased" + model_checkpoint = 'google-bert/bert-base-cased' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define 
additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + truncation=True, + padding=True + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(mdm_list), + id2label=id2label, + label2id=label2id) + # important! 
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="no", + save_strategy="no", + load_best_model_at_end=False, + learning_rate=1e-5, + per_device_train_batch_size=128, + per_device_eval_batch_size=128, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + max_steps=1201, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + callbacks=[SaveModelCallback(save_interval=200)] + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1]: + print(fold) + train(fold) + + +# %% diff --git a/interpretation/classification_bert_pattern_desc_unit/.gitignore b/interpretation/classification_bert_pattern_desc_unit/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/interpretation/classification_bert_pattern_desc_unit/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/interpretation/classification_bert_pattern_desc_unit/train.py b/interpretation/classification_bert_pattern_desc_unit/train.py new file mode 100644 index 0000000..16327cf --- /dev/null +++ b/interpretation/classification_bert_pattern_desc_unit/train.py @@ -0,0 +1,228 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' 
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments, + TrainerCallback +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +class SaveModelCallback(TrainerCallback): + """Custom callback to save model weights at specific intervals during training.""" + def __init__(self, save_interval): + super().__init__() + self.save_interval = save_interval # save every 'save_interval' steps + + def on_step_end(self, args, state, control, **kwargs): + """This method is called at the end of each training step.""" + # Check if it's time to save (based on global_step and save_interval) + if state.global_step % self.save_interval == 0 and state.global_step > 0: + # Path where the model should be saved + output_dir = f"{args.output_dir}/checkpoint_{state.global_step}" + model = kwargs['model'] + model.save_pretrained(output_dir) + print(f"Model saved to {output_dir} at step {state.global_step}") + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +mdm_list = sorted(list((set(full_df['pattern'])))) + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(mdm_list): + id2label[idx] = val + label2id[val] = idx + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + + pattern = row['pattern'] + try: + index 
= mdm_list.index(pattern) + except ValueError: + index = -1 + element = { + 'text' : f"{desc}{unit}", + 'label': index, + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold, mdm_list): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)), + }) + return combined_data + + +# %% + +# function to perform training for a given fold +def train(fold): + + save_path = f'checkpoint' + split_datasets = create_split_dataset(fold, mdm_list) + + # prepare tokenizer + + model_checkpoint = 'google-bert/bert-base-cased' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + truncation=True, + padding=True + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + # 
tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(mdm_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="no", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-5, + per_device_train_batch_size=128, + per_device_eval_batch_size=128, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + max_steps=1200, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + callbacks=[SaveModelCallback(save_interval=200)] + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1]: + print(fold) + train(fold) + + +# %% diff --git a/interpretation/fold_analysis_bert_complete.py b/interpretation/fold_analysis_bert_complete.py new file mode 100644 index 0000000..29431cb --- /dev/null 
+++ b/interpretation/fold_analysis_bert_complete.py @@ -0,0 +1,92 @@ +# this code tries to analyze the embeddings of the encoder +# %% +import pandas as pd +import os +from inference import Embedder_bert +import numpy as np +from sklearn.manifold import TSNE +import matplotlib.pyplot as plt + + +checkpoint_directory = 'classification_bert_complete_desc_unit/checkpoint' + +BATCH_SIZE = 512 + +fold = 1 +print(f"Inference for fold {fold}") +# import test data +data_path = f"../data_preprocess/exports/dataset/group_{fold}/test_all.csv" +df = pd.read_csv(data_path, skipinitialspace=True) +df = df[df['MDM']].reset_index(drop=True) + +# get target data +data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv" +train_df = pd.read_csv(data_path, skipinitialspace=True) +# processing to help with selection later +train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] + +# assign labels +df['thing_property'] = df['thing'] + " " + df['property'] +thing_property = df['thing_property'].to_list() +mdm_list = sorted(list(set(thing_property))) + +def generate_labels(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + pattern = f"{row['thing_property']}" + try: + index = mdm_list.index(pattern) + except ValueError: + print("Error: value not found in MDM list") + index = -1 + output_list.append(index) + + return output_list + +df['labels'] = generate_labels(df, mdm_list) + +# rank labels by counts +top_10_labels = df['labels'].value_counts()[0:10].index.to_list() + +indices = df[df['labels'].isin(top_10_labels)].index.to_list() + +input_df = df.iloc[indices].reset_index(drop=True) + +# %% +input_df + +# %% +def run(step): + checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}') + embedder = Embedder_bert(checkpoint_path) + embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128) + embedder.create_embedding() + embeddings = embedder.embeddings + return embeddings + +# %% +embeddings = 
(run(step=1200)) +labels = input_df['labels'] + +# Reducing dimensions with t-SNE +tsne = TSNE(n_components=2, random_state=0, perplexity=5) +embeddings_2d = tsne.fit_transform(embeddings) + +# Create a color map from labels to colors +unique_labels = np.unique(labels) +colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels))) +label_to_color = dict(zip(unique_labels, colors)) + +# Plotting +plt.figure(figsize=(8, 6)) +for label in unique_labels: + idx = (labels == label) + plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7) + +plt.title('2D t-SNE Visualization of Embeddings') +plt.xlabel('Component 1') +plt.ylabel('Component 2') +plt.legend(title='Group') +plt.show() + +# %% diff --git a/interpretation/fold_analysis_bert_multiple.py b/interpretation/fold_analysis_bert_multiple.py new file mode 100644 index 0000000..f44427f --- /dev/null +++ b/interpretation/fold_analysis_bert_multiple.py @@ -0,0 +1,133 @@ +# this code tries to analyze the embeddings of the encoder +# %% +import pandas as pd +import os +import glob +from inference import Embedder_bert +import numpy as np +from sklearn.manifold import TSNE +import matplotlib.pyplot as plt +import torch +from sklearn.preprocessing import StandardScaler + + +checkpoint_directory = 'classification_bert_complete_desc_unit/checkpoint' + +BATCH_SIZE = 512 + +fold = 1 +print(f"Inference for fold {fold}") +# import test data +data_path = f"../data_preprocess/exports/dataset/group_{fold}/test_all.csv" +df = pd.read_csv(data_path, skipinitialspace=True) +df = df[df['MDM']].reset_index(drop=True) + +# get target data +data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv" +train_df = pd.read_csv(data_path, skipinitialspace=True) +# processing to help with selection later +train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] + +# assign labels +df['thing_property'] = df['thing'] + " " + df['property'] +thing_property = 
df['thing_property'].to_list() +mdm_list = sorted(list(set(thing_property))) + +def generate_labels(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + pattern = f"{row['thing_property']}" + try: + index = mdm_list.index(pattern) + except ValueError: + print("Error: value not found in MDM list") + index = -1 + output_list.append(index) + + return output_list + +df['labels'] = generate_labels(df, mdm_list) + +# rank labels by counts +top_1_labels = df['labels'].value_counts()[0:10].index.to_list() + +# indices = df[df['labels'].isin(top_1_labels)].index.to_list() +indices = df[df['labels'] == 56].index.to_list() + +input_df = df.iloc[indices].reset_index(drop=True) +# indices_2 = df[df['labels'] == 381].index.to_list() +# indices.extend(indices_2) + +# %% +input_df + +# %% +def run(step): + # run inference + # checkpoint + # Use glob to find matching paths + checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint-{step}') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + + + embedder = Embedder_bert(checkpoint_path) + embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128) + embedder.create_embedding() + embeddings = embedder.embeddings + + + # Example embeddings array + size = len(embeddings) + labels = [f'{step}' for i in range(size)] + return embeddings, labels + +# %% +embeddings = [] +labels = [] +for step in [200, 400, 600, 800]: + embeds, lbs = (run(step)) + embeddings.append(embeds) + labels.extend(lbs) + + +# %% +labels = np.array(labels) +embeddings = torch.cat(embeddings, dim=0) + + + +# %% +# Reducing dimensions with t-SNE +tsne = TSNE(n_components=2, random_state=0, perplexity=5) +embeddings_2d = tsne.fit_transform(embeddings) + +# plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5) +# plt.xlim([embeddings_2d[:, 0].min() - 1, embeddings_2d[:, 0].max() + 1]) +# plt.ylim([embeddings_2d[:, 1].min() - 
1, embeddings_2d[:, 1].max() + 1]) +# plt.show() + +# %% +# Create a color map from labels to colors +unique_labels = np.unique(labels) +colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels))) +label_to_color = dict(zip(unique_labels, colors)) + + + +# Plotting +plt.figure(figsize=(8, 6)) +for label in unique_labels: + idx = (labels == label) + plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7) + +plt.title('2D t-SNE Visualization of Embeddings') +plt.xlabel('Component 1') +plt.ylabel('Component 2') +# plt.xlim([embeddings_2d[:, 0].min() - 1, embeddings_2d[:, 0].max() + 1]) +# plt.ylim([embeddings_2d[:, 1].min() - 1, embeddings_2d[:, 1].max() + 1]) +plt.legend(title='Group') +plt.show() + +# %% diff --git a/interpretation/fold_analysis_bert_pattern.py b/interpretation/fold_analysis_bert_pattern.py new file mode 100644 index 0000000..e15953b --- /dev/null +++ b/interpretation/fold_analysis_bert_pattern.py @@ -0,0 +1,92 @@ +# this code tries to analyze the embeddings of the encoder +# %% +import pandas as pd +import os +from inference import Embedder_bert +import numpy as np +from sklearn.manifold import TSNE +import matplotlib.pyplot as plt + + +checkpoint_directory = 'classification_bert_pattern_desc_unit/checkpoint' + +BATCH_SIZE = 512 + +fold = 1 +print(f"Inference for fold {fold}") +# import test data +data_path = f"../data_preprocess/exports/dataset/group_{fold}/test_all.csv" +df = pd.read_csv(data_path, skipinitialspace=True) +df = df[df['MDM']].reset_index(drop=True) + +# get target data +data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv" +train_df = pd.read_csv(data_path, skipinitialspace=True) +# processing to help with selection later +train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] + +# assign labels +df['thing_property'] = df['thing'] + " " + df['property'] +thing_property = df['thing_property'].to_list() +mdm_list = 
sorted(list(set(thing_property))) + +def generate_labels(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + pattern = f"{row['thing_property']}" + try: + index = mdm_list.index(pattern) + except ValueError: + print("Error: value not found in MDM list") + index = -1 + output_list.append(index) + + return output_list + +df['labels'] = generate_labels(df, mdm_list) + +# rank labels by counts +top_10_labels = df['labels'].value_counts()[0:10].index.to_list() + +indices = df[df['labels'].isin(top_10_labels)].index.to_list() + +input_df = df.iloc[indices].reset_index(drop=True) + +# %% +input_df + +# %% +def run(step): + checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}') + embedder = Embedder_bert(checkpoint_path) + embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128) + embedder.create_embedding() + embeddings = embedder.embeddings + return embeddings + +# %% +embeddings = (run(step=1200)) +labels = input_df['labels'] + +# Reducing dimensions with t-SNE +tsne = TSNE(n_components=2, random_state=0, perplexity=5) +embeddings_2d = tsne.fit_transform(embeddings) + +# Create a color map from labels to colors +unique_labels = np.unique(labels) +colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels))) +label_to_color = dict(zip(unique_labels, colors)) + +# Plotting +plt.figure(figsize=(8, 6)) +for label in unique_labels: + idx = (labels == label) + plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7) + +plt.title('2D t-SNE Visualization of Embeddings') +plt.xlabel('Component 1') +plt.ylabel('Component 2') +plt.legend(title='Group') +plt.show() + +# %% diff --git a/interpretation/fold_analysis_t5.py b/interpretation/fold_analysis_t5.py new file mode 100644 index 0000000..40dad52 --- /dev/null +++ b/interpretation/fold_analysis_t5.py @@ -0,0 +1,89 @@ +# this code tries to analyze the embeddings of the encoder +# %% +import pandas as pd +import os +from 
inference import Embedder_t5 +import numpy as np +from sklearn.manifold import TSNE +import matplotlib.pyplot as plt + + +checkpoint_directory = 'mapping_t5_complete_desc_unit/checkpoint' + +BATCH_SIZE = 512 + +fold = 1 +print(f"Inference for fold {fold}") +# import test data +data_path = f"../data_preprocess/exports/dataset/group_{fold}/test_all.csv" +df = pd.read_csv(data_path, skipinitialspace=True) +df = df[df['MDM']].reset_index(drop=True) + +# get target data +data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv" +train_df = pd.read_csv(data_path, skipinitialspace=True) +# processing to help with selection later +train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] + +# assign labels +df['thing_property'] = df['thing'] + " " + df['property'] +thing_property = df['thing_property'].to_list() +mdm_list = sorted(list(set(thing_property))) + +def generate_labels(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + pattern = f"{row['thing_property']}" + try: + index = mdm_list.index(pattern) + except ValueError: + print("Error: value not found in MDM list") + index = -1 + output_list.append(index) + + return output_list + +df['labels'] = generate_labels(df, mdm_list) + +# rank labels by counts +top_10_labels = df['labels'].value_counts()[0:10].index.to_list() + +indices = df[df['labels'].isin(top_10_labels)].index.to_list() + +input_df = df.iloc[indices].reset_index(drop=True) + +# %% +def run(step): + checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}') + embedder = Embedder_t5(checkpoint_path) + embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128) + embedder.create_embedding() + embeddings = embedder.embeddings + return embeddings + +# %% +embeddings = (run(step=1200)) +labels = input_df['labels'] + +# Reducing dimensions with t-SNE +tsne = TSNE(n_components=2, random_state=0, perplexity=5) +embeddings_2d = tsne.fit_transform(embeddings) + +# Create a color 
map from labels to colors +unique_labels = np.unique(labels) +colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels))) +label_to_color = dict(zip(unique_labels, colors)) + +# Plotting +plt.figure(figsize=(8, 6)) +for label in unique_labels: + idx = (labels == label) + plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7) + +plt.title('2D t-SNE Visualization of Embeddings') +plt.xlabel('Component 1') +plt.ylabel('Component 2') +plt.legend(title='Group') +plt.show() + +# %% diff --git a/interpretation/inference.py b/interpretation/inference.py new file mode 100644 index 0000000..3983a22 --- /dev/null +++ b/interpretation/inference.py @@ -0,0 +1,407 @@ +import torch +from torch.utils.data import DataLoader +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, + AutoTokenizer, + AutoModelForSequenceClassification, + +) +import os +from tqdm import tqdm +from datasets import Dataset +import numpy as np + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +class Inference(): + tokenizer: T5TokenizerFast + model: torch.nn.Module + dataloader: DataLoader + + def __init__(self, checkpoint_path): + self._create_tokenizer() + self._load_model(checkpoint_path) + + + def _create_tokenizer(self): + # %% + # load tokenizer + self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"] + # Add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + def _load_model(self, checkpoint_path: str): + # load model + # Define the directory and the pattern + model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) + model = torch.compile(model) + # set model to eval + self.model = model.eval() + + + + + def prepare_dataloader(self, input_df, 
batch_size, max_length): + """ + *arguments* + - input_df: input dataframe containing fields 'tag_description', 'thing', 'property' + - batch_size: the batch size of dataloader output + - max_length: length of tokenizer output + """ + print("preparing dataloader") + # convert each dataframe row into a dictionary + # outputs a list of dictionaries + + def _process_df(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = { + 'input' : f"{desc}{unit}", + 'output': f"{row['thing']}{row['property']}", + } + output_list.append(element) + + return output_list + + def _preprocess_function(example): + input = example['input'] + target = example['output'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = self.tokenizer( + input, + text_target=target, + max_length=max_length, + return_tensors="pt", + padding='max_length', + truncation=True, + ) + return model_inputs + + test_dataset = Dataset.from_list(_process_df(input_df)) + + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + _preprocess_function, + batched=True, + num_proc=1, + remove_columns=test_dataset.column_names, + ) + # datasets = _preprocess_function(test_dataset) + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) + + # create dataloader + self.dataloader = DataLoader(datasets, batch_size=batch_size) + + + def generate(self): + device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu') + MAX_GENERATE_LENGTH = 128 + + pred_generations = [] + pred_labels = [] + + print("start generation") + for batch in tqdm(self.dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + pred_labels.extend(batch['labels']) + + + # Move to GPU if available + input_ids = input_ids.to(device) 
class Embedder_t5():
    """
    Produce one embedding vector per input row from a T5 checkpoint.

    The embedding is the encoder's last hidden state at the first token
    position. Typical use: construct, call `prepare_dataloader`, call
    `create_embedding`, then read `self.embeddings`.
    """
    tokenizer: "T5TokenizerFast"
    model: torch.nn.Module
    dataloader: "DataLoader"
    # empty list after __init__; a tensor once create_embedding() has run
    embeddings: "list | torch.Tensor"

    def __init__(self, checkpoint_path):
        """
        *arguments*
        - checkpoint_path: path passed to AutoModelForSeq2SeqLM.from_pretrained
        """
        self._create_tokenizer()
        self._load_model(checkpoint_path)
        self.embeddings = []

    def _create_tokenizer(self):
        """Load the t5-small tokenizer and register the extra special tokens."""
        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
        # NOTE(review): the first six entries are empty strings in this copy
        # of the file; they look like marker tokens lost in extraction -
        # confirm against the training script before relying on them.
        additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"]
        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    def _load_model(self, checkpoint_path: str):
        """Load the seq2seq checkpoint, wrap it with torch.compile, set eval mode."""
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
        model = torch.compile(model)
        self.model = model.eval()

    @staticmethod
    def _rows_to_examples(df):
        """Turn each dataframe row into an {'input', 'output'} dictionary."""
        # NOTE(review): description/unit and thing/property are concatenated
        # with no separator here; delimiter tokens may have been stripped
        # from this copy of the file - confirm.
        columns = ['tag_description', 'unit', 'thing', 'property']
        return [
            {
                'input' : f"{desc}{unit}",
                'output': f"{thing}{prop}",
            }
            for desc, unit, thing, prop in df[columns].itertuples(index=False, name=None)
        ]

    def prepare_dataloader(self, input_df, batch_size, max_length):
        """
        *arguments*
        - input_df: input dataframe containing fields 'tag_description',
          'unit', 'thing', 'property'
        - batch_size: the batch size of dataloader output
        - max_length: length of tokenizer output
        """
        print("preparing dataloader")

        def _preprocess_function(example):
            source_text = example['input']
            target_text = example['output']
            # text_target makes the tokenizer emit 'labels' alongside the
            # inputs, so no separate label tokenization is needed
            return self.tokenizer(
                source_text,
                text_target=target_text,
                max_length=max_length,
                return_tensors="pt",
                padding='max_length',
                truncation=True,
            )

        test_dataset = Dataset.from_list(self._rows_to_examples(input_df))

        # batched map: the function receives a batch of rows at a time and
        # the raw text columns are dropped from the result
        datasets = test_dataset.map(
            _preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=test_dataset.column_names,
        )
        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

        # create dataloader
        self.dataloader = DataLoader(datasets, batch_size=batch_size)

    def create_embedding(self):
        """
        Encode every batch of `self.dataloader` and store the concatenated
        first-token hidden states in `self.embeddings`.

        Results are accumulated in a local list, so the method can be called
        again (e.g. after a new `prepare_dataloader`) and simply replaces
        `self.embeddings`.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # loop-invariant: move the model once instead of once per batch
        self.model.to(device)

        batch_embeddings = []

        print("start generation")
        for batch in tqdm(self.dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            with torch.no_grad():
                encoder_outputs = self.model.encoder(input_ids, attention_mask=attention_mask)
                # Use the hidden state of the first token as the sequence representation
                pooled_output = encoder_outputs.last_hidden_state[:, 0, :]
            batch_embeddings.append(pooled_output.to('cpu'))

        self.embeddings = torch.cat(batch_embeddings, dim=0)
class Embedder_bert():
    """
    Produce one embedding vector per input row from a BERT classification
    checkpoint.

    The embedding is the last hidden state of the `bert` sub-model at the
    first token position. Typical use: construct, call `prepare_dataloader`,
    call `create_embedding`, then read `self.embeddings`.
    """
    tokenizer: "AutoTokenizer"
    model: torch.nn.Module
    dataloader: "DataLoader"
    # empty list after __init__; a tensor once create_embedding() has run
    embeddings: "list | torch.Tensor"

    def __init__(self, checkpoint_path):
        """
        *arguments*
        - checkpoint_path: path passed to
          AutoModelForSequenceClassification.from_pretrained
        """
        self._create_tokenizer()
        self._load_model(checkpoint_path)
        self.embeddings = []

    def _create_tokenizer(self):
        """Load the bert-base-cased tokenizer and register the extra special tokens."""
        self.tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-cased', return_tensors="pt", clean_up_tokenization_spaces=True)
        # NOTE(review): the first six entries are empty strings in this copy
        # of the file; they look like marker tokens lost in extraction -
        # confirm against the training script before relying on them.
        additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"]
        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    def _load_model(self, checkpoint_path: str):
        """Load the classification checkpoint, wrap it with torch.compile, set eval mode."""
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
        model = torch.compile(model)
        self.model = model.eval()

    @staticmethod
    def _rows_to_examples(df):
        """Turn each dataframe row into an {'input', 'output'} dictionary."""
        # NOTE(review): description/unit and thing/property are concatenated
        # with no separator here; delimiter tokens may have been stripped
        # from this copy of the file - confirm.
        columns = ['tag_description', 'unit', 'thing', 'property']
        return [
            {
                'input' : f"{desc}{unit}",
                'output': f"{thing}{prop}",
            }
            for desc, unit, thing, prop in df[columns].itertuples(index=False, name=None)
        ]

    def prepare_dataloader(self, input_df, batch_size, max_length):
        """
        *arguments*
        - input_df: input dataframe containing fields 'tag_description',
          'unit', 'thing', 'property'
        - batch_size: the batch size of dataloader output
        - max_length: length of tokenizer output
        """
        print("preparing dataloader")

        def _preprocess_function(example):
            source_text = example['input']
            target_text = example['output']
            # text_target makes the tokenizer emit 'labels' alongside the
            # inputs; the 'labels' column is required by set_format below
            return self.tokenizer(
                source_text,
                text_target=target_text,
                max_length=max_length,
                return_tensors="pt",
                padding='max_length',
                truncation=True,
            )

        test_dataset = Dataset.from_list(self._rows_to_examples(input_df))

        # batched map: the function receives a batch of rows at a time and
        # the raw text columns are dropped from the result
        datasets = test_dataset.map(
            _preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=test_dataset.column_names,
        )
        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

        # create dataloader
        self.dataloader = DataLoader(datasets, batch_size=batch_size)

    def create_embedding(self):
        """
        Encode every batch of `self.dataloader` and store the concatenated
        first-token hidden states in `self.embeddings`.

        Results are accumulated in a local list, so the method can be called
        again (e.g. after a new `prepare_dataloader`) and simply replaces
        `self.embeddings`.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # loop-invariant: move the model once instead of once per batch
        self.model.to(device)

        batch_embeddings = []

        print("start generation")
        for batch in tqdm(self.dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            with torch.no_grad():
                # run only the BERT backbone, bypassing the classification head
                encoder_outputs = self.model.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
                # Use the hidden state of the first token as the sequence representation
                pooled_output = encoder_outputs.last_hidden_state[:, 0, :]
            batch_embeddings.append(pooled_output.to('cpu'))

        self.embeddings = torch.cat(batch_embeddings, dim=0)
class SaveModelCallback(TrainerCallback):
    """Custom callback to save model weights at specific intervals during training."""

    def __init__(self, save_interval):
        """
        *arguments*
        - save_interval: save every 'save_interval' steps; must be positive
        """
        super().__init__()
        # A zero interval would otherwise surface as a ZeroDivisionError in
        # the middle of training; reject it up front instead.
        if save_interval <= 0:
            raise ValueError(f"save_interval must be positive, got {save_interval}")
        self.save_interval = save_interval

    def on_step_end(self, args, state, control, **kwargs):
        """Save `kwargs['model']` whenever global_step is a positive multiple of save_interval."""
        step = state.global_step
        if step <= 0 or step % self.save_interval != 0:
            return
        # In multi-process runs every rank receives this event; let only the
        # main process write so ranks do not race on the same directory.
        if not state.is_world_process_zero:
            return
        # Path where the model should be saved
        output_dir = f"{args.output_dir}/checkpoint_{step}"
        kwargs['model'].save_pretrained(output_dir)
        print(f"Model saved to {output_dir} at step {state.global_step}")
def process_df_to_dict(df):
    """
    Convert a dataframe into a list of {'input', 'output'} dictionaries.

    *arguments*
    - df: dataframe containing fields 'tag_description', 'unit', 'thing',
      'property'
    """
    # NOTE(review): the fields are concatenated with no separator here;
    # delimiter tokens may have been stripped from this copy of the file.
    columns = ['tag_description', 'unit', 'thing', 'property']
    return [
        {
            'input' : f"{desc}{unit}",
            'output': f"{thing}{prop}",
        }
        for desc, unit, thing, prop in df[columns].itertuples(index=False, name=None)
    ]


def create_split_dataset(fold):
    """
    Load the train and validation CSV files of one fold into a DatasetDict
    with 'train' and 'validation' splits.

    *arguments*
    - fold: fold identifier used in the `group_{fold}` directory name
    """
    # train
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)

    # valid
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
    validation_df = pd.read_csv(data_path, skipinitialspace=True)

    return DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df)),
        'validation' : Dataset.from_list(process_df_to_dict(validation_df)),
    })


def train(fold, *, save_path='checkpoint', max_steps=1200, save_interval=200):
    """
    Fine-tune t5-small on one fold.

    *arguments*
    - fold: fold identifier passed to create_split_dataset
    - save_path: output directory given to the trainer
    - max_steps: number of optimisation steps
    - save_interval: step interval passed to SaveModelCallback

    The keyword defaults reproduce the previously hard-coded values.
    """
    split_datasets = create_split_dataset(fold)

    # prepare tokenizer
    model_checkpoint = "t5-small"
    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # NOTE(review): every entry is an empty string in this copy of the file;
    # these look like marker tokens lost in extraction - confirm.
    additional_special_tokens = ["", "", "", "", "", "", "", "", ""]
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # given a batch of dataset entries, run it through the tokenizer
    def preprocess_function(example):
        source_text = example['input']
        target_text = example['output']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        # NOTE(review): padding=True also pads the labels inside each map
        # batch with the pad token id rather than -100 - confirm intended.
        return tokenizer(
            source_text,
            text_target=target_text,
            max_length=max_length,
            truncation=True,
            padding=True
        )

    # batched map over every split; the raw text columns are dropped
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns=split_datasets["train"].column_names,
    )

    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # In case the model returns more than the prediction logits
        if isinstance(preds, tuple):
            preds = preds[0]

        # Replace -100s as we can't decode them; predictions are masked the
        # same way as labels so padded generations decode safely
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        decoded_preds = tokenizer.batch_decode(preds,
                                               skip_special_tokens=False)
        decoded_labels = tokenizer.batch_decode(labels,
                                                skip_special_tokens=False)

        # Remove pad tokens from decoded predictions and labels
        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]

        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    # Generation Config
    gen_config = model.generation_config
    gen_config.max_length = 64

    # Trainer
    args = Seq2SeqTrainingArguments(
        f"{save_path}",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="no",
        load_best_model_at_end=False,
        learning_rate=1e-3,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        max_steps=max_steps,
        predict_with_generate=True,
        bf16=True,
        push_to_hub=False,
        generation_config=gen_config,
        remove_unused_columns=False,
    )

    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[SaveModelCallback(save_interval=save_interval)]
    )

    # to resume, pass resume_from_checkpoint=<checkpoint path> to trainer.train()
    trainer.train()


# execute training only when run as a script (or from an interactive cell),
# so importing this module does not start a training run
if __name__ == "__main__":
    for fold in [1]:
        print(fold)
        train(fold)