diff --git a/analysis/find_closest.py b/analysis/find_closest.py
index e3552dd..8dbc65f 100644
--- a/analysis/find_closest.py
+++ b/analysis/find_closest.py
@@ -107,7 +107,7 @@ def find_closest(cos_sim_matrix, condition_source, condition_target):
     subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
     # we select top k here
     # Get the indices of the top 5 maximum values along axis 1
-    top_k = 5
+    top_k = 3
     top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values
     # note that top_k_indices is a nested list because of the 2d nature of the matrix
     # the result is flipped
@@ -135,15 +135,20 @@ def find_back_element_with_print(select_idx):
                                          condition_target=condition_target)
 
     training_data_pattern_list = train_df.iloc[top_k_indices[0]]['pattern'].to_list()
+    training_desc_list = train_df.iloc[top_k_indices[0]]['tag_description'].to_list()
     test_data_pattern_list = test_df[test_df.index == select_idx]['pattern'].to_list()
-    predicted_test_data = test_df[test_df.index == select_idx]['p_thing'] + test_df[test_df.index == select_idx]['p_property']
+    test_desc_list = test_df[test_df.index == select_idx]['tag_description'].to_list()
+    predicted_test_data = test_df[test_df.index == select_idx]['p_thing'] + ' ' + test_df[test_df.index == select_idx]['p_property']
+    predicted_test_data = predicted_test_data.to_list()[0]
 
 
     print("*" * 80)
     print("idx:", select_idx)
-    print(training_data_pattern_list)
-    print(test_data_pattern_list)
-    print(predicted_test_data)
+    print("train desc", training_desc_list)
+    print("train thing+property", training_data_pattern_list)
+    print("test desc", test_desc_list)
+    print("test thing+property", test_data_pattern_list)
+    print("predicted thing+property", predicted_test_data)
 
     test_pattern = test_data_pattern_list[0]
 
@@ -154,7 +159,7 @@ def find_back_element_with_print(select_idx):
     else:
        return False
 
-find_back_element_with_print(2884)
+find_back_element_with_print(0)
 
 # %%
 def find_back_element(select_idx):
@@ -194,15 +199,13 @@ for select_idx in error_thing_df.index:
     print("status:", result)
     pattern_in_train.append(result)
 
-# %%
-sum(pattern_in_train)/len(pattern_in_train)
-
 ###
 # for error property
 # %%
 pattern_in_train = []
 for select_idx in error_property_df.index:
-    result = find_back_element(select_idx)
+    result = find_back_element_with_print(select_idx)
+    print("status:", result)
     pattern_in_train.append(result)
 
 # %%
diff --git a/train/classification_bert/.gitignore b/train/classification_bert/.gitignore
new file mode 100644
index 0000000..2c8f0d6
--- /dev/null
+++ b/train/classification_bert/.gitignore
@@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log
diff --git a/train/classification_bert/classification_prediction/predict.py b/train/classification_bert/classification_prediction/predict.py
new file mode 100644
index 0000000..fbed1b3
--- /dev/null
+++ b/train/classification_bert/classification_prediction/predict.py
@@ -0,0 +1,228 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+import glob
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from torch.utils.data import DataLoader
+
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding,
+    Trainer,
+    EarlyStoppingCallback,
+    TrainingArguments
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+from tqdm import tqdm
+
+torch.set_float32_matmul_precision('high')
+
+# %%
+
+# we need to create the mdm_list
+# import the full mdm-only file
+data_path = '../../../data_import/exports/data_mapping_mdm.csv'
+full_df = pd.read_csv(data_path, skipinitialspace=True)
+mdm_list = sorted(list((set(full_df['pattern']))))
+
+# %%
+id2label = {}
+label2id = {}
+for idx, val in enumerate(mdm_list):
+    id2label[idx] = val
+    label2id[val] = idx
+
+# %%
+
+# outputs a list of dictionaries
+# processes dataframe into lists of dictionaries
+# each element maps input to output
+# input: tag_description
+# output: class label
+def process_df_to_dict(df, mdm_list):
+    output_list = []
+    for _, row in df.iterrows():
+        desc = f"{row['tag_description']}"
+        pattern = row['pattern']
+        try:
+            index = mdm_list.index(pattern)
+        except ValueError:
+            index = -1
+        element = {
+            'text' : f"{desc}",
+            'label': index,
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_dataset(fold, mdm_list):
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    test_df = pd.read_csv(data_path, skipinitialspace=True)
+    # we only use the mdm subset
+    test_df = test_df[test_df['MDM']].reset_index(drop=True)
+
+    test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list))
+
+    return test_dataset
+
+
+# %%
+
+# function to perform training for a given fold
+# def train(fold):
+fold = 1
+
+test_dataset = create_dataset(fold, mdm_list)
+
+# prepare tokenizer
+
+checkpoint_directory = f'../checkpoint_fold_{fold}'
+# Use glob to find matching paths
+# path is usually checkpoint_fold_1/checkpoint-
+# we are guaranteed to save only 1 checkpoint from training
+pattern = 'checkpoint-*'
+model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
+
+tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+# Define additional special tokens
+# additional_special_tokens = ["", "", "", "", "", "", "", "", ""]
+# Add the additional special tokens to the tokenizer
+# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+# %%
+# compute max token length
+max_length = 0
+for sample in test_dataset['text']:
+    # Tokenize the sample and get the length
+    input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
+    length = len(input_ids)
+
+    # Update max_length if this sample is longer
+    if length > max_length:
+        max_length = length
+
+print(max_length)
+
+# %%
+
+max_length = 64
+
+# given a dataset entry, run it through the tokenizer
+def preprocess_function(example):
+    input = example['text']
+    # text_target sets the corresponding label to inputs
+    # there is no need to create a separate 'labels'
+    model_inputs = tokenizer(
+        input,
+        max_length=max_length,
+        # truncation=True,
+        padding='max_length'
+    )
+    return model_inputs
+
+# map maps function to each "row" in the dataset
+# aka the data in the immediate nesting
+datasets = test_dataset.map(
+    preprocess_function,
+    batched=True,
+    num_proc=8,
+    remove_columns="text",
+)
+
+
+datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+# %% temp
+# tokenized_datasets['train'].rename_columns()
+
+# %%
+# create data collator
+
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
+
+# %%
+# compute metrics
+# metric = evaluate.load("accuracy")
+#
+#
+# def compute_metrics(eval_preds):
+#     preds, labels = eval_preds
+#     preds = np.argmax(preds, axis=1)
+#     return metric.compute(predictions=preds, references=labels)
+
+model = AutoModelForSequenceClassification.from_pretrained(
+    model_checkpoint,
+    num_labels=len(mdm_list),
+    id2label=id2label,
+    label2id=label2id)
+# important! after extending tokens vocab
+model.resize_token_embeddings(len(tokenizer))
+
+model = model.eval()
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model.to(device)
+
+pred_labels = []
+actual_labels = []
+
+
+BATCH_SIZE = 64
+dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
+for batch in tqdm(dataloader):
+    # Inference in batches
+    input_ids = batch['input_ids']
+    attention_mask = batch['attention_mask']
+    # save labels too
+    actual_labels.extend(batch['label'])
+
+
+    # Move to GPU if available
+    input_ids = input_ids.to(device)
+    attention_mask = attention_mask.to(device)
+
+    # Perform inference
+    with torch.no_grad():
+        logits = model(
+            input_ids,
+            attention_mask).logits
+        predicted_class_ids = logits.argmax(dim=1).to("cpu")
+        pred_labels.extend(predicted_class_ids)
+
+pred_labels = [tensor.item() for tensor in pred_labels]
+
+
+# %%
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
+y_true = actual_labels
+y_pred = pred_labels
+
+# Compute metrics
+accuracy = accuracy_score(y_true, y_pred)
+f1 = f1_score(y_true, y_pred, average='macro')
+precision = precision_score(y_true, y_pred, average='macro')
+recall = recall_score(y_true, y_pred, average='macro')
+
+# Print the results
+print(f'Accuracy: {accuracy:.2f}')
+print(f'F1 Score: {f1:.2f}')
+print(f'Precision: {precision:.2f}')
+print(f'Recall: {recall:.2f}')
+
+
+# %%
diff --git a/train/classification_bert/train.py b/train/classification_bert/train.py
new file mode 100644
index 0000000..e87179a
--- /dev/null
+++ b/train/classification_bert/train.py
@@ -0,0 +1,211 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding,
+    Trainer,
+    EarlyStoppingCallback,
+    TrainingArguments
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+
+
+torch.set_float32_matmul_precision('high')
+
+# %%
+
+# we need to create the mdm_list
+# import the full mdm-only file
+data_path = '../../data_import/exports/data_mapping_mdm.csv'
+full_df = pd.read_csv(data_path, skipinitialspace=True)
+mdm_list = sorted(list((set(full_df['pattern']))))
+
+# %%
+id2label = {}
+label2id = {}
+for idx, val in enumerate(mdm_list):
+    id2label[idx] = val
+    label2id[val] = idx
+
+# %%
+
+# outputs a list of dictionaries
+# processes dataframe into lists of dictionaries
+# each element maps input to output
+# input: tag_description
+# output: class label
+def process_df_to_dict(df, mdm_list):
+    output_list = []
+    for _, row in df.iterrows():
+        desc = f"{row['tag_description']}"
+        pattern = row['pattern']
+        try:
+            index = mdm_list.index(pattern)
+        except ValueError:
+            index = -1
+        element = {
+            'text' : f"{desc}",
+            'label': index,
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_split_dataset(fold, mdm_list):
+    # train
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # valid
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
+    validation_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    combined_data = DatasetDict({
+        'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)),
+        'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)),
+    })
+    return combined_data
+
+
+# %%
+
+# function to perform training for a given fold
+# def train(fold):
+fold = 1
+
+save_path = f'checkpoint_fold_{fold}'
+split_datasets = create_split_dataset(fold, mdm_list)
+
+# prepare tokenizer
+
+model_checkpoint = "distilbert/distilbert-base-uncased"
+tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+# Define additional special tokens
+# additional_special_tokens = ["", "", "", "", "", "", "", "", ""]
+# Add the additional special tokens to the tokenizer
+# tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+max_length = 120
+
+# given a dataset entry, run it through the tokenizer
+def preprocess_function(example):
+    input = example['text']
+    # text_target sets the corresponding label to inputs
+    # there is no need to create a separate 'labels'
+    model_inputs = tokenizer(
+        input,
+        max_length=max_length,
+        truncation=True,
+        padding=True
+    )
+    return model_inputs
+
+# map maps function to each "row" in the dataset
+# aka the data in the immediate nesting
+tokenized_datasets = split_datasets.map(
+    preprocess_function,
+    batched=True,
+    num_proc=8,
+    remove_columns="text",
+)
+
+# %% temp
+# tokenized_datasets['train'].rename_columns()
+# %% temp
+tokenized_datasets['train']['input_ids']
+
+# %%
+# create data collator
+
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+# %%
+# compute metrics
+metric = evaluate.load("accuracy")
+
+
+def compute_metrics(eval_preds):
+    preds, labels = eval_preds
+    preds = np.argmax(preds, axis=1)
+    return metric.compute(predictions=preds, references=labels)
+
+# %%
+# create id2label and label2id
+
+
+# %%
+model = AutoModelForSequenceClassification.from_pretrained(
+    model_checkpoint,
+    num_labels=len(mdm_list),
+    id2label=id2label,
+    label2id=label2id)
+# important! after extending tokens vocab
+model.resize_token_embeddings(len(tokenizer))
+
+# model = torch.compile(model, backend="inductor", dynamic=True)
+
+
+# %%
+# Trainer
+
+training_args = TrainingArguments(
+    output_dir=f"{save_path}",
+    eval_strategy="epoch",
+    logging_dir="tensorboard-log",
+    logging_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+    learning_rate=2e-5,
+    per_device_train_batch_size=64,
+    per_device_eval_batch_size=64,
+    auto_find_batch_size=False,
+    ddp_find_unused_parameters=False,
+    weight_decay=0.01,
+    save_total_limit=1,
+    num_train_epochs=40,
+    bf16=True,
+    push_to_hub=False,
+    remove_unused_columns=False,
+)
+
+
+trainer = Trainer(
+    model,
+    training_args,
+    train_dataset=tokenized_datasets["train"],
+    eval_dataset=tokenized_datasets["validation"],
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+)
+
+# uncomment to load training from checkpoint
+# checkpoint_path = 'default_40_1/checkpoint-5600'
+# trainer.train(resume_from_checkpoint=checkpoint_path)
+
+trainer.train()
+
+# # execute training
+# for fold in [1,2,3,4,5]:
+#     print(fold)
+#     train(fold)
+
+
+# %%
diff --git a/train/mapping_baseline/mapping_prediction/inference.py b/train/mapping_baseline/mapping_prediction/inference.py
index 2232be0..35d6377 100644
--- a/train/mapping_baseline/mapping_prediction/inference.py
+++ b/train/mapping_baseline/mapping_prediction/inference.py
@@ -90,7 +90,7 @@ class Inference():
         datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
 
         # create dataloader
-        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+        self.dataloader = DataLoader(datasets, batch_size=batch_size, shuffle=False)
 
 
     def generate(self):