From 086b867d917b4b46f2a00d93c619c954df350d72 Mon Sep 17 00:00:00 2001
From: Richard Wong
Date: Tue, 24 Dec 2024 21:57:48 +0900
Subject: [PATCH] Feat: added overall section to evaluate combined accuracy

- added relevant-class section
---
 overall/README.md | 2 +
 ...ned_mapping_and_classification_analysis.py | 34 +++
 .../binary_classifier_desc/.gitignore | 2 +
 .../classification_prediction/.gitignore | 2 +
 .../classification_prediction/predict.py | 235 ++++++++++++++++++
 .../binary_classifier_desc/train.py | 218 ++++++++++++++++
 .../binary_classifier_desc_unit/.gitignore | 2 +
 .../classification_prediction/.gitignore | 2 +
 .../classification_prediction/predict.py | 235 ++++++++++++++++++
 .../binary_classifier_desc_unit/train.py | 219 ++++++++++++++++
 .../similarity_classifier_desc/.gitignore | 3 +
 .../similarity_classifier_desc/README.md | 4 +
 .../similarity_classifier_desc/run.py | 175 +++++++++++++
 .../similarity_classifier_desc/utils.py | 81 ++++++
 .../.gitignore | 3 +
 .../similarity_classifier_desc_unit/README.md | 4 +
 .../similarity_classifier_desc_unit/run.py | 176 +++++++++++++
 .../similarity_classifier_desc_unit/utils.py | 81 ++++++
 .../train.py | 6 +-
 .../train.py | 4 +-
 .../mapping_prediction/output.txt | 3 +
 .../mapping_prediction/output.txt | 6 +-
 .../mapping_prediction/predict.py | 2 +-
 .../train_decoder.py | 2 +-
 .../train_encoder.py | 6 +-
 .../mapping_prediction/output.txt | 10 +-
 .../train_decoder.py | 2 +-
 .../train_encoder.py | 8 +-
 .../mapping_prediction/predict.py | 1 +
 .../mapping_prediction/predict.py | 2 +-
 .../mapping_prediction/output.txt | 6 +-
 train/train.bash | 25 +-
 32 files changed, 1531 insertions(+), 30 deletions(-)
 create mode 100644 overall/README.md
 create mode 100644 overall/combined_mapping_and_classification_analysis.py
 create mode 100644 relevant_class/binary_classifier_desc/.gitignore
 create mode 100644 relevant_class/binary_classifier_desc/classification_prediction/.gitignore
 create mode 100644 relevant_class/binary_classifier_desc/classification_prediction/predict.py
 create mode 100644 relevant_class/binary_classifier_desc/train.py
 create mode 100644 relevant_class/binary_classifier_desc_unit/.gitignore
 create mode 100644 relevant_class/binary_classifier_desc_unit/classification_prediction/.gitignore
 create mode 100644 relevant_class/binary_classifier_desc_unit/classification_prediction/predict.py
 create mode 100644 relevant_class/binary_classifier_desc_unit/train.py
 create mode 100644 relevant_class/similarity_classifier_desc/.gitignore
 create mode 100644 relevant_class/similarity_classifier_desc/README.md
 create mode 100644 relevant_class/similarity_classifier_desc/run.py
 create mode 100644 relevant_class/similarity_classifier_desc/utils.py
 create mode 100644 relevant_class/similarity_classifier_desc_unit/.gitignore
 create mode 100644 relevant_class/similarity_classifier_desc_unit/README.md
 create mode 100644 relevant_class/similarity_classifier_desc_unit/run.py
 create mode 100644 relevant_class/similarity_classifier_desc_unit/utils.py

diff --git a/overall/README.md b/overall/README.md
new file mode 100644
index 0000000..7c4648c
--- /dev/null
+++ b/overall/README.md
@@ -0,0 +1,2 @@
+This section evaluates the combination of (relevant-class prediction) and
+(mapping prediction) to measure the final overall correct-mapping accuracy.
\ No newline at end of file diff --git a/overall/combined_mapping_and_classification_analysis.py b/overall/combined_mapping_and_classification_analysis.py new file mode 100644 index 0000000..6df1ea8 --- /dev/null +++ b/overall/combined_mapping_and_classification_analysis.py @@ -0,0 +1,34 @@ +# %% +import pandas as pd + +# following code computes final mapping + classification accuracy +# %% +def run(fold): + data_path = f'../relevant_class/binary_classifier_desc_unit/classification_prediction/exports/result_group_{fold}.csv' + df = pd.read_csv(data_path, skipinitialspace=True) + p_mdm = df['p_mdm'] + + # data_path = f'../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv' + data_path = f'../train/modified_t5_decoder_4_layers/mapping_prediction/exports/result_group_{fold}.csv' + df = pd.read_csv(data_path, skipinitialspace=True) + actual_mdm = df['MDM'] + + thing_correctness = df['thing'] == df['p_thing'] + property_correctness = df['property'] == df['p_property'] + answer = thing_correctness & property_correctness + + # if is non-MDM -> then should be unmapped + # if is MDM -> then should be mapped correctly + + # out of correctly predicted relevant data, how many are mapped correctly? + correct_positive_mdm_and_map = sum(p_mdm & actual_mdm & answer) + + # number of correctly predicted non-relevant data + correct_negative_mdm = sum(~(p_mdm) & ~(actual_mdm)) + + overall_correct = (correct_positive_mdm_and_map + correct_negative_mdm)/len(actual_mdm) + print(overall_correct) +# %% +for fold in [1,2,3,4,5]: + run(fold) + diff --git a/relevant_class/binary_classifier_desc/.gitignore b/relevant_class/binary_classifier_desc/.gitignore new file mode 100644 index 0000000..d943a39 --- /dev/null +++ b/relevant_class/binary_classifier_desc/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log \ No newline at end of file diff --git a/relevant_class/binary_classifier_desc/classification_prediction/.gitignore b/relevant_class/binary_classifier_desc/classification_prediction/.gitignore new file mode 100644 index 0000000..4d615d0 --- /dev/null +++ b/relevant_class/binary_classifier_desc/classification_prediction/.gitignore @@ -0,0 +1,2 @@ +exports +output.txt \ No newline at end of file diff --git a/relevant_class/binary_classifier_desc/classification_prediction/predict.py b/relevant_class/binary_classifier_desc/classification_prediction/predict.py new file mode 100644 index 0000000..bcf3155 --- /dev/null +++ b/relevant_class/binary_classifier_desc/classification_prediction/predict.py @@ -0,0 +1,235 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + + +BATCH_SIZE = 256 + +# %% + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + 
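        # 'MDM' is a boolean flag; casting it to int yields the binary class
        # label used for evaluation (0 = not in MDM, 1 = in MDM)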
in_mdm_label = int(row['MDM']) + element = { + 'text' : f"{desc}{unit}", + 'label': in_mdm_label, + } + output_list.append(element) + + return output_list + + +def create_dataset(fold): + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + test_df = pd.read_csv(data_path, skipinitialspace=True) + + test_dataset = Dataset.from_list(process_df_to_dict(test_df)) + + return test_dataset + + +# %% + +# function to perform training for a given fold +def test(fold): + + test_dataset = create_dataset(fold) + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint_fold_{fold}' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + # %% + # compute max token length + max_length = 0 + for sample in test_dataset['text']: + # Tokenize the sample and get the length + input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"] + length = len(input_ids) + + # Update max_length if this sample is longer + if length > max_length: + max_length = length + + print(max_length) + + # %% + + max_length = 128 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + # truncation=True, + padding='max_length' + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length") + + # %% + # compute metrics + # metric = evaluate.load("accuracy") + # + # + # def compute_metrics(eval_preds): + # preds, labels = eval_preds + # preds = np.argmax(preds, axis=1) + # return metric.compute(predictions=preds, references=labels) + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=2) + # important! 
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + + dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False) + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['label']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + pred_labels = np.array(pred_labels, dtype=bool) + + # append the mdm prediction to the test_df for analysis later + df_out = pd.DataFrame({ + 'p_mdm': pred_labels, + }) + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + test_df = pd.read_csv(data_path, skipinitialspace=True) + df_export = pd.concat([test_df, df_out], axis=1) + df_export.to_csv(f"exports/result_group_{fold}.csv", index=False) + + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred) + precision = precision_score(y_true, y_pred) + recall = recall_score(y_true, y_pred) + + cm = confusion_matrix(y_true, y_pred) + tn, fp, fn, tp = cm.ravel() + + with open("output.txt", "a") as f: + + + print('*' * 80, file=f) + print(f'Fold: {fold}', file=f) + # Print the results + print(f"tp: {tp}", file=f) + print(f"tn: {tn}", file=f) + print(f"fp: {fp}", file=f) + print(f"fn: {fn}", file=f) + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + test(fold) diff --git a/relevant_class/binary_classifier_desc/train.py b/relevant_class/binary_classifier_desc/train.py new file mode 100644 index 0000000..0276a14 --- /dev/null +++ b/relevant_class/binary_classifier_desc/train.py @@ -0,0 +1,218 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +# data_path = '../../data_import/exports/data_mapping_mdm.csv' +# full_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property + + +# %% +id2label = {0: False, 1: True} +label2id = {False: 0, True: 1} + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of 
dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + in_mdm_label = int(row['MDM']) + element = { + 'text' : f"{desc}", + 'label': in_mdm_label, + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + # data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + + # reconstruct full training data with non-mdm data + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + test_df = pd.read_csv(data_path, skipinitialspace=True) + ships_list = list(set(test_df['ships_idx'])) + data_path = '../../data_preprocess/exports/preprocessed_data.csv' + full_df = pd.read_csv(data_path, skipinitialspace=True) + train_df = full_df[~full_df['ships_idx'].isin(ships_list)] + + train_ships_list = sorted(list(set(train_df['ships_idx']))) + + train_ships_set = set(train_ships_list) + test_ships_set = set(ships_list) + + # assertion for non data leakage + assert not set(train_ships_set).intersection(test_ships_set) + + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# %% + +# function to perform training for a given fold +def train(fold): + + save_path = f'checkpoint_fold_{fold}' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-cased" + # model_checkpoint = 'google-bert/bert-base-cased' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + truncation=True, + padding=True + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=2) + # important! 
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-5, + per_device_train_batch_size=128, + per_device_eval_batch_size=128, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=80, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + + +# %% diff --git a/relevant_class/binary_classifier_desc_unit/.gitignore b/relevant_class/binary_classifier_desc_unit/.gitignore new file mode 100644 index 0000000..d943a39 --- /dev/null +++ b/relevant_class/binary_classifier_desc_unit/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log \ No newline at end of file diff --git a/relevant_class/binary_classifier_desc_unit/classification_prediction/.gitignore b/relevant_class/binary_classifier_desc_unit/classification_prediction/.gitignore new file mode 100644 index 0000000..4d615d0 --- /dev/null +++ b/relevant_class/binary_classifier_desc_unit/classification_prediction/.gitignore @@ -0,0 +1,2 @@ +exports +output.txt \ No newline at end of file diff --git a/relevant_class/binary_classifier_desc_unit/classification_prediction/predict.py b/relevant_class/binary_classifier_desc_unit/classification_prediction/predict.py new file mode 100644 index 0000000..bcf3155 --- /dev/null +++ b/relevant_class/binary_classifier_desc_unit/classification_prediction/predict.py @@ -0,0 +1,235 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + + +BATCH_SIZE = 256 + +# %% + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + in_mdm_label = int(row['MDM']) + element = { + 'text' : f"{desc}{unit}", + 'label': in_mdm_label, + } + output_list.append(element) + + return output_list + + +def create_dataset(fold): + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + 
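    # note: test_all.csv holds every test row of this fold (MDM and non-MDM alike),
    # so the exported p_mdm predictions stay row-aligned with the mapping-prediction
    # exports later combined in overall/combined_mapping_and_classification_analysis.py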
test_df = pd.read_csv(data_path, skipinitialspace=True) + + test_dataset = Dataset.from_list(process_df_to_dict(test_df)) + + return test_dataset + + +# %% + +# function to perform training for a given fold +def test(fold): + + test_dataset = create_dataset(fold) + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint_fold_{fold}' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + # %% + # compute max token length + max_length = 0 + for sample in test_dataset['text']: + # Tokenize the sample and get the length + input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"] + length = len(input_ids) + + # Update max_length if this sample is longer + if length > max_length: + max_length = length + + print(max_length) + + # %% + + max_length = 128 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + # truncation=True, + padding='max_length' + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length") + + # %% + # compute metrics + # metric = evaluate.load("accuracy") + # + # + # def compute_metrics(eval_preds): + # preds, labels = eval_preds + # preds = np.argmax(preds, axis=1) + # return metric.compute(predictions=preds, references=labels) + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=2) + # important! 
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + + dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False) + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['label']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + pred_labels = np.array(pred_labels, dtype=bool) + + # append the mdm prediction to the test_df for analysis later + df_out = pd.DataFrame({ + 'p_mdm': pred_labels, + }) + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + test_df = pd.read_csv(data_path, skipinitialspace=True) + df_export = pd.concat([test_df, df_out], axis=1) + df_export.to_csv(f"exports/result_group_{fold}.csv", index=False) + + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred) + precision = precision_score(y_true, y_pred) + recall = recall_score(y_true, y_pred) + + cm = confusion_matrix(y_true, y_pred) + tn, fp, fn, tp = cm.ravel() + + with open("output.txt", "a") as f: + + + print('*' * 80, file=f) + print(f'Fold: {fold}', file=f) + # Print the results + print(f"tp: {tp}", file=f) + print(f"tn: {tn}", file=f) + print(f"fp: {fp}", file=f) + print(f"fn: {fn}", file=f) + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + test(fold) diff --git a/relevant_class/binary_classifier_desc_unit/train.py b/relevant_class/binary_classifier_desc_unit/train.py new file mode 100644 index 0000000..58a8624 --- /dev/null +++ b/relevant_class/binary_classifier_desc_unit/train.py @@ -0,0 +1,219 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +# data_path = '../../data_import/exports/data_mapping_mdm.csv' +# full_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property + + +# %% +id2label = {0: False, 1: True} +label2id = {False: 0, True: 1} + +# %% + +# outputs a list of dictionaries +# processes dataframe into 
lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + in_mdm_label = int(row['MDM']) + element = { + 'text' : f"{desc}{unit}", + 'label': in_mdm_label, + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + # data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + + # reconstruct full training data with non-mdm data + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + test_df = pd.read_csv(data_path, skipinitialspace=True) + ships_list = list(set(test_df['ships_idx'])) + data_path = '../../data_preprocess/exports/preprocessed_data.csv' + full_df = pd.read_csv(data_path, skipinitialspace=True) + train_df = full_df[~full_df['ships_idx'].isin(ships_list)] + + train_ships_list = sorted(list(set(train_df['ships_idx']))) + + train_ships_set = set(train_ships_list) + test_ships_set = set(ships_list) + + # assertion for non data leakage + assert not set(train_ships_set).intersection(test_ships_set) + + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# %% + +# function to perform training for a given fold +def train(fold): + + save_path = f'checkpoint_fold_{fold}' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-cased" + # model_checkpoint = 'google-bert/bert-base-cased' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + truncation=True, + padding=True + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=2) + # important! 
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-5, + per_device_train_batch_size=128, + per_device_eval_batch_size=128, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=80, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + + +# %% diff --git a/relevant_class/similarity_classifier_desc/.gitignore b/relevant_class/similarity_classifier_desc/.gitignore new file mode 100644 index 0000000..1bc943b --- /dev/null +++ b/relevant_class/similarity_classifier_desc/.gitignore @@ -0,0 +1,3 @@ +__pycache__ +exports +output.txt \ No newline at end of file diff --git a/relevant_class/similarity_classifier_desc/README.md b/relevant_class/similarity_classifier_desc/README.md new file mode 100644 index 0000000..371ad7d --- /dev/null +++ b/relevant_class/similarity_classifier_desc/README.md @@ -0,0 +1,4 @@ +# one-class classification by similarity + +Purpose: using only Ship Domain attributes, we want to find if the data belongs +to MDM \ No newline at end of file diff --git a/relevant_class/similarity_classifier_desc/run.py b/relevant_class/similarity_classifier_desc/run.py new file mode 100644 index 0000000..1e3b618 --- /dev/null +++ b/relevant_class/similarity_classifier_desc/run.py @@ -0,0 +1,175 @@ +# %% +import pandas as pd +from utils import Retriever, cosine_similarity_chunked +import os +import glob +import numpy as np +from tqdm import tqdm +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + +################################################## +# helper functions + + +# the following function takes in a full cos_sim_matrix +# condition_source: boolean selectors of the source embedding +# condition_target: boolean selectors of the target embedding +def find_closest(cos_sim_matrix, condition_source, condition_target): + # subset_matrix = cos_sim_matrix[condition_source] + # except we are subsetting 2D matrix (row, column) + subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)] + # we select top k here + # Get the indices of the top k maximum values along axis 1 + top_k = 3 + top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values + # note that top_k_indices is a nested list because of the 2d nature of the matrix + # the result is flipped + top_k_indices[0] = top_k_indices[0][::-1] + + # Get the values of the top 5 maximum scores + top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1) + + + return top_k_indices, top_k_values + + + + +class Embedder(): + 
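    # wraps a dataframe and produces one embedding per row: each tag_description is
    # rendered to text, encoded with the fine-tuned checkpoint via utils.Retriever,
    # and the resulting CLS-token embeddings are returned on the CPU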
input_df: pd.DataFrame + fold: int + + def __init__(self, input_df): + self.input_df = input_df + + + def make_embedding(self, checkpoint_path): + + def generate_input_list(df): + input_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + element = f"{desc}" + input_list.append(element) + return input_list + + # prepare reference embed + train_data = list(generate_input_list(self.input_df)) + # Define the directory and the pattern + retriever_train = Retriever(train_data, checkpoint_path) + retriever_train.make_embedding(batch_size=64) + return retriever_train.embeddings.to('cpu') + + + +def run_similarity_classifier(fold): + data_path = f'../../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv' + test_df = pd.read_csv(data_path, skipinitialspace=True) + + + + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + checkpoint_directory = "../../train/classification_bert_complete_desc" + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + train_embedder = Embedder(input_df=train_df) + train_embeds = train_embedder.make_embedding(checkpoint_path) + + test_embedder = Embedder(input_df=test_df) + test_embeds = test_embedder.make_embedding(checkpoint_path) + + def compute_top_k(select_idx): + condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0] + condition_target = np.ones(train_embeds.shape[0], dtype=bool) + + _, top_k_values = find_closest( + cos_sim_matrix=cos_sim_matrix, + condition_source=condition_source, + condition_target=condition_target) + + return top_k_values[0][0] + + + + # test embeds are inputs since we are looking back at train data + cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=1024).cpu().numpy() + + + sim_list = [] + for select_idx in tqdm(test_df.index): + top_sim_value = compute_top_k(select_idx) + sim_list.append(top_sim_value) + + # analysis 1: using threshold to perform find-back prediction success + threshold_values = np.linspace(0.85, 1.00, 21) # test 20 values, 21 to get nice round numbers + best_threshold = 0 + best_f1 = 0 + for threshold in threshold_values: + predict_list = [ elem > threshold for elem in sim_list ] + + y_true = test_df['MDM'].to_list() + y_pred = predict_list + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred) + precision = precision_score(y_true, y_pred) + recall = recall_score(y_true, y_pred) + + if f1 > best_f1: + best_threshold = threshold + best_f1 = f1 + + # OR just manually set best_threshold + # best_threshold = 0.90 + + # compute metrics again with best threshold + predict_list = [ elem > best_threshold for elem in sim_list ] + + # save + pred_labels = np.array(predict_list, dtype=bool) + + # append the mdm prediction to the test_df for analysis later + df_out = pd.DataFrame({ + 'p_mdm': pred_labels, + }) + df_out.to_csv(f"exports/result_group_{fold}.csv", index=False) + + + y_true = test_df['MDM'].to_list() + y_pred = predict_list + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred) + precision = precision_score(y_true, y_pred) + recall 
= recall_score(y_true, y_pred) + + + + with open("output.txt", "a") as f: + + print(f'Fold: {fold}', file=f) + print(f'Best threshold: {best_threshold}', file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + print(fold) + run_similarity_classifier(fold) diff --git a/relevant_class/similarity_classifier_desc/utils.py b/relevant_class/similarity_classifier_desc/utils.py new file mode 100644 index 0000000..98749be --- /dev/null +++ b/relevant_class/similarity_classifier_desc/utils.py @@ -0,0 +1,81 @@ +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import torch.nn.functional as F + + + +class Retriever: + def __init__(self, input_texts, model_checkpoint): + # we need to generate the embedding from list of input strings + self.embeddings = [] + self.inputs = input_texts + model_checkpoint = model_checkpoint + self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # device = "cpu" + model.to(self.device) + self.model = model.eval() + + + def make_embedding(self, batch_size=64): + all_embeddings = self.embeddings + input_texts = self.inputs + + for i in range(0, len(input_texts), batch_size): + batch_texts = input_texts[i:i+batch_size] + # Tokenize the input text + inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64) + input_ids = inputs.input_ids.to(self.device) + attention_mask = inputs.attention_mask.to(self.device) + + + # Pass the input through the encoder and retrieve the embeddings + with torch.no_grad(): + encoder_outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True) + # get last layer + embeddings = encoder_outputs.hidden_states[-1] + # get cls token embedding + cls_embeddings = embeddings[:, 0, :] # Shape: (batch_size, hidden_size) + all_embeddings.append(cls_embeddings) + + # remove the batch list and makes a single large tensor, dim=0 increases row-wise + all_embeddings = torch.cat(all_embeddings, dim=0) + + self.embeddings = all_embeddings + +def cosine_similarity_chunked(batch1, batch2, chunk_size=1024): + device = 'cuda' + batch1_size = batch1.size(0) + batch2_size = batch2.size(0) + batch2.to(device) + + # Prepare an empty tensor to store results + cos_sim = torch.empty(batch1_size, batch2_size, device=device) + + # Process batch1 in chunks + for i in range(0, batch1_size, chunk_size): + batch1_chunk = batch1[i:i + chunk_size] # Get chunk of batch1 + + batch1_chunk.to(device) + # Expand batch1 chunk and entire batch2 for comparison + # batch1_chunk_exp = batch1_chunk.unsqueeze(1) # Shape: (chunk_size, 1, seq_len) + # batch2_exp = batch2.unsqueeze(0) # Shape: (1, batch2_size, seq_len) + batch2_norms = batch2.norm(dim=1, keepdim=True) + + + # Compute cosine similarity for the chunk and store it in the final tensor + # cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1) + + # Compute cosine similarity by matrix multiplication and normalizing + sim_chunk = torch.mm(batch1_chunk, batch2.T) / 
(batch1_chunk.norm(dim=1, keepdim=True) * batch2_norms.T + 1e-8) + + # Store the results in the appropriate part of the final tensor + cos_sim[i:i + chunk_size] = sim_chunk + + return cos_sim \ No newline at end of file diff --git a/relevant_class/similarity_classifier_desc_unit/.gitignore b/relevant_class/similarity_classifier_desc_unit/.gitignore new file mode 100644 index 0000000..1bc943b --- /dev/null +++ b/relevant_class/similarity_classifier_desc_unit/.gitignore @@ -0,0 +1,3 @@ +__pycache__ +exports +output.txt \ No newline at end of file diff --git a/relevant_class/similarity_classifier_desc_unit/README.md b/relevant_class/similarity_classifier_desc_unit/README.md new file mode 100644 index 0000000..371ad7d --- /dev/null +++ b/relevant_class/similarity_classifier_desc_unit/README.md @@ -0,0 +1,4 @@ +# one-class classification by similarity + +Purpose: using only Ship Domain attributes, we want to find if the data belongs +to MDM \ No newline at end of file diff --git a/relevant_class/similarity_classifier_desc_unit/run.py b/relevant_class/similarity_classifier_desc_unit/run.py new file mode 100644 index 0000000..45b3237 --- /dev/null +++ b/relevant_class/similarity_classifier_desc_unit/run.py @@ -0,0 +1,176 @@ +# %% +import pandas as pd +from utils import Retriever, cosine_similarity_chunked +import os +import glob +import numpy as np +from tqdm import tqdm +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + +################################################## +# helper functions + + +# the following function takes in a full cos_sim_matrix +# condition_source: boolean selectors of the source embedding +# condition_target: boolean selectors of the target embedding +def find_closest(cos_sim_matrix, condition_source, condition_target): + # subset_matrix = cos_sim_matrix[condition_source] + # except we are subsetting 2D matrix (row, column) + subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)] + # we select top k here + # Get the indices of the top k maximum values along axis 1 + top_k = 3 + top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values + # note that top_k_indices is a nested list because of the 2d nature of the matrix + # the result is flipped + top_k_indices[0] = top_k_indices[0][::-1] + + # Get the values of the top 5 maximum scores + top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1) + + + return top_k_indices, top_k_values + + + + +class Embedder(): + input_df: pd.DataFrame + fold: int + + def __init__(self, input_df): + self.input_df = input_df + + + def make_embedding(self, checkpoint_path): + + def generate_input_list(df): + input_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = f"{desc}{unit}" + input_list.append(element) + return input_list + + # prepare reference embed + train_data = list(generate_input_list(self.input_df)) + # Define the directory and the pattern + retriever_train = Retriever(train_data, checkpoint_path) + retriever_train.make_embedding(batch_size=64) + return retriever_train.embeddings.to('cpu') + + + +def run_similarity_classifier(fold): + data_path = f'../../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv' + test_df = pd.read_csv(data_path, skipinitialspace=True) + + + + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) 
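    # embed the training rows (reference set) and the test rows with the fine-tuned
    # checkpoint, score each test row by its top-1 cosine similarity against the
    # training set, then sweep a threshold on that score (maximising F1) to decide
    # whether the row is predicted as in-MDM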
+ + checkpoint_directory = "../../train/classification_bert_complete_desc_unit" + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + train_embedder = Embedder(input_df=train_df) + train_embeds = train_embedder.make_embedding(checkpoint_path) + + test_embedder = Embedder(input_df=test_df) + test_embeds = test_embedder.make_embedding(checkpoint_path) + + def compute_top_k(select_idx): + condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0] + condition_target = np.ones(train_embeds.shape[0], dtype=bool) + + _, top_k_values = find_closest( + cos_sim_matrix=cos_sim_matrix, + condition_source=condition_source, + condition_target=condition_target) + + return top_k_values[0][0] + + + + # test embeds are inputs since we are looking back at train data + cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=1024).cpu().numpy() + + + sim_list = [] + for select_idx in tqdm(test_df.index): + top_sim_value = compute_top_k(select_idx) + sim_list.append(top_sim_value) + + # analysis 1: using threshold to perform find-back prediction success + threshold_values = np.linspace(0.85, 1.00, 21) # test 20 values, 21 to get nice round numbers + best_threshold = 0 + best_f1 = 0 + for threshold in threshold_values: + predict_list = [ elem > threshold for elem in sim_list ] + + y_true = test_df['MDM'].to_list() + y_pred = predict_list + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred) + precision = precision_score(y_true, y_pred) + recall = recall_score(y_true, y_pred) + + if f1 > best_f1: + best_threshold = threshold + best_f1 = f1 + + # just manually set best_threshold + # best_threshold = 0.90 + + # compute metrics again with best threshold + predict_list = [ elem > best_threshold for elem in sim_list ] + + # save + pred_labels = np.array(predict_list, dtype=bool) + + # append the mdm prediction to the test_df for analysis later + df_out = pd.DataFrame({ + 'p_mdm': pred_labels, + }) + df_out.to_csv(f"exports/result_group_{fold}.csv", index=False) + + + y_true = test_df['MDM'].to_list() + y_pred = predict_list + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred) + precision = precision_score(y_true, y_pred) + recall = recall_score(y_true, y_pred) + + + + with open("output.txt", "a") as f: + + print(f'Fold: {fold}', file=f) + print(f'Best threshold: {best_threshold}', file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + print(fold) + run_similarity_classifier(fold) diff --git a/relevant_class/similarity_classifier_desc_unit/utils.py b/relevant_class/similarity_classifier_desc_unit/utils.py new file mode 100644 index 0000000..98749be --- /dev/null +++ b/relevant_class/similarity_classifier_desc_unit/utils.py @@ -0,0 +1,81 @@ +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import torch.nn.functional as F + + + +class 
Retriever: + def __init__(self, input_texts, model_checkpoint): + # we need to generate the embedding from list of input strings + self.embeddings = [] + self.inputs = input_texts + model_checkpoint = model_checkpoint + self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + + model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # device = "cpu" + model.to(self.device) + self.model = model.eval() + + + def make_embedding(self, batch_size=64): + all_embeddings = self.embeddings + input_texts = self.inputs + + for i in range(0, len(input_texts), batch_size): + batch_texts = input_texts[i:i+batch_size] + # Tokenize the input text + inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64) + input_ids = inputs.input_ids.to(self.device) + attention_mask = inputs.attention_mask.to(self.device) + + + # Pass the input through the encoder and retrieve the embeddings + with torch.no_grad(): + encoder_outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True) + # get last layer + embeddings = encoder_outputs.hidden_states[-1] + # get cls token embedding + cls_embeddings = embeddings[:, 0, :] # Shape: (batch_size, hidden_size) + all_embeddings.append(cls_embeddings) + + # remove the batch list and makes a single large tensor, dim=0 increases row-wise + all_embeddings = torch.cat(all_embeddings, dim=0) + + self.embeddings = all_embeddings + +def cosine_similarity_chunked(batch1, batch2, chunk_size=1024): + device = 'cuda' + batch1_size = batch1.size(0) + batch2_size = batch2.size(0) + batch2.to(device) + + # Prepare an empty tensor to store results + cos_sim = torch.empty(batch1_size, batch2_size, device=device) + + # Process batch1 in chunks + for i in range(0, batch1_size, chunk_size): + batch1_chunk = batch1[i:i + chunk_size] # Get chunk of batch1 + + batch1_chunk.to(device) + # Expand batch1 chunk and entire batch2 for comparison + # batch1_chunk_exp = batch1_chunk.unsqueeze(1) # Shape: (chunk_size, 1, seq_len) + # batch2_exp = batch2.unsqueeze(0) # Shape: (1, batch2_size, seq_len) + batch2_norms = batch2.norm(dim=1, keepdim=True) + + + # Compute cosine similarity for the chunk and store it in the final tensor + # cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1) + + # Compute cosine similarity by matrix multiplication and normalizing + sim_chunk = torch.mm(batch1_chunk, batch2.T) / (batch1_chunk.norm(dim=1, keepdim=True) * batch2_norms.T + 1e-8) + + # Store the results in the appropriate part of the final tensor + cos_sim[i:i + chunk_size] = sim_chunk + + return cos_sim \ No newline at end of file diff --git a/train/classification_bert_complete_desc/train.py b/train/classification_bert_complete_desc/train.py index 2654208..c490a15 100644 --- a/train/classification_bert_complete_desc/train.py +++ b/train/classification_bert_complete_desc/train.py @@ -176,9 +176,9 @@ def train(fold): logging_strategy="epoch", # save_strategy="epoch", load_best_model_at_end=False, - learning_rate=1e-5, - per_device_train_batch_size=128, - per_device_eval_batch_size=128, + learning_rate=1e-3, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, auto_find_batch_size=False, ddp_find_unused_parameters=False, weight_decay=0.01, diff --git a/train/classification_bert_complete_desc_unit/train.py 
b/train/classification_bert_complete_desc_unit/train.py index 86cb4f6..654790a 100644 --- a/train/classification_bert_complete_desc_unit/train.py +++ b/train/classification_bert_complete_desc_unit/train.py @@ -178,8 +178,8 @@ def train(fold): # save_strategy="epoch", load_best_model_at_end=False, learning_rate=1e-5, - per_device_train_batch_size=128, - per_device_eval_batch_size=128, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, auto_find_batch_size=False, ddp_find_unused_parameters=False, weight_decay=0.01, diff --git a/train/frozen_t5_encoder/mapping_prediction/output.txt b/train/frozen_t5_encoder/mapping_prediction/output.txt index 52834ce..c2937ec 100644 --- a/train/frozen_t5_encoder/mapping_prediction/output.txt +++ b/train/frozen_t5_encoder/mapping_prediction/output.txt @@ -1,3 +1,6 @@ Accuracy for fold 1: 0.9342167534311405 Accuracy for fold 2: 0.883177570093458 +Accuracy for fold 3: 0.963855421686747 +Accuracy for fold 4: 0.9705042816365367 +Accuracy for fold 5: 0.9051763628034815 diff --git a/train/hybrid_t5_complete_desc_unit/mapping_prediction/output.txt b/train/hybrid_t5_complete_desc_unit/mapping_prediction/output.txt index 08326c5..9fa6057 100644 --- a/train/hybrid_t5_complete_desc_unit/mapping_prediction/output.txt +++ b/train/hybrid_t5_complete_desc_unit/mapping_prediction/output.txt @@ -1,2 +1,6 @@ -Accuracy for fold 1: 0.9398958826313298 +Accuracy for fold 1: 0.9242782773308093 +Accuracy for fold 2: 0.9126168224299065 +Accuracy for fold 3: 0.9643574297188755 +Accuracy for fold 4: 0.9595623215984777 +Accuracy for fold 5: 0.8950984883188273 diff --git a/train/hybrid_t5_complete_desc_unit/mapping_prediction/predict.py b/train/hybrid_t5_complete_desc_unit/mapping_prediction/predict.py index 6bb0650..29e45f8 100644 --- a/train/hybrid_t5_complete_desc_unit/mapping_prediction/predict.py +++ b/train/hybrid_t5_complete_desc_unit/mapping_prediction/predict.py @@ -70,5 +70,5 @@ def infer_and_select(fold): with open("output.txt", "w") as f: print('', file=f) -for fold in [1]: +for fold in [1,2,3,4,5]: infer_and_select(fold) diff --git a/train/hybrid_t5_complete_desc_unit/train_decoder.py b/train/hybrid_t5_complete_desc_unit/train_decoder.py index 31a3ee4..cf32071 100644 --- a/train/hybrid_t5_complete_desc_unit/train_decoder.py +++ b/train/hybrid_t5_complete_desc_unit/train_decoder.py @@ -230,7 +230,7 @@ def train(fold): trainer.train() # execute training -for fold in [1]: +for fold in [1,2,3,4,5]: print(fold) train(fold) diff --git a/train/hybrid_t5_complete_desc_unit/train_encoder.py b/train/hybrid_t5_complete_desc_unit/train_encoder.py index 5fff07d..13f673b 100644 --- a/train/hybrid_t5_complete_desc_unit/train_encoder.py +++ b/train/hybrid_t5_complete_desc_unit/train_encoder.py @@ -190,8 +190,8 @@ def train(fold): # save_strategy="epoch", load_best_model_at_end=False, learning_rate=1e-3, - per_device_train_batch_size=128, - per_device_eval_batch_size=128, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, auto_find_batch_size=False, ddp_find_unused_parameters=False, # t5_classify = T5Model.from_pretrained(prev_checkpoint) weight_decay=0.01, @@ -221,7 +221,7 @@ def train(fold): trainer.train() # execute training -for fold in [1]: +for fold in [1,2,3,4,5]: print(fold) train(fold) diff --git a/train/hybrid_t5_pattern_desc_unit/mapping_prediction/output.txt b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/output.txt index 344ffc5..d360b8d 100644 --- a/train/hybrid_t5_pattern_desc_unit/mapping_prediction/output.txt +++ 
b/train/hybrid_t5_pattern_desc_unit/mapping_prediction/output.txt @@ -1,6 +1,6 @@ -Accuracy for fold 1: 0.9337434926644581 -Accuracy for fold 2: 0.914018691588785 -Accuracy for fold 3: 0.9623493975903614 -Accuracy for fold 4: 0.9738344433872502 -Accuracy for fold 5: 0.9042601923957856 +Accuracy for fold 1: 0.9394226218646474 +Accuracy for fold 2: 0.9107476635514019 +Accuracy for fold 3: 0.9548192771084337 +Accuracy for fold 4: 0.972882968601332 +Accuracy for fold 5: 0.8996793403573065 diff --git a/train/hybrid_t5_pattern_desc_unit/train_decoder.py b/train/hybrid_t5_pattern_desc_unit/train_decoder.py index ed95aa6..c3b0094 100644 --- a/train/hybrid_t5_pattern_desc_unit/train_decoder.py +++ b/train/hybrid_t5_pattern_desc_unit/train_decoder.py @@ -228,7 +228,7 @@ def train(fold): trainer.train() # execute training -for fold in [1]: +for fold in [1,2,3,4,5]: print(fold) train(fold) diff --git a/train/hybrid_t5_pattern_desc_unit/train_encoder.py b/train/hybrid_t5_pattern_desc_unit/train_encoder.py index eb31879..94770ad 100644 --- a/train/hybrid_t5_pattern_desc_unit/train_encoder.py +++ b/train/hybrid_t5_pattern_desc_unit/train_encoder.py @@ -189,13 +189,13 @@ def train(fold): # save_strategy="epoch", load_best_model_at_end=False, learning_rate=1e-3, - per_device_train_batch_size=128, - per_device_eval_batch_size=128, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, auto_find_batch_size=False, ddp_find_unused_parameters=False, weight_decay=0.01, save_total_limit=1, - num_train_epochs=40, + num_train_epochs=80, bf16=True, push_to_hub=False, remove_unused_columns=False, @@ -220,7 +220,7 @@ def train(fold): trainer.train() # execute training -for fold in [1]: +for fold in [1,2,3,4,5]: print(fold) train(fold) diff --git a/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py b/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py index a3a8ea5..76212fa 100644 --- a/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py +++ b/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py @@ -13,6 +13,7 @@ def infer_and_select(fold): # import test data data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" df = pd.read_csv(data_path, skipinitialspace=True) + df = df[df['MDM']].reset_index(drop=True) # get target data data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" diff --git a/train/modified_t5_decoder_4_layers/mapping_prediction/predict.py b/train/modified_t5_decoder_4_layers/mapping_prediction/predict.py index 29e45f8..0bb37c5 100644 --- a/train/modified_t5_decoder_4_layers/mapping_prediction/predict.py +++ b/train/modified_t5_decoder_4_layers/mapping_prediction/predict.py @@ -13,7 +13,7 @@ def infer_and_select(fold): # import test data data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" df = pd.read_csv(data_path, skipinitialspace=True) - df = df[df['MDM']].reset_index(drop=True) + # df = df[df['MDM']].reset_index(drop=True) # get target data data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" diff --git a/train/random_t5_encoder/mapping_prediction/output.txt b/train/random_t5_encoder/mapping_prediction/output.txt index f60f90f..9d8d7a8 100644 --- a/train/random_t5_encoder/mapping_prediction/output.txt +++ b/train/random_t5_encoder/mapping_prediction/output.txt @@ -1,2 +1,6 @@ -Accuracy for fold 1: 0.9342167534311405 +Accuracy for fold 1: 0.0 +Accuracy for fold 2: 0.0 +Accuracy for fold 3: 0.0 +Accuracy for fold 4: 0.0 
+Accuracy for fold 5: 0.0 diff --git a/train/train.bash b/train/train.bash index 79bfe1b..f95baab 100644 --- a/train/train.bash +++ b/train/train.bash @@ -1,16 +1,27 @@ #!/bin/bash -cd classification_bert_complete_desc -micromamba run -n hug accelerate launch train.py +cd hybrid_t5_complete_desc_unit +micromamba run -n hug accelerate launch train_encoder.py +micromamba run -n hug accelerate launch train_decoder.py cd .. -cd classification_bert_complete_desc_unit -micromamba run -n hug accelerate launch train.py +cd hybrid_t5_pattern_desc_unit +micromamba run -n hug accelerate launch train_encoder.py +micromamba run -n hug accelerate launch train_decoder.py cd .. -cd classification_bert_complete_desc_unit_name -micromamba run -n hug accelerate launch train.py -cd .. + +# cd classification_bert_complete_desc +# micromamba run -n hug accelerate launch train.py +# cd .. + +# cd classification_bert_complete_desc_unit +# micromamba run -n hug accelerate launch train.py +# cd .. + +# cd classification_bert_complete_desc_unit_name +# micromamba run -n hug accelerate launch train.py +# cd .. # cd mapping_t5_complete_desc # micromamba run -n hug accelerate launch train.py
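Note on the combined metric: overall/combined_mapping_and_classification_analysis.py counts a test row as correct when it is either (a) predicted relevant, actually in MDM, and mapped to the right thing/property pair, or (b) predicted irrelevant and actually outside MDM. A minimal self-contained sketch of that formula on made-up toy data (column names follow the exports above; the values are illustrative only):

import pandas as pd

# toy stand-in for the merged per-fold exports: p_mdm comes from the relevance
# classifier, MDM/thing/property/p_thing/p_property from the mapping predictor
df = pd.DataFrame({
    'p_mdm':      [True,  True,  False, False, True ],
    'MDM':        [True,  True,  False, True,  False],
    'thing':      ['A',   'B',   'C',   'D',   'E'  ],
    'p_thing':    ['A',   'X',   'C',   'D',   'E'  ],
    'property':   ['p1',  'p2',  'p3',  'p4',  'p5' ],
    'p_property': ['p1',  'p2',  'p3',  'p4',  'p5' ],
})

mapped_correctly = (df['thing'] == df['p_thing']) & (df['property'] == df['p_property'])

# relevant rows must be flagged relevant AND mapped correctly
correct_positive = (df['p_mdm'] & df['MDM'] & mapped_correctly).sum()
# non-relevant rows only need to be flagged non-relevant
correct_negative = (~df['p_mdm'] & ~df['MDM']).sum()

overall_correct = (correct_positive + correct_negative) / len(df)
print(overall_correct)  # 0.4 here: only rows 0 and 2 count as correct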