From 1f3970459fdae1530cb3b3318cc54269d90f5755 Mon Sep 17 00:00:00 2001 From: Richard Wong Date: Wed, 20 Nov 2024 15:07:47 +0900 Subject: [PATCH] Chore: re-organized train folders to have standardized naming schemes Feat: introduced BERT-based binary classification --- analysis/categories/label_print.py | 8 +- analysis/ship_data_list/basic_eda.py | 13 + analysis/ship_data_variations/eda.py | 15 ++ .../abbreviations/replacement_dict.py | 140 +++++----- post_process/binary_classifier/.gitignore | 2 + .../classification_prediction/output.txt | 31 +++ .../classification_prediction/predict.py | 214 ++++++++++++++++ post_process/binary_classifier/train.py | 210 +++++++++++++++ post_process/selection_with_pattern/run.py | 14 +- .../{ood => similarity_classifier}/.gitignore | 0 post_process/similarity_classifier/README.md | 4 + post_process/similarity_classifier/run.py | 134 ++++++++++ .../similarity_with_find_back.py} | 6 +- .../{ood => similarity_classifier}/utils.py | 0 train/README.md | 20 +- .../.gitignore | 0 .../classification_prediction/output.txt | 31 +++ .../classification_prediction/predict.py | 241 ++++++++++++++++++ .../train.py | 216 ++++++++++++++++ .../.gitignore | 0 .../classification_prediction/output.txt | 31 +++ .../classification_prediction/predict.py | 241 ++++++++++++++++++ .../train.py | 217 ++++++++++++++++ .../.gitignore | 2 + .../classification_prediction/predict.py | 0 .../train.py | 0 .../.gitignore | 2 + .../classification_prediction/predict.py | 0 .../train.py | 0 .../.gitignore | 0 .../train.py | 0 .../utils.py | 0 .../.gitignore | 0 .../train.py | 0 .../utils.py | 0 .../.gitignore | 0 .../train.py | 0 .../utils.py | 0 .../.gitignore | 0 .../mapping_prediction/.gitignore | 0 .../mapping_prediction/inference.py | 0 .../mapping_prediction/predict.py | 0 .../train.py | 0 .../.gitignore | 0 .../mapping_prediction/.gitignore | 0 .../mapping_prediction/inference.py | 0 .../mapping_prediction/output.txt | 0 .../mapping_prediction/predict.py | 0 .../train.py | 0 translation/README.md | 2 +- 50 files changed, 1710 insertions(+), 84 deletions(-) create mode 100644 analysis/ship_data_list/basic_eda.py create mode 100644 analysis/ship_data_variations/eda.py create mode 100644 post_process/binary_classifier/.gitignore create mode 100644 post_process/binary_classifier/classification_prediction/output.txt create mode 100644 post_process/binary_classifier/classification_prediction/predict.py create mode 100644 post_process/binary_classifier/train.py rename post_process/{ood => similarity_classifier}/.gitignore (100%) create mode 100644 post_process/similarity_classifier/README.md create mode 100644 post_process/similarity_classifier/run.py rename post_process/{ood/similarity.py => similarity_classifier/similarity_with_find_back.py} (98%) rename post_process/{ood => similarity_classifier}/utils.py (100%) rename train/{classification_bert => classification_bert_complete_desc}/.gitignore (100%) create mode 100644 train/classification_bert_complete_desc/classification_prediction/output.txt create mode 100644 train/classification_bert_complete_desc/classification_prediction/predict.py create mode 100644 train/classification_bert_complete_desc/train.py rename train/{classification_bert_desc => classification_bert_complete_desc_unit}/.gitignore (100%) create mode 100644 train/classification_bert_complete_desc_unit/classification_prediction/output.txt create mode 100644 train/classification_bert_complete_desc_unit/classification_prediction/predict.py create mode 100644 
train/classification_bert_complete_desc_unit/train.py create mode 100644 train/classification_bert_pattern_desc/.gitignore rename train/{classification_bert => classification_bert_pattern_desc}/classification_prediction/predict.py (100%) rename train/{classification_bert_desc => classification_bert_pattern_desc}/train.py (100%) create mode 100644 train/classification_bert_pattern_desc_unit/.gitignore rename train/{classification_bert_desc => classification_bert_pattern_desc_unit}/classification_prediction/predict.py (100%) rename train/{classification_bert => classification_bert_pattern_desc_unit}/train.py (100%) rename train/{classification_all => classification_t5_complete}/.gitignore (100%) rename train/{classification_all => classification_t5_complete}/train.py (100%) rename train/{classification_all => classification_t5_complete}/utils.py (100%) rename train/{classification_all_with_contrastive => classification_t5_complete_with_contrastive}/.gitignore (100%) rename train/{classification_all_with_contrastive => classification_t5_complete_with_contrastive}/train.py (100%) rename train/{classification_all_with_contrastive => classification_t5_complete_with_contrastive}/utils.py (100%) rename train/{classification_mdm_with_contrastive => classification_t5_mdm_with_contrastive}/.gitignore (100%) rename train/{classification_mdm_with_contrastive => classification_t5_mdm_with_contrastive}/train.py (100%) rename train/{classification_mdm_with_contrastive => classification_t5_mdm_with_contrastive}/utils.py (100%) rename train/{mapping_pattern => mapping_t5_complete_desc_unit}/.gitignore (100%) rename train/{mapping_pattern => mapping_t5_complete_desc_unit}/mapping_prediction/.gitignore (100%) rename train/{mapping_with_unit => mapping_t5_complete_desc_unit}/mapping_prediction/inference.py (100%) rename train/{mapping_with_unit => mapping_t5_complete_desc_unit}/mapping_prediction/predict.py (100%) rename train/{mapping_with_unit => mapping_t5_complete_desc_unit}/train.py (100%) rename train/{mapping_with_unit => mapping_t5_pattern_desc_unit}/.gitignore (100%) rename train/{mapping_with_unit => mapping_t5_pattern_desc_unit}/mapping_prediction/.gitignore (100%) rename train/{mapping_pattern => mapping_t5_pattern_desc_unit}/mapping_prediction/inference.py (100%) rename train/{mapping_pattern => mapping_t5_pattern_desc_unit}/mapping_prediction/output.txt (100%) rename train/{mapping_pattern => mapping_t5_pattern_desc_unit}/mapping_prediction/predict.py (100%) rename train/{mapping_pattern => mapping_t5_pattern_desc_unit}/train.py (100%) diff --git a/analysis/categories/label_print.py b/analysis/categories/label_print.py index 7d649de..a69711c 100644 --- a/analysis/categories/label_print.py +++ b/analysis/categories/label_print.py @@ -8,5 +8,11 @@ mdm_list = sorted(list((set(full_df['pattern'])))) # %% -mdm_list +len(mdm_list) +# %% +thing_property = full_df['thing'] + full_df['property'] +thing_property = thing_property.to_list() +tp_list = sorted(list(set(thing_property))) +# %% +len(tp_list) # %% diff --git a/analysis/ship_data_list/basic_eda.py b/analysis/ship_data_list/basic_eda.py new file mode 100644 index 0000000..daa4224 --- /dev/null +++ b/analysis/ship_data_list/basic_eda.py @@ -0,0 +1,13 @@ +# %% +import pandas as pd + +# %% +data_path = '../../data_import/exports/raw_data.csv' +df = pd.read_csv(data_path) + +# %% +df + +# %% +set(df['signal_type']) +# %% diff --git a/analysis/ship_data_variations/eda.py b/analysis/ship_data_variations/eda.py new file mode 100644 index 
0000000..1e1ef7d --- /dev/null +++ b/analysis/ship_data_variations/eda.py @@ -0,0 +1,15 @@ +# %% +import pandas as pd + +# %% +data_path = '../../data_import/exports/raw_data.csv' +df = pd.read_csv(data_path) + +# %% +df = df[df['MDM']].reset_index(drop=True) + +# %% +set(df['pattern']) +# %% +set(df[df['pattern'] == 'GeneratorEngine# Power']['tag_description'].to_list()) +# %% diff --git a/data_preprocess/abbreviations/replacement_dict.py b/data_preprocess/abbreviations/replacement_dict.py index 18f03fa..c6af452 100644 --- a/data_preprocess/abbreviations/replacement_dict.py +++ b/data_preprocess/abbreviations/replacement_dict.py @@ -1,106 +1,106 @@ # substitution mapping for descriptions # Abbreviations and their replacements desc_replacement_dict = { - r'\bLIST\b\b': 'LIST', - r'\bList\b\b': 'LIST', + r'\bLIST\b': 'LIST', + r'\bList\b': 'LIST', r'\bEXH\.\b': 'EXHAUST', - r'\bEXH\b\b': 'EXHAUST', + r'\bEXH\b': 'EXHAUST', r'\bEXHAUST\.\b': 'EXHAUST', - r'\bExhaust\b\b': 'EXHAUST', - r'\bEXHAUST\b\b': 'EXHAUST', + r'\bExhaust\b': 'EXHAUST', + r'\bEXHAUST\b': 'EXHAUST', r'\bTEMP\.\b': 'TEMPERATURE', - r'\bTEMP\b\b': 'TEMPERATURE', + r'\bTEMP\b': 'TEMPERATURE', r'\bTEMPERATURE\.\b': 'TEMPERATURE', - r'\bTEMPERATURE\b\b': 'TEMPERATURE', + r'\bTEMPERATURE\b': 'TEMPERATURE', r'\bW\.\b': 'WATER', - r'\bWATER\b\b': 'WATER', - r'\bCW\b\b': 'COOLING WATER', + r'\bWATER\b': 'WATER', + r'\bCW\b': 'COOLING WATER', r'\bCYL\.\b': 'CYLINDER', - r'\bCyl\b\b': 'CYLINDER', + r'\bCyl\b': 'CYLINDER', r'\bcyl\.\b': 'CYLINDER', - r'\bCYL\b\b': 'CYLINDER', + r'\bCYL\b': 'CYLINDER', r'\bCYL(?=\d|\W|$)\b': 'CYLINDER', - r'\bcylinder\b\b': 'CYLINDER', - r'\bCYLINDER\b\b': 'CYLINDER', + r'\bcylinder\b': 'CYLINDER', + r'\bCYLINDER\b': 'CYLINDER', r'\bCOOL\.\b': 'COOLING', r'\bcool\.\b': 'COOLING', - r'\bcooling\b\b': 'COOLING', - r'\bCOOLING\b\b': 'COOLING', - r'\bcooler\b\b': 'COOLER', - r'\bCOOLER\b\b': 'COOLER', + r'\bcooling\b': 'COOLING', + r'\bCOOLING\b': 'COOLING', + r'\bcooler\b': 'COOLER', + r'\bCOOLER\b': 'COOLER', r'\bScav\.\b': 'SCAVENGE', r'\bSCAV\.\b': 'SCAVENGE', r'\bINL\.\b': 'INLET', - r'\binlet\b\b': 'INLET', - r'\bINLET\b\b': 'INLET', + r'\binlet\b': 'INLET', + r'\bINLET\b': 'INLET', r'\bOUT\.\b': 'OUTLET', r'\bOUTL\.\b': 'OUTLET', - r'\boutlet\b\b': 'OUTLET', - r'\bOUTLET\b\b': 'OUTLET', + r'\boutlet\b': 'OUTLET', + r'\bOUTLET\b': 'OUTLET', # bunker tank r'\bBK\b': 'BUNKER', r'\bTK\b': 'TANK', # pressure - r'\bPRESS\b\b': 'PRESSURE', + r'\bPRESS\b': 'PRESSURE', r'\bPRESS\.\b': 'PRESSURE', r'\bPress\.\b': 'PRESSURE', - r'\bpressure\b\b': 'PRESSURE', - r'\bPRESSURE\b\b': 'PRESSURE', + r'\bpressure\b': 'PRESSURE', + r'\bPRESSURE\b': 'PRESSURE', # this is a special replacement - it is safe to replace PRS w/o checks r'PRS\b': 'PRESSURE', - r'\bCLR\b\b': 'CLEAR', + r'\bCLR\b': 'CLEAR', r'\bENG\.\b': 'ENGINE', - r'\bENG\b\b': 'ENGINE', - r'\bENGINE\b\b': 'ENGINE', - r'\bEngine speed\b\b': 'ENGINE SPEED', - r'\bEngine running\b\b': 'ENGINE RUNNING', - r'\bEngine RPM pickup\b\b': 'ENGINE RPM PICKUP', - r'\bEngine room\b\b': 'ENGINE ROOM', + r'\bENG\b': 'ENGINE', + r'\bENGINE\b': 'ENGINE', + r'\bEngine speed\b': 'ENGINE SPEED', + r'\bEngine running\b': 'ENGINE RUNNING', + r'\bEngine RPM pickup\b': 'ENGINE RPM PICKUP', + r'\bEngine room\b': 'ENGINE ROOM', # main engine r'\bM/E\b': 'MAIN_ENGINE', r'\bM_E\b': 'MAIN_ENGINE', r'\bME(?=\d|\W|$)\b': 'MAIN_ENGINE', - r'\bMAIN ENGINE\b\b': 'MAIN_ENGINE', - r'\bGen\b\b': 'GENERATOR_ENGINE', + r'\bMAIN ENGINE\b': 'MAIN_ENGINE', + r'\bGen\b': 'GENERATOR_ENGINE', # 
ensure that we substitute only for terms where following GE is num or special r'\bGE(?=\d|\W|$)\b': 'GENERATOR_ENGINE', r'\bG/E\b': 'GENERATOR_ENGINE', r'\bG_E\b': 'GENERATOR_ENGINE', r'\bDG\b': 'GENERATOR_ENGINE', - r'\bD/G\b\b': 'GENERATOR_ENGINE', + r'\bD/G\b': 'GENERATOR_ENGINE', r'\bGEN\.\b': 'GENERATOR_ENGINE', - r'\bGENERATOR ENGINE\B\b': 'GENERATOR_ENGINE', - r'\b(\d+)MGE\b\b': r'NO\1 GENERATOR_ENGINE', - r'\bGEN\.WIND\.TEMP\b\b': 'GENERATOR WINDING TEMPERATURE', - r'\bENGINE ROOM\b\b': 'ENGINE ROOM', - r'\bE/R\b\b': 'ENGINE ROOM', - r'\bFLTR\b\b': 'FILTER', + r'\bGENERATOR ENGINE\b': 'GENERATOR_ENGINE', + r'\b(\d+)MGE\b': r'NO\1 GENERATOR_ENGINE', + r'\bGEN\.WIND\.TEMP\b': 'GENERATOR WINDING TEMPERATURE', + r'\bENGINE ROOM\b': 'ENGINE ROOM', + r'\bE/R\b': 'ENGINE ROOM', + r'\bFLTR\b': 'FILTER', # marine gas oil - r'\bM\.G\.O\b\b': 'MARINE GAS OIL', - r'\bMGO\b\b': 'MARINE GAS OIL', - r'\bMDO\b\b': 'MARINE DIESEL OIL', + r'\bM\.G\.O\b': 'MARINE GAS OIL', + r'\bMGO\b': 'MARINE GAS OIL', + r'\bMDO\b': 'MARINE DIESEL OIL', # light fuel oil - r'\bL\.F\.O\b\b': 'LIGHT FUEL OIL', - r'\bLFO\b\b': 'LIGHT FUEL OIL', + r'\bL\.F\.O\b': 'LIGHT FUEL OIL', + r'\bLFO\b': 'LIGHT FUEL OIL', # heavy fuel oil - r'\bHFO\b\b': 'HEAVY FUEL OIL', - r'\bH\.F\.O\b\b': 'HEAVY FUEL OIL', + r'\bHFO\b': 'HEAVY FUEL OIL', + r'\bH\.F\.O\b': 'HEAVY FUEL OIL', # for remaining fuel oil that couldn't be substituted - r'\bF\.O\b\b': 'FUEL OIL', - r'\bFO\b\b': 'FUEL OIL', + r'\bF\.O\b': 'FUEL OIL', + r'\bFO\b': 'FUEL OIL', # lubricant r'\bLUB\.\b': 'LUBRICANT', # lubricating oil - r'\bL\.O\b\b': 'LUBRICATING OIL', - r'\bLO\b\b': 'LUBRICATING OIL', + r'\bL\.O\b': 'LUBRICATING OIL', + r'\bLO\b': 'LUBRICATING OIL', # lubricating oil pressure - r'\bLO_PRESS\b\b': 'LUBRICATING OIL PRESSURE', - r'\bLO_PRESSURE\b\b': 'LUBRICATING OIL PRESSURE', + r'\bLO_PRESS\b': 'LUBRICATING OIL PRESSURE', + r'\bLO_PRESSURE\b': 'LUBRICATING OIL PRESSURE', # temperature - r'\bL\.T\b\b': 'LOW TEMPERATURE', - r'\bLT\b\b': 'LOW TEMPERATURE', - r'\bH\.T\b\b': 'HIGH TEMPERATURE', - r'\bHT\b\b': 'HIGH TEMPERATURE', + r'\bL\.T\b': 'LOW TEMPERATURE', + r'\bLT\b': 'LOW TEMPERATURE', + r'\bH\.T\b': 'HIGH TEMPERATURE', + r'\bHT\b': 'HIGH TEMPERATURE', # auxiliary boiler # replace these first before replacing AUXILIARY only r'\bAUX\.BOILER\b': 'AUXILIARY BOILER', @@ -108,27 +108,27 @@ desc_replacement_dict = { r'\bAUX BLR\b': 'AUXILIARY BOILER', r'\bAUX\.\b': 'AUXILIARY ', # composite boiler - r'\bCOMP\. BOILER\b\b': 'COMPOSITE BOILER', - r'\bCOMP\.BOILER\b\b': 'COMPOSITE BOILER', - r'\bCOMP BOILER\b\b': 'COMPOSITE BOILER', + r'\bCOMP\. 
BOILER\b': 'COMPOSITE BOILER', + r'\bCOMP\.BOILER\b': 'COMPOSITE BOILER', + r'\bCOMP BOILER\b': 'COMPOSITE BOILER', r'\bWIND\.\b': 'WINDING', - r'\bWINDING\b\b': 'WINDING', - r'\bC\.S\.W\b\b': 'CSW', - r'\bCSW\b\b': 'CSW', + r'\bWINDING\b': 'WINDING', + r'\bC\.S\.W\b': 'CSW', + r'\bCSW\b': 'CSW', r'\bVLOT\.\b': 'VOLTAGE', - r'\bVOLTAGE\b\b': 'VOLTAGE', + r'\bVOLTAGE\b': 'VOLTAGE', r'\bVOLT\.\b': 'VOLTAGE', r'\bFREQ\.\b': 'FREQUENCY', - r'\bFREQUENCY\b\b': 'FREQUENCY', + r'\bFREQUENCY\b': 'FREQUENCY', r'\bCURR\.\b': 'CURRENT', - r'\bCURRENT\b\b': 'CURRENT', - r'\bTCA\b\b': 'TURBOCHARGER', - r'\bTCB\b\b': 'TURBOCHARGER', + r'\bCURRENT\b': 'CURRENT', + r'\bTCA\b': 'TURBOCHARGER', + r'\bTCB\b': 'TURBOCHARGER', r'\bT/C\b': 'TURBOCHARGER', r'\bT_C\b': 'TURBOCHARGER', r'\bTC(?=\d|\W|$)\b': 'TURBOCHARGER', - r'\bTURBOCHAGER\b\b': 'TURBOCHARGER', - r'\bTURBOCHARGER\b\b': 'TURBOCHARGER', + r'\bTURBOCHAGER\b': 'TURBOCHARGER', + r'\bTURBOCHARGER\b': 'TURBOCHARGER', # misc spelling errors r'\bOPERATOIN\b': 'OPERATION', # wrongly attached terms diff --git a/post_process/binary_classifier/.gitignore b/post_process/binary_classifier/.gitignore new file mode 100644 index 0000000..d943a39 --- /dev/null +++ b/post_process/binary_classifier/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log \ No newline at end of file diff --git a/post_process/binary_classifier/classification_prediction/output.txt b/post_process/binary_classifier/classification_prediction/output.txt new file mode 100644 index 0000000..d47704c --- /dev/null +++ b/post_process/binary_classifier/classification_prediction/output.txt @@ -0,0 +1,31 @@ + +******************************************************************************** +Fold: 1 +Accuracy: 0.95342 +F1 Score: 0.91344 +Precision: 0.91643 +Recall: 0.91052 +******************************************************************************** +Fold: 2 +Accuracy: 0.95402 +F1 Score: 0.92950 +Precision: 0.92122 +Recall: 0.93848 +******************************************************************************** +Fold: 3 +Accuracy: 0.95200 +F1 Score: 0.92726 +Precision: 0.91825 +Recall: 0.93712 +******************************************************************************** +Fold: 4 +Accuracy: 0.96473 +F1 Score: 0.92708 +Precision: 0.91566 +Recall: 0.93950 +******************************************************************************** +Fold: 5 +Accuracy: 0.95605 +F1 Score: 0.92244 +Precision: 0.91755 +Recall: 0.92754 diff --git a/post_process/binary_classifier/classification_prediction/predict.py b/post_process/binary_classifier/classification_prediction/predict.py new file mode 100644 index 0000000..3947fe1 --- /dev/null +++ b/post_process/binary_classifier/classification_prediction/predict.py @@ -0,0 +1,214 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + +# %% + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# 
output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + in_mdm_label = int(row['MDM']) + element = { + 'text' : f"{desc}{unit}", + 'label': in_mdm_label, + } + output_list.append(element) + + return output_list + + +def create_dataset(fold): + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + test_df = pd.read_csv(data_path, skipinitialspace=True) + + test_dataset = Dataset.from_list(process_df_to_dict(test_df)) + + return test_dataset + + +# %% + +# function to perform training for a given fold +def test(fold): + + test_dataset = create_dataset(fold) + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint_fold_{fold}' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + # %% + # compute max token length + max_length = 0 + for sample in test_dataset['text']: + # Tokenize the sample and get the length + input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"] + length = len(input_ids) + + # Update max_length if this sample is longer + if length > max_length: + max_length = length + + print(max_length) + + # %% + + max_length = 64 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + # truncation=True, + padding='max_length' + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length") + + # %% + # compute metrics + # metric = evaluate.load("accuracy") + # + # + # def compute_metrics(eval_preds): + # preds, labels = eval_preds + # preds = np.argmax(preds, axis=1) + # return metric.compute(predictions=preds, references=labels) + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=2) + # important! 
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + + BATCH_SIZE = 64 + dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False) + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['label']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred, average='macro') + precision = precision_score(y_true, y_pred, average='macro') + recall = recall_score(y_true, y_pred, average='macro') + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + print(f'Fold: {fold}', file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + test(fold) diff --git a/post_process/binary_classifier/train.py b/post_process/binary_classifier/train.py new file mode 100644 index 0000000..859c076 --- /dev/null +++ b/post_process/binary_classifier/train.py @@ -0,0 +1,210 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +# data_path = '../../data_import/exports/data_mapping_mdm.csv' +# full_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property + + +# %% +id2label = {0: False, 1: True} +label2id = {False: 0, True: 1} + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + in_mdm_label = int(row['MDM']) + element = { + 'text' : f"{desc}{unit}", + 'label': in_mdm_label, + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold): + # train + # data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + + # reconstruct full training data with non-mdm data 
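+    # note (assumption): the pre-split train_all.csv presumably holds only
+    # MDM-mapped rows; a binary classifier also needs non-MDM negatives, so
+    # the train split is rebuilt below from the full preprocessed data by
+    # excluding this fold's held-out test ships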
+ data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + test_df = pd.read_csv(data_path, skipinitialspace=True) + ships_list = list(set(test_df['ships_idx'])) + data_path = '../../data_preprocess/exports/preprocessed_data.csv' + full_df = pd.read_csv(data_path, skipinitialspace=True) + train_df = full_df[~full_df['ships_idx'].isin(ships_list)] + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df)), + }) + return combined_data + + +# %% + +# function to perform training for a given fold +def train(fold): + + save_path = f'checkpoint_fold_{fold}' + split_datasets = create_split_dataset(fold) + + # prepare tokenizer + + model_checkpoint = "distilbert/distilbert-base-uncased" + # model_checkpoint = 'google-bert/bert-base-uncased' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + truncation=True, + padding=True + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=2) + # important! 
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-5, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=40, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + + +# %% diff --git a/post_process/selection_with_pattern/run.py b/post_process/selection_with_pattern/run.py index c5285fb..958aefa 100644 --- a/post_process/selection_with_pattern/run.py +++ b/post_process/selection_with_pattern/run.py @@ -10,7 +10,7 @@ from fuzzywuzzy import fuzz ################## # global parameters DIAGNOSTIC = False -THRESHOLD = 0.85 +THRESHOLD = 0.90 FUZZY_SIM_THRESHOLD=95 checkpoint_directory = "../../train/classification_bert_desc" @@ -264,9 +264,9 @@ def run_selection(fold): # Compute metrics accuracy = accuracy_score(y_true, y_pred) - f1 = f1_score(y_true, y_pred, average='macro') - precision = precision_score(y_true, y_pred, average='macro') - recall = recall_score(y_true, y_pred, average='macro') + f1 = f1_score(y_true, y_pred) + precision = precision_score(y_true, y_pred) + recall = recall_score(y_true, y_pred) # Print the results print(f'Accuracy: {accuracy:.5f}') @@ -287,9 +287,9 @@ def run_selection(fold): # Compute metrics accuracy = accuracy_score(y_true, y_pred) - f1 = f1_score(y_true, y_pred, average='macro') - precision = precision_score(y_true, y_pred, average='macro') - recall = recall_score(y_true, y_pred, average='macro') + f1 = f1_score(y_true, y_pred) + precision = precision_score(y_true, y_pred) + recall = recall_score(y_true, y_pred) # Print the results print(f'Accuracy: {accuracy:.5f}') diff --git a/post_process/ood/.gitignore b/post_process/similarity_classifier/.gitignore similarity index 100% rename from post_process/ood/.gitignore rename to post_process/similarity_classifier/.gitignore diff --git a/post_process/similarity_classifier/README.md b/post_process/similarity_classifier/README.md new file mode 100644 index 0000000..371ad7d --- /dev/null +++ b/post_process/similarity_classifier/README.md @@ -0,0 +1,4 @@ +# one-class classification by similarity + +Purpose: using only Ship Domain attributes, we want to find if the data belongs +to MDM \ No newline at end of file diff --git a/post_process/similarity_classifier/run.py b/post_process/similarity_classifier/run.py new file mode 100644 index 0000000..3f7faeb --- /dev/null +++ b/post_process/similarity_classifier/run.py @@ -0,0 +1,134 @@ +# %% +import pandas as pd +from utils import Retriever, cosine_similarity_chunked +import os +import glob +import 
numpy as np +from tqdm import tqdm +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + +################################################## +# helper functions + + +# the following function takes in a full cos_sim_matrix +# condition_source: boolean selectors of the source embedding +# condition_target: boolean selectors of the target embedding +def find_closest(cos_sim_matrix, condition_source, condition_target): + # subset_matrix = cos_sim_matrix[condition_source] + # except we are subsetting 2D matrix (row, column) + subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)] + # we select top k here + # Get the indices of the top k maximum values along axis 1 + top_k = 3 + top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values + # note that top_k_indices is a nested list because of the 2d nature of the matrix + # argsort returns ascending order, so row 0 is reversed to put its best match first + top_k_indices[0] = top_k_indices[0][::-1] + + # Get the values of the top k maximum scores + top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1) + + + return top_k_indices, top_k_values + + + + +class Embedder(): + input_df: pd.DataFrame + fold: int + + def __init__(self, input_df): + self.input_df = input_df + + + def make_embedding(self, checkpoint_path): + + def generate_input_list(df): + input_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + element = f"{desc}{unit}" + input_list.append(element) + return input_list + + # prepare reference embed + train_data = list(generate_input_list(self.input_df)) + # Define the directory and the pattern + retriever_train = Retriever(train_data, checkpoint_path) + retriever_train.make_embedding(batch_size=64) + return retriever_train.embeddings.to('cpu') + + + +def run_similarity_classifier(fold): + data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv' + test_df = pd.read_csv(data_path, skipinitialspace=True) + + + + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + checkpoint_directory = "../../train/classification_bert_complete_desc_unit" + directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + + train_embedder = Embedder(input_df=train_df) + train_embeds = train_embedder.make_embedding(checkpoint_path) + + test_embedder = Embedder(input_df=test_df) + test_embeds = test_embedder.make_embedding(checkpoint_path) + + def compute_top_k(select_idx): + condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0] + condition_target = np.ones(train_embeds.shape[0], dtype=bool) + + _, top_k_values = find_closest( + cos_sim_matrix=cos_sim_matrix, + condition_source=condition_source, + condition_target=condition_target) + + return top_k_values[0][0] + + + + # test embeds are inputs since we are looking back at train data + cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=1024).cpu().numpy() + + + sim_list = [] + for select_idx in tqdm(test_df.index): + top_sim_value = compute_top_k(select_idx) + sim_list.append(top_sim_value) + + # analysis 1: using threshold to perform find-back prediction success + threshold = 0.90 + predict_list = [ elem > threshold for elem in sim_list ] + + y_true = test_df['MDM'].to_list() + y_pred = predict_list + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred) + precision = precision_score(y_true, y_pred) + recall = recall_score(y_true, y_pred) + + # Print the results + print(f'Accuracy: {accuracy:.5f}') + print(f'F1 Score: {f1:.5f}') + print(f'Precision: {precision:.5f}') + print(f'Recall: {recall:.5f}') + + +# %% + +for fold in [1,2,3,4,5]: + run_similarity_classifier(fold) diff --git a/post_process/ood/similarity.py b/post_process/similarity_classifier/similarity_with_find_back.py similarity index 98% rename from post_process/ood/similarity.py rename to post_process/similarity_classifier/similarity_with_find_back.py index a11a16b..e54c24c 100644 --- a/post_process/ood/similarity.py +++ b/post_process/similarity_classifier/similarity_with_find_back.py @@ -44,7 +44,7 @@ class Embedder(): data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" train_df = pd.read_csv(data_path, skipinitialspace=True) -checkpoint_directory = "../../train/classification_bert" +checkpoint_directory = "../../train/classification_bert_complete_desc_unit" directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') # Use glob to find matching paths # path is usually checkpoint_fold_1/checkpoint- @@ -74,7 +74,7 @@ def find_closest(cos_sim_matrix, condition_source, condition_target): # except we are subsetting 2D matrix (row, column) subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)] # we select top k here - # Get the indices of the top 5 maximum values along axis 1 + # Get the indices of the top k maximum values along axis 1 top_k = 3 + top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values # note that top_k_indices is a nested list because of the 2d nature of the matrix @@ -168,7 +168,7 @@ for select_idx in tqdm(test_df.index): # analysis 1: using threshold to perform find-back prediction success # %% -threshold = 0.9 +threshold = 0.95 predict_list = [ elem > threshold for elem in sim_list ] # %% diff --git a/post_process/ood/utils.py b/post_process/similarity_classifier/utils.py similarity index 100% rename from post_process/ood/utils.py rename to post_process/similarity_classifier/utils.py diff --git a/train/README.md b/train/README.md index 8690a35..6ecb6f4 100644 --- a/train/README.md +++ b/train/README.md @@ -8,5 +8,21 @@ Each folder contains a training variation. After training, each folder contains the checkpoint files for each fold. -`mapping` directory contains the code to run the model on test data and also -produce the csv outputs. \ No newline at end of file +The folders are named with the following convention: + +`<task>`\_`<model>`\_`<output>`\_`<inputs>` + +e.g. + +"classification_bert_complete_desc_unit", + +which means: a folder that performs classification with the BERT model, predicting +the complete thing+property output from the description and unit inputs. + +To train, just run `python train.py`. + +The inference code is within a folder `classification_prediction` or +`mapping_prediction`. + +Note: the classification\_t5 folders are deprecated in favor of the BERT-based +classification models.
\ No newline at end of file diff --git a/train/classification_bert/.gitignore b/train/classification_bert_complete_desc/.gitignore similarity index 100% rename from train/classification_bert/.gitignore rename to train/classification_bert_complete_desc/.gitignore diff --git a/train/classification_bert_complete_desc/classification_prediction/output.txt b/train/classification_bert_complete_desc/classification_prediction/output.txt new file mode 100644 index 0000000..f37a1a6 --- /dev/null +++ b/train/classification_bert_complete_desc/classification_prediction/output.txt @@ -0,0 +1,31 @@ + +******************************************************************************** +Fold: 1 +Accuracy: 0.76337 +F1 Score: 0.37980 +Precision: 0.36508 +Recall: 0.41523 +******************************************************************************** +Fold: 2 +Accuracy: 0.77430 +F1 Score: 0.40473 +Precision: 0.39528 +Recall: 0.43303 +******************************************************************************** +Fold: 3 +Accuracy: 0.77259 +F1 Score: 0.39538 +Precision: 0.37761 +Recall: 0.43633 +******************************************************************************** +Fold: 4 +Accuracy: 0.77545 +F1 Score: 0.39792 +Precision: 0.38636 +Recall: 0.43003 +******************************************************************************** +Fold: 5 +Accuracy: 0.74897 +F1 Score: 0.38827 +Precision: 0.37680 +Recall: 0.42382 diff --git a/train/classification_bert_complete_desc/classification_prediction/predict.py b/train/classification_bert_complete_desc/classification_prediction/predict.py new file mode 100644 index 0000000..b9061ca --- /dev/null +++ b/train/classification_bert_complete_desc/classification_prediction/predict.py @@ -0,0 +1,241 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +data_path = '../../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +# mdm_list = sorted(list((set(full_df['pattern'])))) +thing_property = full_df['thing'] + full_df['property'] +thing_property = thing_property.to_list() +mdm_list = sorted(list(set(thing_property))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(mdm_list): + id2label[idx] = val + label2id[val] = idx + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + # unit = f"{row['unit']}" + + pattern = f"{row['thing'] + row['property']}" + try: + index = mdm_list.index(pattern) + except ValueError: + index = -1 + element = { + 'text' : f"{desc}", + 'label': index, + } + output_list.append(element) + + return output_list + + +def 
create_dataset(fold, mdm_list): + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + test_df = pd.read_csv(data_path, skipinitialspace=True) + # we only use the mdm subset + test_df = test_df[test_df['MDM']].reset_index(drop=True) + + test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list)) + + return test_dataset + + +# %% + +# function to perform training for a given fold +def test(fold): + + test_dataset = create_dataset(fold, mdm_list) + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint_fold_{fold}' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + # %% + # compute max token length + max_length = 0 + for sample in test_dataset['text']: + # Tokenize the sample and get the length + input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"] + length = len(input_ids) + + # Update max_length if this sample is longer + if length > max_length: + max_length = length + + print(max_length) + + # %% + + max_length = 64 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + # truncation=True, + padding='max_length' + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length") + + # %% + # compute metrics + # metric = evaluate.load("accuracy") + # + # + # def compute_metrics(eval_preds): + # preds, labels = eval_preds + # preds = np.argmax(preds, axis=1) + # return metric.compute(predictions=preds, references=labels) + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(mdm_list), + id2label=id2label, + label2id=label2id) + # important! 
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + + BATCH_SIZE = 64 + dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False) + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['label']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred, average='macro') + precision = precision_score(y_true, y_pred, average='macro') + recall = recall_score(y_true, y_pred, average='macro') + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + print(f'Fold: {fold}', file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + test(fold) diff --git a/train/classification_bert_complete_desc/train.py b/train/classification_bert_complete_desc/train.py new file mode 100644 index 0000000..7ef25d2 --- /dev/null +++ b/train/classification_bert_complete_desc/train.py @@ -0,0 +1,216 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +# mdm_list = sorted(list((set(full_df['pattern'])))) +thing_property = full_df['thing'] + full_df['property'] +thing_property = thing_property.to_list() +mdm_list = sorted(list(set(thing_property))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(mdm_list): + id2label[idx] = val + label2id[val] = idx + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + pattern = f"{row['thing'] + row['property']}" + try: + index = 
mdm_list.index(pattern) + except ValueError: + print("Error: value not found in MDM list") + index = -1 + element = { + 'text' : f"{desc}", + 'label': index, + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold, mdm_list): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)), + }) + return combined_data + + +# %% + +# function to perform training for a given fold +def train(fold): + + save_path = f'checkpoint_fold_{fold}' + split_datasets = create_split_dataset(fold, mdm_list) + + # prepare tokenizer + + # model_checkpoint = "distilbert/distilbert-base-uncased" + model_checkpoint = 'google-bert/bert-base-uncased' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + truncation=True, + padding=True + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + # create id2label and label2id + + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(mdm_list), + id2label=id2label, + label2id=label2id) + # important! 
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-5, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=80, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + + +# %% diff --git a/train/classification_bert_desc/.gitignore b/train/classification_bert_complete_desc_unit/.gitignore similarity index 100% rename from train/classification_bert_desc/.gitignore rename to train/classification_bert_complete_desc_unit/.gitignore diff --git a/train/classification_bert_complete_desc_unit/classification_prediction/output.txt b/train/classification_bert_complete_desc_unit/classification_prediction/output.txt new file mode 100644 index 0000000..14e56dd --- /dev/null +++ b/train/classification_bert_complete_desc_unit/classification_prediction/output.txt @@ -0,0 +1,31 @@ + +******************************************************************************** +Fold: 1 +Accuracy: 0.77946 +F1 Score: 0.40686 +Precision: 0.39833 +Recall: 0.43814 +******************************************************************************** +Fold: 2 +Accuracy: 0.78271 +F1 Score: 0.42730 +Precision: 0.42002 +Recall: 0.45670 +******************************************************************************** +Fold: 3 +Accuracy: 0.78715 +F1 Score: 0.41108 +Precision: 0.39829 +Recall: 0.44992 +******************************************************************************** +Fold: 4 +Accuracy: 0.79115 +F1 Score: 0.41810 +Precision: 0.40095 +Recall: 0.45760 +******************************************************************************** +Fold: 5 +Accuracy: 0.76271 +F1 Score: 0.41752 +Precision: 0.41156 +Recall: 0.44899 diff --git a/train/classification_bert_complete_desc_unit/classification_prediction/predict.py b/train/classification_bert_complete_desc_unit/classification_prediction/predict.py new file mode 100644 index 0000000..c5e0be5 --- /dev/null +++ b/train/classification_bert_complete_desc_unit/classification_prediction/predict.py @@ -0,0 +1,241 @@ +# %% + +# from datasets import load_from_disk +import os +import glob + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from torch.utils.data import DataLoader + +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from 
datasets import Dataset, DatasetDict + +from tqdm import tqdm + +torch.set_float32_matmul_precision('high') + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +data_path = '../../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +# mdm_list = sorted(list((set(full_df['pattern'])))) +thing_property = full_df['thing'] + full_df['property'] +thing_property = thing_property.to_list() +mdm_list = sorted(list(set(thing_property))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(mdm_list): + id2label[idx] = val + label2id[val] = idx + +# %% + +# outputs a list of dictionaries +# processes dataframe into lists of dictionaries +# each element maps input to output +# input: tag_description +# output: class label +def process_df_to_dict(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + unit = f"{row['unit']}" + + pattern = f"{row['thing'] + row['property']}" + try: + index = mdm_list.index(pattern) + except ValueError: + index = -1 + element = { + 'text' : f"{desc}{unit}", + 'label': index, + } + output_list.append(element) + + return output_list + + +def create_dataset(fold, mdm_list): + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + test_df = pd.read_csv(data_path, skipinitialspace=True) + # we only use the mdm subset + test_df = test_df[test_df['MDM']].reset_index(drop=True) + + test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list)) + + return test_dataset + + +# %% + +# function to perform training for a given fold +def test(fold): + + test_dataset = create_dataset(fold, mdm_list) + + # prepare tokenizer + + checkpoint_directory = f'../checkpoint_fold_{fold}' + # Use glob to find matching paths + # path is usually checkpoint_fold_1/checkpoint- + # we are guaranteed to save only 1 checkpoint from training + pattern = 'checkpoint-*' + model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0] + + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + # %% + # compute max token length + max_length = 0 + for sample in test_dataset['text']: + # Tokenize the sample and get the length + input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"] + length = len(input_ids) + + # Update max_length if this sample is longer + if length > max_length: + max_length = length + + print(max_length) + + # %% + + max_length = 64 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # text_target sets the corresponding label to inputs + # there is no need to create a separate 'labels' + model_inputs = tokenizer( + input, + max_length=max_length, + # truncation=True, + padding='max_length' + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + datasets = test_dataset.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + + datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) + + # %% temp + # 
tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length") + + # %% + # compute metrics + # metric = evaluate.load("accuracy") + # + # + # def compute_metrics(eval_preds): + # preds, labels = eval_preds + # preds = np.argmax(preds, axis=1) + # return metric.compute(predictions=preds, references=labels) + + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(mdm_list), + id2label=id2label, + label2id=label2id) + # important! after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + model = model.eval() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + + pred_labels = [] + actual_labels = [] + + + BATCH_SIZE = 64 + dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False) + for batch in tqdm(dataloader): + # Inference in batches + input_ids = batch['input_ids'] + attention_mask = batch['attention_mask'] + # save labels too + actual_labels.extend(batch['label']) + + + # Move to GPU if available + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + + # Perform inference + with torch.no_grad(): + logits = model( + input_ids, + attention_mask).logits + predicted_class_ids = logits.argmax(dim=1).to("cpu") + pred_labels.extend(predicted_class_ids) + + pred_labels = [tensor.item() for tensor in pred_labels] + + + # %% + from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix + y_true = actual_labels + y_pred = pred_labels + + # Compute metrics + accuracy = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred, average='macro') + precision = precision_score(y_true, y_pred, average='macro') + recall = recall_score(y_true, y_pred, average='macro') + + with open("output.txt", "a") as f: + + print('*' * 80, file=f) + print(f'Fold: {fold}', file=f) + # Print the results + print(f'Accuracy: {accuracy:.5f}', file=f) + print(f'F1 Score: {f1:.5f}', file=f) + print(f'Precision: {precision:.5f}', file=f) + print(f'Recall: {recall:.5f}', file=f) + + +# %% +# reset file before writing to it +with open("output.txt", "w") as f: + print('', file=f) + +for fold in [1,2,3,4,5]: + test(fold) diff --git a/train/classification_bert_complete_desc_unit/train.py b/train/classification_bert_complete_desc_unit/train.py new file mode 100644 index 0000000..0a0d67c --- /dev/null +++ b/train/classification_bert_complete_desc_unit/train.py @@ -0,0 +1,217 @@ +# %% + +# from datasets import load_from_disk +import os + +os.environ['NCCL_P2P_DISABLE'] = '1' +os.environ['NCCL_IB_DISABLE'] = '1' +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" + +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + EarlyStoppingCallback, + TrainingArguments +) +import evaluate +import numpy as np +import pandas as pd +# import matplotlib.pyplot as plt +from datasets import Dataset, DatasetDict + + + +torch.set_float32_matmul_precision('high') + +# %% + +# we need to create the mdm_list +# import the full mdm-only file +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +# rather than use pattern, we use the real thing and property +# mdm_list = sorted(list((set(full_df['pattern'])))) +thing_property = full_df['thing'] + full_df['property'] 
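+# note: each class label is the thing and property strings concatenated
+# directly; this assumes no two distinct thing/property pairs collapse into
+# the same string (a separator between them would make the labels unambiguous)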
+thing_property = thing_property.to_list() +mdm_list = sorted(list(set(thing_property))) + + +# %% +id2label = {} +label2id = {} +for idx, val in enumerate(mdm_list): + id2label[idx] = val + label2id[val] = idx + +# %% + +# processes the dataframe into a list of dictionaries +# each element maps an input to an output +# input: tag_description + unit +# output: class label index +def process_df_to_dict(df, mdm_list): + output_list = [] + for _, row in df.iterrows(): + desc = f"{row['tag_description']}" + pattern = f"{row['thing'] + row['property']}" + unit = f"{row['unit']}" + try: + index = mdm_list.index(pattern) + except ValueError: + print(f"Error: pattern '{pattern}' not found in MDM list") + index = -1 + element = { + 'text' : f"{desc}{unit}", + 'label': index, + } + output_list.append(element) + + return output_list + + +def create_split_dataset(fold, mdm_list): + # train + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + train_df = pd.read_csv(data_path, skipinitialspace=True) + + # valid + data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv" + validation_df = pd.read_csv(data_path, skipinitialspace=True) + + combined_data = DatasetDict({ + 'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)), + 'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)), + }) + return combined_data + + +# %% + +# function to perform training for a given fold +def train(fold): + + save_path = f'checkpoint_fold_{fold}' + split_datasets = create_split_dataset(fold, mdm_list) + + # prepare tokenizer + + # model_checkpoint = "distilbert/distilbert-base-uncased" + model_checkpoint = 'google-bert/bert-base-uncased' + tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True) + # Define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # Add the additional special tokens to the tokenizer + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + max_length = 120 + + # given a dataset entry, run it through the tokenizer + def preprocess_function(example): + input = example['text'] + # the 'label' column is already numeric, so only the text needs tokenizing + model_inputs = tokenizer( + input, + max_length=max_length, + truncation=True, + padding=True + ) + return model_inputs + + # map maps function to each "row" in the dataset + # aka the data in the immediate nesting + tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + num_proc=8, + remove_columns="text", + ) + + # %% temp + # tokenized_datasets['train'].rename_columns() + + # %% + # create data collator + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # %% + # compute metrics + metric = evaluate.load("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + preds = np.argmax(preds, axis=1) + return metric.compute(predictions=preds, references=labels) + + # %% + model = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, + num_labels=len(mdm_list), + id2label=id2label, + label2id=label2id) + # important!
after extending tokens vocab + model.resize_token_embeddings(len(tokenizer)) + + # model = torch.compile(model, backend="inductor", dynamic=True) + + + # %% + # Trainer + + training_args = TrainingArguments( + output_dir=f"{save_path}", + # eval_strategy="epoch", + eval_strategy="no", + logging_dir="tensorboard-log", + logging_strategy="epoch", + # save_strategy="epoch", + load_best_model_at_end=False, + learning_rate=1e-5, + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + auto_find_batch_size=False, + ddp_find_unused_parameters=False, + weight_decay=0.01, + save_total_limit=1, + num_train_epochs=80, + bf16=True, + push_to_hub=False, + remove_unused_columns=False, + ) + + + trainer = Trainer( + model, + training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], + ) + + # uncomment to load training from checkpoint + # checkpoint_path = 'default_40_1/checkpoint-5600' + # trainer.train(resume_from_checkpoint=checkpoint_path) + + trainer.train() + +# execute training +for fold in [1,2,3,4,5]: + print(fold) + train(fold) + + +# %% diff --git a/train/classification_bert_pattern_desc/.gitignore b/train/classification_bert_pattern_desc/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/train/classification_bert_pattern_desc/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/train/classification_bert/classification_prediction/predict.py b/train/classification_bert_pattern_desc/classification_prediction/predict.py similarity index 100% rename from train/classification_bert/classification_prediction/predict.py rename to train/classification_bert_pattern_desc/classification_prediction/predict.py diff --git a/train/classification_bert_desc/train.py b/train/classification_bert_pattern_desc/train.py similarity index 100% rename from train/classification_bert_desc/train.py rename to train/classification_bert_pattern_desc/train.py diff --git a/train/classification_bert_pattern_desc_unit/.gitignore b/train/classification_bert_pattern_desc_unit/.gitignore new file mode 100644 index 0000000..2c8f0d6 --- /dev/null +++ b/train/classification_bert_pattern_desc_unit/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log diff --git a/train/classification_bert_desc/classification_prediction/predict.py b/train/classification_bert_pattern_desc_unit/classification_prediction/predict.py similarity index 100% rename from train/classification_bert_desc/classification_prediction/predict.py rename to train/classification_bert_pattern_desc_unit/classification_prediction/predict.py diff --git a/train/classification_bert/train.py b/train/classification_bert_pattern_desc_unit/train.py similarity index 100% rename from train/classification_bert/train.py rename to train/classification_bert_pattern_desc_unit/train.py diff --git a/train/classification_all/.gitignore b/train/classification_t5_complete/.gitignore similarity index 100% rename from train/classification_all/.gitignore rename to train/classification_t5_complete/.gitignore diff --git a/train/classification_all/train.py b/train/classification_t5_complete/train.py similarity index 100% rename from train/classification_all/train.py rename to train/classification_t5_complete/train.py diff --git a/train/classification_all/utils.py b/train/classification_t5_complete/utils.py similarity index 100% rename from 
train/classification_all/utils.py rename to train/classification_t5_complete/utils.py diff --git a/train/classification_all_with_contrastive/.gitignore b/train/classification_t5_complete_with_constrastive/.gitignore similarity index 100% rename from train/classification_all_with_contrastive/.gitignore rename to train/classification_t5_complete_with_constrastive/.gitignore diff --git a/train/classification_all_with_contrastive/train.py b/train/classification_t5_complete_with_constrastive/train.py similarity index 100% rename from train/classification_all_with_contrastive/train.py rename to train/classification_t5_complete_with_constrastive/train.py diff --git a/train/classification_all_with_contrastive/utils.py b/train/classification_t5_complete_with_constrastive/utils.py similarity index 100% rename from train/classification_all_with_contrastive/utils.py rename to train/classification_t5_complete_with_constrastive/utils.py diff --git a/train/classification_mdm_with_contrastive/.gitignore b/train/classification_t5_mdm_with_contrastive/.gitignore similarity index 100% rename from train/classification_mdm_with_contrastive/.gitignore rename to train/classification_t5_mdm_with_contrastive/.gitignore diff --git a/train/classification_mdm_with_contrastive/train.py b/train/classification_t5_mdm_with_contrastive/train.py similarity index 100% rename from train/classification_mdm_with_contrastive/train.py rename to train/classification_t5_mdm_with_contrastive/train.py diff --git a/train/classification_mdm_with_contrastive/utils.py b/train/classification_t5_mdm_with_contrastive/utils.py similarity index 100% rename from train/classification_mdm_with_contrastive/utils.py rename to train/classification_t5_mdm_with_contrastive/utils.py diff --git a/train/mapping_pattern/.gitignore b/train/mapping_t5_complete_desc_unit/.gitignore similarity index 100% rename from train/mapping_pattern/.gitignore rename to train/mapping_t5_complete_desc_unit/.gitignore diff --git a/train/mapping_pattern/mapping_prediction/.gitignore b/train/mapping_t5_complete_desc_unit/mapping_prediction/.gitignore similarity index 100% rename from train/mapping_pattern/mapping_prediction/.gitignore rename to train/mapping_t5_complete_desc_unit/mapping_prediction/.gitignore diff --git a/train/mapping_with_unit/mapping_prediction/inference.py b/train/mapping_t5_complete_desc_unit/mapping_prediction/inference.py similarity index 100% rename from train/mapping_with_unit/mapping_prediction/inference.py rename to train/mapping_t5_complete_desc_unit/mapping_prediction/inference.py diff --git a/train/mapping_with_unit/mapping_prediction/predict.py b/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py similarity index 100% rename from train/mapping_with_unit/mapping_prediction/predict.py rename to train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py diff --git a/train/mapping_with_unit/train.py b/train/mapping_t5_complete_desc_unit/train.py similarity index 100% rename from train/mapping_with_unit/train.py rename to train/mapping_t5_complete_desc_unit/train.py diff --git a/train/mapping_with_unit/.gitignore b/train/mapping_t5_pattern_desc_unit/.gitignore similarity index 100% rename from train/mapping_with_unit/.gitignore rename to train/mapping_t5_pattern_desc_unit/.gitignore diff --git a/train/mapping_with_unit/mapping_prediction/.gitignore b/train/mapping_t5_pattern_desc_unit/mapping_prediction/.gitignore similarity index 100% rename from train/mapping_with_unit/mapping_prediction/.gitignore rename to 
train/mapping_t5_pattern_desc_unit/mapping_prediction/.gitignore diff --git a/train/mapping_pattern/mapping_prediction/inference.py b/train/mapping_t5_pattern_desc_unit/mapping_prediction/inference.py similarity index 100% rename from train/mapping_pattern/mapping_prediction/inference.py rename to train/mapping_t5_pattern_desc_unit/mapping_prediction/inference.py diff --git a/train/mapping_pattern/mapping_prediction/output.txt b/train/mapping_t5_pattern_desc_unit/mapping_prediction/output.txt similarity index 100% rename from train/mapping_pattern/mapping_prediction/output.txt rename to train/mapping_t5_pattern_desc_unit/mapping_prediction/output.txt diff --git a/train/mapping_pattern/mapping_prediction/predict.py b/train/mapping_t5_pattern_desc_unit/mapping_prediction/predict.py similarity index 100% rename from train/mapping_pattern/mapping_prediction/predict.py rename to train/mapping_t5_pattern_desc_unit/mapping_prediction/predict.py diff --git a/train/mapping_pattern/train.py b/train/mapping_t5_pattern_desc_unit/train.py similarity index 100% rename from train/mapping_pattern/train.py rename to train/mapping_t5_pattern_desc_unit/train.py diff --git a/translation/README.md b/translation/README.md index 35b1a31..129a5c3 100644 --- a/translation/README.md +++ b/translation/README.md @@ -1,3 +1,3 @@ # translation -These files were from the GRS paper. These codes will not be used. \ No newline at end of file +This section is deprecated in favor of the `train` folder. \ No newline at end of file
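Usage note (not part of the patch): a minimal sketch of how one of the BERT checkpoints trained above could classify a single description at inference time. The checkpoint path, example input text, and max length are assumptions for illustration, not values taken from this commit.

import glob
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# hypothetical path to a checkpoint saved by one of the train.py scripts above
checkpoint = glob.glob('checkpoint_fold_1/checkpoint-*')[0]
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint).eval()

# example input: tag description concatenated with its unit, mirroring process_df_to_dict
inputs = tokenizer("INLET PRESSURE bar", return_tensors="pt", truncation=True, max_length=64)
with torch.no_grad():
    logits = model(**inputs).logits
pred_id = logits.argmax(dim=1).item()
# id2label was stored in the model config at training time
print(model.config.id2label[pred_id])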