From 16374b9ab8d549ae54986ca9592677846df59be5 Mon Sep 17 00:00:00 2001
From: Richard Wong
Date: Thu, 31 Oct 2024 15:58:20 +0900
Subject: [PATCH] Feat: added train and test directories

---
 data_preprocess/split_data.py |  13 ++-
 test/mapping/.gitignore       |   1 +
 test/mapping/inference.py     | 162 ++++++++++++++++++++++
 test/mapping/output.txt       |   6 ++
 test/mapping/predict.py       |  71 ++++++++++
 test/selection/.gitignore     |   1 +
 test/selection/inference.py   | 164 +++++++++++++++++++++++
 test/selection/output.txt     |  56 ++++++++
 test/selection/predict.py     | 129 +++++++++++++++++
 test/selection/selection.py   | 187 ++++++++++++++++++++++++
 test/selection/utils.py       |  76 ++++++++++
 train/baseline/.gitignore     |   2 +
 train/baseline/train.py       | 195 +++++++++++++++++++++++++
 13 files changed, 1059 insertions(+), 4 deletions(-)
 create mode 100644 test/mapping/.gitignore
 create mode 100644 test/mapping/inference.py
 create mode 100644 test/mapping/output.txt
 create mode 100644 test/mapping/predict.py
 create mode 100644 test/selection/.gitignore
 create mode 100644 test/selection/inference.py
 create mode 100644 test/selection/output.txt
 create mode 100644 test/selection/predict.py
 create mode 100644 test/selection/selection.py
 create mode 100644 test/selection/utils.py
 create mode 100644 train/baseline/.gitignore
 create mode 100644 train/baseline/train.py

diff --git a/data_preprocess/split_data.py b/data_preprocess/split_data.py
index b9aa141..50576e7 100644
--- a/data_preprocess/split_data.py
+++ b/data_preprocess/split_data.py
@@ -158,7 +158,7 @@ data_file_path = 'exports/preprocessed_data.csv'
 data = pd.read_csv(data_file_path)
 
 # Filter the data where MDM is True
-mdm_true = data[data['MDM'] == True].copy()  # use .copy() to explicitly create a copy
+mdm_true = data[data['MDM']].copy()  # use .copy() to explicitly create a copy
 mdm_all = data.copy()
 
 # Create a new column combining 'thing' and 'property'
@@ -340,7 +340,7 @@ def save_datasets_for_group(groups, mdm, data, output_dir='exports/dataset', n_s
 
         # Create the test dataset by including only group i
         test_group_ships = groups[i]
-        test_data = mdm[mdm['ships_idx'].isin(test_group_ships)]
+        # test_data = mdm[mdm['ships_idx'].isin(test_group_ships)]
 
         # Extract corresponding entries from the external test dataset
         test_all_data = data[data['ships_idx'].isin(test_group_ships)]
@@ -363,16 +363,21 @@ def save_datasets_for_group(groups, mdm, data, output_dir='exports/dataset', n_s
         train_all_data = pd.concat([final_train_data, valid_data])
 
         # Save datasets to CSV files
+        # train.csv: mdm training set
+        # valid.csv: mdm validation set
+        # test.csv: mdm test set
+        # test_all.csv: all test set with non-mdm
+        # train_all.csv: all train set with non-mdm
         train_file_path = os.path.join(group_folder, 'train.csv')
         valid_file_path = os.path.join(group_folder, 'valid.csv')
-        test_file_path = os.path.join(group_folder, 'test.csv')
+        # test_file_path = os.path.join(group_folder, 'test.csv')
         test_all_file_path = os.path.join(group_folder, 'test_all.csv')
         train_all_file_path = os.path.join(group_folder, 'train_all.csv')
 
         final_train_data.to_csv(train_file_path, index=False, encoding='utf-8-sig')
         valid_data.to_csv(valid_file_path, index=False, encoding='utf-8-sig')
         # test_data.to_csv(test_file_path, index=False, encoding='utf-8-sig')
-        test_all_data.to_csv(test_file_path, index=False, encoding='utf-8-sig')
+        test_all_data.to_csv(test_all_file_path, index=False, encoding='utf-8-sig')
         train_all_data.to_csv(train_all_file_path, index=False, encoding='utf-8-sig')
 
         print(f"Group {i + 1} datasets saved in {group_folder}")
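
A minimal standalone sketch (not part of the patch) of how one fold's exports written by split_data.py could be read back; the directory layout and the role of each CSV follow the comments added in the hunk above, and the relative path assumes the repository root as working directory:

    import pandas as pd

    fold = 1
    base = f"data_preprocess/exports/dataset/group_{fold}"
    train_df = pd.read_csv(f"{base}/train.csv")          # MDM-only training rows
    valid_df = pd.read_csv(f"{base}/valid.csv")          # MDM-only validation rows
    test_all_df = pd.read_csv(f"{base}/test_all.csv")    # test rows, including non-MDM
    train_all_df = pd.read_csv(f"{base}/train_all.csv")  # train rows, including non-MDM
    print(len(train_df), len(valid_df), len(test_all_df), len(train_all_df))
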
diff --git a/test/mapping/.gitignore b/test/mapping/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/test/mapping/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/test/mapping/inference.py b/test/mapping/inference.py
new file mode 100644
index 0000000..2232be0
--- /dev/null
+++ b/test/mapping/inference.py
@@ -0,0 +1,162 @@
+import torch
+from torch.utils.data import DataLoader
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+)
+import os
+from tqdm import tqdm
+from datasets import Dataset
+import numpy as np
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+
+class Inference():
+    tokenizer: T5TokenizerFast
+    model: torch.nn.Module
+    dataloader: DataLoader
+
+    def __init__(self, checkpoint_path):
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+
+
+    def _create_tokenizer(self):
+        # %%
+        # load tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # Define additional special tokens
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
+        # Add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        # load model
+        # Define the directory and the pattern
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # set model to eval
+        self.model = model.eval()
+
+
+
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output
+        """
+        print("preparing dataloader")
+        # convert each dataframe row into a dictionary
+        # outputs a list of dictionaries
+        def _process_df(df):
+            output_list = [{
+                'input': f"{row['tag_description']}",
+                'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+            } for _, row in df.iterrows()]
+
+            return output_list
+
+        def _preprocess_function(example):
+            input = example['input']
+            target = example['output']
+            # text_target sets the corresponding label to inputs
+            # there is no need to create a separate 'labels'
+            model_inputs = self.tokenizer(
+                input,
+                text_target=target,
+                max_length=max_length,
+                return_tensors="pt",
+                padding='max_length',
+                truncation=True,
+            )
+            return model_inputs
+
+        test_dataset = Dataset.from_list(_process_df(input_df))
+
+
+        # map maps function to each "row" in the dataset
+        # aka the data in the immediate nesting
+        datasets = test_dataset.map(
+            _preprocess_function,
+            batched=True,
+            num_proc=1,
+            remove_columns=test_dataset.column_names,
+        )
+        # datasets = _preprocess_function(test_dataset)
+        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+        # create dataloader
+        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+
+
+    def generate(self):
+        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
+        MAX_GENERATE_LENGTH = 128
+
+        pred_generations = []
+        pred_labels = []
+
+        print("start generation")
+        for batch in tqdm(self.dataloader):
+            # Inference in batches
+            input_ids = batch['input_ids']
+            attention_mask = batch['attention_mask']
+            # save labels too
+            pred_labels.extend(batch['labels'])
+
+
+            # Move to GPU if available
+            input_ids = input_ids.to(device)
+            attention_mask = attention_mask.to(device)
+            self.model.to(device)
+
+            # Perform inference
+            with torch.no_grad():
+                outputs = self.model.generate(input_ids,
+                                              attention_mask=attention_mask,
+                                              max_length=MAX_GENERATE_LENGTH)
+
+            # Decode the output and print the results
+            pred_generations.extend(outputs.to("cpu"))
+
+
+
+        # %%
+        # extract sequence and decode
+        def extract_seq(tokens, start_value, end_value):
+            if start_value not in tokens or end_value not in tokens:
+                return None  # Or handle this case according to your requirements
+            start_id = np.where(tokens == start_value)[0][0]
+            end_id = np.where(tokens == end_value)[0][0]
+
+            return tokens[start_id+1:end_id]
+
+
+        def process_tensor_output(tokens):
+            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
+            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
+            p_thing = None
+            p_property = None
+            if (thing_seq is not None):
+                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
+            if (property_seq is not None):
+                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
+            return p_thing, p_property
+
+        # decode prediction labels
+        def decode_preds(tokens_list):
+            thing_prediction_list = []
+            property_prediction_list = []
+            for tokens in tokens_list:
+                p_thing, p_property = process_tensor_output(tokens)
+                thing_prediction_list.append(p_thing)
+                property_prediction_list.append(p_property)
+            return thing_prediction_list, property_prediction_list
+
+        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
+        return thing_prediction_list, property_prediction_list
+
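
Illustrative sketch of the span-extraction step in inference.py above: given a generated id sequence, extract_seq slices out the ids between the thing/property marker ids 32100-32103 and process_tensor_output decodes them. The non-marker ids here are made up for the example:

    import numpy as np

    # fake generated sequence: <pad> <THING_START> ... <THING_END> <PROPERTY_START> ... <PROPERTY_END> </s>
    tokens = np.array([0, 32100, 511, 212, 32101, 32102, 784, 32103, 1])
    start_id = np.where(tokens == 32100)[0][0]
    end_id = np.where(tokens == 32101)[0][0]
    thing_ids = tokens[start_id + 1:end_id]  # -> array([511, 212])
    print(thing_ids)
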
diff --git a/test/mapping/output.txt b/test/mapping/output.txt
new file mode 100644
index 0000000..da49e44
--- /dev/null
+++ b/test/mapping/output.txt
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.9171793658305727
+Accuracy for fold 2: 0.9051401869158878
+Accuracy for fold 3: 0.9688755020080321
+Accuracy for fold 4: 0.9624167459562322
+Accuracy for fold 5: 0.8896014658726523
diff --git a/test/mapping/predict.py b/test/mapping/predict.py
new file mode 100644
index 0000000..bf87cb0
--- /dev/null
+++ b/test/mapping/predict.py
@@ -0,0 +1,71 @@
+
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+checkpoint_directory = '../../train/baseline'
+
+def infer_and_select(fold):
+    print(f"Inference for fold {fold}")
+    # import test data
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # get target data
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+    # processing to help with selection later
+    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
+
+
+    ##########################################
+    # run inference
+    # checkpoint
+    # Use glob to find matching paths
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+    # Use glob to find matching paths
+    # path is usually checkpoint_fold_1/checkpoint-
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=256, max_length=128)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # add labels too
+    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
+    # Convert the list to a Pandas DataFrame
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
+    df = pd.concat([df, df_out], axis=1)
+
+    # we can save the t5 generation output here
+    # df.to_parquet(f"exports/fold_{fold}/t5_output.parquet")
+
+    # here we want to evaluate mapping accuracy within the valid in mdm data only
+    in_mdm = df['MDM']
+    condition_correct_thing = df['p_thing'] == df['thing']
+    condition_correct_property = df['p_property'] == df['property']
+    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm)
+    pred_correct_proportion = prediction_mdm_correct/sum(in_mdm)
+
+    # write output to file output.txt
+    with open("output.txt", "a") as f:
+        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
+
+###########################################
+# Execute for all folds
+
+# reset file before writing to it
+with open("output.txt", "w") as f:
+    print('', file=f)
+
+for fold in [1,2,3,4,5]:
+    infer_and_select(fold)
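
Sketch of the fold-level accuracy written to test/mapping/output.txt above: a prediction only counts when the row is an in-MDM row and both generated fields match the ground truth. The toy frame below is made up and only mirrors the column names used in predict.py:

    import pandas as pd

    df = pd.DataFrame({
        'thing': ['A', 'B', 'C'], 'property': ['x', 'y', 'z'],
        'p_thing': ['A', 'B', 'C'], 'p_property': ['x', 'q', 'z'],
        'MDM': [True, True, False],
    })
    correct = (df['p_thing'] == df['thing']) & (df['p_property'] == df['property']) & df['MDM']
    print(sum(correct) / sum(df['MDM']))  # 0.5
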
diff --git a/test/selection/.gitignore b/test/selection/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/test/selection/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/test/selection/inference.py b/test/selection/inference.py
new file mode 100644
index 0000000..896cb72
--- /dev/null
+++ b/test/selection/inference.py
@@ -0,0 +1,164 @@
+import torch
+from torch.utils.data import DataLoader
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+)
+import glob
+import os
+import pandas as pd
+from tqdm import tqdm
+from datasets import Dataset
+import numpy as np
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+
+class Inference():
+    tokenizer: T5TokenizerFast
+    model: torch.nn.Module
+    dataloader: DataLoader
+
+    def __init__(self, checkpoint_path):
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+
+
+    def _create_tokenizer(self):
+        # %%
+        # load tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # Define additional special tokens
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
+        # Add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        # load model
+        # Define the directory and the pattern
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # set model to eval
+        self.model = model.eval()
+
+
+
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output
+        """
+        print("preparing dataloader")
+        # convert each dataframe row into a dictionary
+        # outputs a list of dictionaries
+        def _process_df(df):
+            output_list = [{
+                'input': f"{row['tag_description']}",
+                'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+            } for _, row in df.iterrows()]
+
+            return output_list
+
+        def _preprocess_function(example):
+            input = example['input']
+            target = example['output']
+            # text_target sets the corresponding label to inputs
+            # there is no need to create a separate 'labels'
+            model_inputs = self.tokenizer(
+                input,
+                text_target=target,
+                max_length=max_length,
+                return_tensors="pt",
+                padding='max_length',
+                truncation=True,
+            )
+            return model_inputs
+
+        test_dataset = Dataset.from_list(_process_df(input_df))
+
+
+        # map maps function to each "row" in the dataset
+        # aka the data in the immediate nesting
+        datasets = test_dataset.map(
+            _preprocess_function,
+            batched=True,
+            num_proc=1,
+            remove_columns=test_dataset.column_names,
+        )
+        # datasets = _preprocess_function(test_dataset)
+        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+        # create dataloader
+        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+
+
+    def generate(self):
+        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
+        MAX_GENERATE_LENGTH = 128
+
+        pred_generations = []
+        pred_labels = []
+
+        print("start generation")
+        for batch in tqdm(self.dataloader):
+            # Inference in batches
+            input_ids = batch['input_ids']
+            attention_mask = batch['attention_mask']
+            # save labels too
+            pred_labels.extend(batch['labels'])
+
+
+            # Move to GPU if available
+            input_ids = input_ids.to(device)
+            attention_mask = attention_mask.to(device)
+            self.model.to(device)
+
+            # Perform inference
+            with torch.no_grad():
+                outputs = self.model.generate(input_ids,
+                                              attention_mask=attention_mask,
+                                              max_length=MAX_GENERATE_LENGTH)
+
+            # Decode the output and print the results
+            pred_generations.extend(outputs.to("cpu"))
+
+
+
+        # %%
+        # extract sequence and decode
+        def extract_seq(tokens, start_value, end_value):
+            if start_value not in tokens or end_value not in tokens:
+                return None  # Or handle this case according to your requirements
+            start_id = np.where(tokens == start_value)[0][0]
+            end_id = np.where(tokens == end_value)[0][0]
+
+            return tokens[start_id+1:end_id]
+
+
+        def process_tensor_output(tokens):
+            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
+            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
+            p_thing = None
+            p_property = None
+            if (thing_seq is not None):
+                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
+            if (property_seq is not None):
+                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
+            return p_thing, p_property
+
+        # decode prediction labels
+        def decode_preds(tokens_list):
+            thing_prediction_list = []
+            property_prediction_list = []
+            for tokens in tokens_list:
+                p_thing, p_property = process_tensor_output(tokens)
+                thing_prediction_list.append(p_thing)
+                property_prediction_list.append(p_property)
+            return thing_prediction_list, property_prediction_list
+
+        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
+        return thing_prediction_list, property_prediction_list
+
diff --git a/test/selection/output.txt b/test/selection/output.txt
new file mode 100644
index 0000000..245955f
--- /dev/null
+++ b/test/selection/output.txt
@@ -0,0 +1,56 @@
+
+********************************************************************************
+Statistics for fold 1
+tp: 1792
+tn: 10533
+fp: 428
+fn: 321
+fold: 1
+accuracy: 0.9427107235735047
+f1_score: 0.827140549273021
+precision: 0.8072072072072072
+recall: 0.8480832938949361
+********************************************************************************
+Statistics for fold 2
+tp: 1875
+tn: 8189
+fp: 393
+fn: 265
+fold: 2
+accuracy: 0.9386308524529006
+f1_score: 0.8507259528130672
+precision: 0.8267195767195767
+recall: 0.8761682242990654
+********************************************************************************
+Statistics for fold 3
+tp: 1831
+tn: 7455
+fp: 408
+fn: 161
+fold: 3
+accuracy: 0.9422628107559614
+f1_score: 0.8655164263767431
+precision: 0.8177757927646271
+recall: 0.9191767068273092
+********************************************************************************
+Statistics for fold 4
+tp: 1909
+tn: 12866
+fp: 483
+fn: 193
+fold: 4
+accuracy: 0.9562487864863116
+f1_score: 0.8495772140631954
+precision: 0.7980769230769231
+recall: 0.9081826831588963
+********************************************************************************
+Statistics for fold 5
+tp: 1928
+tn: 10359
+fp: 427
+fn: 255
+fold: 5
+accuracy: 0.9474130619168787
+f1_score: 0.8497135301895108
+precision: 0.818683651804671
+recall: 0.8831882730187814
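
The metrics in the output.txt above follow the usual confusion-matrix definitions used in test/selection/predict.py; a quick check against the fold 1 counts (tp=1792, tn=10533, fp=428, fn=321):

    tp, tn, fp, fn = 1792, 10533, 428, 321
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * tp / (2 * tp + fp + fn)
    print(accuracy, precision, recall, f1)  # ~0.9427, ~0.8072, ~0.8481, ~0.8271
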
diff --git a/test/selection/predict.py b/test/selection/predict.py
new file mode 100644
index 0000000..2338cf9
--- /dev/null
+++ b/test/selection/predict.py
@@ -0,0 +1,129 @@
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+# directory for checkpoints
+checkpoint_directory = '../../train/baseline'
+
+def infer_and_select(fold):
+    # import test data
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # get target data
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+    # processing to help with selection later
+    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
+
+
+    ##########################################
+    # run inference
+    # checkpoint
+    # Use glob to find matching paths
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+    # Use glob to find matching paths
+    # path is usually checkpoint_fold_1/checkpoint-
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=256, max_length=64)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # add labels too
+    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
+    # Convert the list to a Pandas DataFrame
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
+    df = pd.concat([df, df_out], axis=1)
+
+    ##########################################
+    # Process the dataframe for selection
+
+    # we start to cull predictions from here
+    data_master_path = f"../../data_import/exports/data_model_master_export.csv"
+    df_master = pd.read_csv(data_master_path, skipinitialspace=True)
+    data_mapping = df
+    # Generate patterns
+    data_mapping['thing_pattern'] = data_mapping['thing'].str.replace(r'\d', '#', regex=True)
+    data_mapping['property_pattern'] = data_mapping['property'].str.replace(r'\d', '#', regex=True)
+    data_mapping['pattern'] = data_mapping['thing_pattern'] + " " + data_mapping['property_pattern']
+    df_master['master_pattern'] = df_master['thing'] + " " + df_master['property']
+    # Create a set of unique patterns from master for fast lookup
+    master_patterns = set(df_master['master_pattern'])
+    # thing_patterns = set(df_master['thing'])
+    # Check each pattern in data_mapping if it exists in df_master and assign the "MDM" field
+    data_mapping['MDM'] = data_mapping['pattern'].apply(lambda x: x in master_patterns)
+
+    # check if prediction is in MDM
+    data_mapping['p_thing_pattern'] = data_mapping['p_thing'].str.replace(r'\d', '#', regex=True)
+    data_mapping['p_property_pattern'] = data_mapping['p_property'].str.replace(r'\d', '#', regex=True)
+    data_mapping['p_pattern'] = data_mapping['p_thing_pattern'] + " " + data_mapping['p_property_pattern']
+    data_mapping['p_MDM'] = data_mapping['p_pattern'].apply(lambda x: x in master_patterns)
+
+    df = data_mapping
+
+    # we can save the t5 generation output here
+    # df.to_parquet(f"exports/fold_{fold}/t5_output.parquet")
+
+
+
+    condition1 = df['MDM']
+    condition2 = df['p_MDM']
+
+    condition_correct_thing = df['p_thing'] == df['thing']
+    condition_correct_property = df['p_property'] == df['property']
+    match = sum(condition1 & condition2)
+    fn = sum(condition1 & ~condition2)
+    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)
+
+    # print("mdm match predicted mdm: ", match)
+    # print("mdm but not predicted mdm: ", fn)  # 56 - false negative
+    # print("total mdm: ", sum(condition1))  # 2113
+    # print("total predicted mdm: ", sum(condition2))  # 6896 - a lot of false positives
+    # print("correct mdm predicted", prediction_mdm_correct)
+
+
+    # selection
+    ###########################################
+    # we now have to perform selection
+    # we restrict to predictions of a class of a ship
+    # then perform similarity selection with in-distribution data
+    # the magic is in performing per-class selection, not global
+    # import importlib
+    import selection
+    # importlib.reload(selection)
+    selector = selection.Selector(input_df=df, reference_df=train_df, fold=fold)
+    tp, tn, fp, fn = selector.run_selection(checkpoint_path=checkpoint_path)
+
+
+    # write output to file output.txt
+    with open("output.txt", "a") as f:
+        print(80 * '*', file=f)
+        print(f'Statistics for fold {fold}', file=f)
+        print(f"tp: {tp}", file=f)
+        print(f"tn: {tn}", file=f)
+        print(f"fp: {fp}", file=f)
+        print(f"fn: {fn}", file=f)
+        print(f"fold: {fold}", file=f)
+        print("accuracy: ", (tp+tn)/(tp+tn+fp+fn), file=f)
+        print("f1_score: ", (2*tp)/((2*tp) + fp + fn), file=f)
+        print("precision: ", (tp)/(tp+fp), file=f)
+        print("recall: ", (tp)/(tp+fn), file=f)
+
+###########################################
+# Execute for all folds
+
+# reset file before writing to it
+with open("output.txt", "w") as f:
+    print('', file=f)
+
+for fold in [1,2,3,4,5]:
+    infer_and_select(fold)
diff --git a/test/selection/selection.py b/test/selection/selection.py
new file mode 100644
index 0000000..fdbf292
--- /dev/null
+++ b/test/selection/selection.py
@@ -0,0 +1,187 @@
+import pandas as pd
+import numpy as np
+from typing import List
+from tqdm import tqdm
+from utils import Retriever, cosine_similarity_chunked
+
+class Selector():
+    input_df: pd.DataFrame
+    reference_df: pd.DataFrame
+    ships_list: List[int]
+    fold: int
+
+    def __init__(self, input_df, reference_df, fold):
+        self.ships_list = sorted(list(set(input_df['ships_idx'])))
+        self.input_df = input_df
+        self.reference_df = reference_df
+        self.fold = fold
+
+
+    def run_selection(self, checkpoint_path):
+
+        def generate_input_list(df):
+            input_list = []
+            for _, row in df.iterrows():
+                # name = f"{row['tag_name']}"
+                desc = f"{row['tag_description']}"
+                # element = f"{name}{desc}"
+                element = f"{desc}"
+                input_list.append(element)
+            return input_list
+
+        # given a dataframe, return the single idx of the entry that has the highest
+        # match with the embedding
+        def selection(cos_sim_matrix, condition_source, condition_target):
+            # subset_matrix = cos_sim_matrix[condition_source]
+            # except we are subsetting 2D matrix (row, column)
+            subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
+            # we select top k here
+            # Get the indices of the top 5 maximum values along axis 1
+            top_k = 1
+            top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:]  # Get indices of top k values
+
+            # Get the values of the top 5 maximum scores
+            top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
+
+            # Calculate the average of the top 5 scores along axis 1
+            y_scores = np.mean(top_k_values, axis=1)
+            max_idx = np.argmax(y_scores)
+            max_score = y_scores[max_idx]
+            # convert boolean to indices (1,2,3)
+            condition_indices = np.where(condition_source)[0]
+            max_idx = condition_indices[max_idx]
+
+            return max_idx, max_score
+
+
+        # prepare reference embed
+        train_data = list(generate_input_list(self.reference_df))
+        # Define the directory and the pattern
+        retriever_train = Retriever(train_data, checkpoint_path)
+        retriever_train.make_mean_embedding(batch_size=64)
+        train_embed = retriever_train.embeddings
+
+        # take the inputs for df_sub
+        test_data = list(generate_input_list(self.input_df))
+        retriever_test = Retriever(test_data, checkpoint_path)
+        retriever_test.make_mean_embedding(batch_size=64)
+        test_embed = retriever_test.embeddings
+
+
+
+        # precision_list = []
+        # recall_list = []
+        tp_accumulate = 0
+        tn_accumulate = 0
+        fp_accumulate = 0
+        fn_accumulate = 0
+        THRESHOLD = 0.9
+        for ship_idx in self.ships_list:
+            print(ship_idx)
+            # we select a ship and select only data exhibiting MDM pattern in the predictions
+            ship_mask = (self.input_df['ships_idx'] == ship_idx) & (self.input_df['p_MDM'])
+            df_ship = self.input_df[ship_mask].reset_index(drop=True)
+            # we then try to make a dataframe for each thing_property attribute
+            df_ship['thing_property'] = df_ship['p_thing'] + " " + df_ship['p_property']
+            unique_patterns = list(set(df_ship['thing_property']))
+            condition_list = []
+            for pattern in unique_patterns:
+                # we obtain the boolean mask to subset the source and target entries
+                condition_source = (df_ship['thing_property'] == pattern)
+                condition_target = (self.reference_df['thing_property'] == pattern)
+                item = {'condition_source': condition_source,
+                        'condition_target': condition_target}
+                condition_list.append(item)
+
+            # subset part of self.input_df that belongs to the ship
+            test_embed_subset = test_embed[ship_mask]
+            cos_sim_matrix = cosine_similarity_chunked(test_embed_subset, train_embed, chunk_size=8).cpu().numpy()
+
+
+            # for each sub_df, we have to select the best candidate
+            # we will do this by finding which desc input has the highest similarity with train data
+            all_idx_list = []
+            selected_idx_list = []
+            similarity_score = []
+            for item in tqdm(condition_list):
+                condition_source = item['condition_source']
+                condition_target = item['condition_target']
+                # if there is no equivalent data in target, we skip
+                if sum(condition_target) == 0:
+                    pass
+                # if there is equivalent data in target, we perform selection among source
+                # by top-k highest similarity with targets
+                else:
+                    # idx is with respect
+                    max_idx, max_score = selection(
+                        cos_sim_matrix, condition_source, condition_target
+                    )
+                    all_idx_list.append(max_idx)
+                    similarity_score.append(max_score)
+                    # implement thresholding
+                    if max_score > THRESHOLD:
+                        selected_idx_list.append(max_idx)
+
+            # let us tag the df_ship with the respective 'selected' and 'ood' tags
+            df_ship['selected'] = False
+            df_ship.loc[all_idx_list, 'selected'] = True
+            df_ship['ood'] = 0.0
+            df_ship.loc[all_idx_list, 'ood'] = similarity_score
+
+            # we now split the dataframe by p_mdm
+            # explanation:
+            # we first separated our ship into p_mdm and non p_mdm
+            # we only select final in-mdm prediction from p_mdm subset
+            # anything that is not selected and from non-p_mdm is predicted not in mdm
+
+            # get our final prediction
+            df_subset_predicted_true = df_ship.loc[selected_idx_list]
+            # take the set difference between df_ship's index and the given list
+            inverse_list = df_ship.index.difference(selected_idx_list).to_list()
+            df_subset_predicted_false = df_ship.loc[inverse_list]
+
+            not_p_mdm_mask = (self.input_df['ships_idx'] == ship_idx) & (~self.input_df['p_MDM'])
+            # this is the part we don't care about
+            df_not_p_mdm = self.input_df[not_p_mdm_mask].reset_index(drop=True)
+
+            # concat
+            df_false = pd.concat([df_subset_predicted_false, df_not_p_mdm], axis=0)
+            assert(len(df_false) + len(df_subset_predicted_true) == sum(self.input_df['ships_idx'] == ship_idx))
+
+            # we want to return a df with the final prediction
+            # a bit dirty, but we re-use the fields
+            df_false['p_MDM'] = False
+            df_subset_predicted_true['p_MDM'] = True
+
+
+            # save ship for analysis later
+            # df_return = pd.concat([df_false, df_subset_predicted_true], axis=0)
+            # df_return.to_parquet(f'exports/fold_{self.fold}/ship_{ship_idx}.parquet')
+
+
+
+
+            # true positive -> predicted in mdm, actual in mdm
+            # we get all the final predictions that are also found in MDM
+            true_positive = sum(df_subset_predicted_true['MDM'])
+            # true negative -> predicted not in mdm, and not found in MDM
+            # we negate the condition to get those that are not found in MDM
+            true_negative = sum(~df_false['MDM'])
+            # false positive -> predicted in mdm, not found in mdm
+            false_positive = sum(~df_subset_predicted_true['MDM'])
+            # false negative -> predicted not in mdm, found in mdm
+            false_negative = sum(df_false['MDM'])
+
+
+            tp_accumulate = tp_accumulate + true_positive
+            tn_accumulate = tn_accumulate + true_negative
+            fp_accumulate = fp_accumulate + false_positive
+            fn_accumulate = fn_accumulate + false_negative
+
+
+
+        total_sum = (tp_accumulate + tn_accumulate + fp_accumulate + fn_accumulate)
+        # ensure that all entries are accounted for
+        assert(total_sum == len(self.input_df))
+        return tp_accumulate, tn_accumulate, fp_accumulate, fn_accumulate
+
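
Standalone sketch of the per-pattern selection step implemented in Selector.run_selection above: within one predicted thing/property pattern, pick the source row whose top-k similarity to the reference rows is highest, and keep it only if it clears the 0.9 threshold. The similarity matrix below is a toy example, not real data:

    import numpy as np

    cos_sim = np.array([[0.95, 0.20], [0.40, 0.30], [0.10, 0.92]])  # rows: source, cols: target
    condition_source = np.array([True, True, False])  # rows sharing the current pattern
    condition_target = np.array([True, True])         # reference rows sharing the pattern
    top_k = 1
    sub = cos_sim[np.ix_(condition_source, condition_target)]
    scores = np.mean(np.take_along_axis(sub, np.argsort(sub, axis=1)[:, -top_k:], axis=1), axis=1)
    best = np.where(condition_source)[0][np.argmax(scores)]
    print(best, scores.max(), scores.max() > 0.9)  # row 0 is selected and passes the threshold
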
diff --git a/test/selection/utils.py b/test/selection/utils.py
new file mode 100644
index 0000000..b2f2116
--- /dev/null
+++ b/test/selection/utils.py
@@ -0,0 +1,76 @@
+import torch
+from tqdm import tqdm
+from transformers import AutoTokenizer
+from transformers import AutoModelForSeq2SeqLM
+import torch.nn.functional as F
+
+
+
+class Retriever:
+    def __init__(self, input_texts, model_checkpoint):
+        # we need to generate the embedding from list of input strings
+        self.embeddings = []
+        self.inputs = input_texts
+        model_checkpoint = model_checkpoint
+        self.tokenizer = AutoTokenizer.from_pretrained("t5-base", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # define additional special tokens
+        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
+        # add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+        self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
+        # device = "cpu"
+        model.to(self.device)
+        self.model = model.eval()
+
+
+
+
+    def make_mean_embedding(self, batch_size=32):
+        all_embeddings = self.embeddings
+        input_texts = self.inputs
+
+        for i in range(0, len(input_texts), batch_size):
+            batch_texts = input_texts[i:i+batch_size]
+            # Tokenize the input text
+            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
+            input_ids = inputs.input_ids.to(self.device)
+            attention_mask = inputs.attention_mask.to(self.device)
+
+
+            # Pass the input through the encoder and retrieve the embeddings
+            with torch.no_grad():
+                encoder_outputs = self.model.encoder(input_ids, attention_mask=attention_mask)
+                embeddings = encoder_outputs.last_hidden_state
+
+            # Compute the mean pooling of the token embeddings
+            # mean_embedding = embeddings.mean(dim=1)
+            mean_embedding = (embeddings * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)
+            all_embeddings.append(mean_embedding)
+
+        # remove the batch list and makes a single large tensor, dim=0 increases row-wise
+        all_embeddings = torch.cat(all_embeddings, dim=0)
+
+        self.embeddings = all_embeddings
+
+def cosine_similarity_chunked(batch1, batch2, chunk_size=16):
+    batch1_size = batch1.size(0)
+    batch2_size = batch2.size(0)
+
+    # Prepare an empty tensor to store results
+    cos_sim = torch.empty(batch1_size, batch2_size, device=batch1.device)
+
+    # Process batch1 in chunks
+    for i in range(0, batch1_size, chunk_size):
+        batch1_chunk = batch1[i:i + chunk_size]  # Get chunk of batch1
+
+        # Expand batch1 chunk and entire batch2 for comparison
+        batch1_chunk_exp = batch1_chunk.unsqueeze(1)  # Shape: (chunk_size, 1, seq_len)
+        batch2_exp = batch2.unsqueeze(0)  # Shape: (1, batch2_size, seq_len)
+
+        # Compute cosine similarity for the chunk and store it in the final tensor
+        cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1)
+
+    return cos_sim
+
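
Small usage sketch for the two helpers above (mean-pooled encoder embeddings, then chunked cosine similarity); the checkpoint path and the example strings are hypothetical, only the shapes matter:

    from utils import Retriever, cosine_similarity_chunked

    texts_a = ["centrifugal pump speed", "main engine exhaust temp"]
    texts_b = ["pump speed", "exhaust gas temperature"]
    ckpt = "checkpoint_fold_1/checkpoint-1000"  # hypothetical trained checkpoint
    retriever_a = Retriever(texts_a, ckpt)
    retriever_a.make_mean_embedding(batch_size=2)
    retriever_b = Retriever(texts_b, ckpt)
    retriever_b.make_mean_embedding(batch_size=2)
    sim = cosine_similarity_chunked(retriever_a.embeddings, retriever_b.embeddings, chunk_size=16)
    print(sim.shape)  # torch.Size([2, 2])
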
diff --git a/train/baseline/.gitignore b/train/baseline/.gitignore
new file mode 100644
index 0000000..2e7f3f7
--- /dev/null
+++ b/train/baseline/.gitignore
@@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log/
diff --git a/train/baseline/train.py b/train/baseline/train.py
new file mode 100644
index 0000000..f79f2fc
--- /dev/null
+++ b/train/baseline/train.py
@@ -0,0 +1,195 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+    DataCollatorForSeq2Seq,
+    Seq2SeqTrainer,
+    EarlyStoppingCallback,
+    Seq2SeqTrainingArguments
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+
+
+torch.set_float32_matmul_precision('high')
+
+# outputs a list of dictionaries
+def process_df_to_dict(df):
+    output_list = []
+    for _, row in df.iterrows():
+        desc = f"{row['tag_description']}"
+        element = {
+            'input' : f"{desc}",
+            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_split_dataset(fold):
+    # train
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # valid
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
+    validation_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    combined_data = DatasetDict({
+        'train': Dataset.from_list(process_df_to_dict(train_df)),
+        'validation' : Dataset.from_list(process_df_to_dict(validation_df)),
+    })
+    return combined_data
+
+
+# function to perform training for a given fold
+def train(fold):
+    save_path = f'checkpoint_fold_{fold}'
+    split_datasets = create_split_dataset(fold)
+
+    # prepare tokenizer
+
+    model_checkpoint = "t5-small"
+    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
+    # Add the additional special tokens to the tokenizer
+    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    max_length = 120
+
+    # given a dataset entry, run it through the tokenizer
+    def preprocess_function(example):
+        input = example['input']
+        target = example['output']
+        # text_target sets the corresponding label to inputs
+        # there is no need to create a separate 'labels'
+        model_inputs = tokenizer(
+            input,
+            text_target=target,
+            max_length=max_length,
+            truncation=True,
+            padding=True
+        )
+        return model_inputs
+
+    # map maps function to each "row" in the dataset
+    # aka the data in the immediate nesting
+    tokenized_datasets = split_datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns=split_datasets["train"].column_names,
+    )
+
+    # https://github.com/huggingface/transformers/pull/28414
+    # model_checkpoint = "google/t5-efficient-tiny"
+    # device_map set to auto to force it to load contiguous weights
+    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
+
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+    # important! after extending tokens vocab
+    model.resize_token_embeddings(len(tokenizer))
+
+    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+    metric = evaluate.load("sacrebleu")
+
+
+    def compute_metrics(eval_preds):
+        preds, labels = eval_preds
+        # In case the model returns more than the prediction logits
+        if isinstance(preds, tuple):
+            preds = preds[0]
+
+        decoded_preds = tokenizer.batch_decode(preds,
+                                               skip_special_tokens=False)
+
+        # Replace -100s in the labels as we can't decode them
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+        decoded_labels = tokenizer.batch_decode(labels,
+                                                skip_special_tokens=False)
+
+        # Remove padding tokens from decoded predictions and labels
+        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
+        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
+
+        # Some simple post-processing
+        # decoded_preds = [pred.strip() for pred in decoded_preds]
+        # decoded_labels = [[label.strip()] for label in decoded_labels]
+        # print(decoded_preds, decoded_labels)
+
+        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
+        return {"bleu": result["score"]}
+
+
+    # Generation Config
+    # from transformers import GenerationConfig
+    gen_config = model.generation_config
+    gen_config.max_length = 64
+
+    # compile
+    # model = torch.compile(model, backend="inductor", dynamic=True)
+
+
+    # Trainer
+
+    args = Seq2SeqTrainingArguments(
+        f"{save_path}",
+        eval_strategy="epoch",
+        logging_dir="tensorboard-log",
+        logging_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+        learning_rate=1e-3,
+        per_device_train_batch_size=64,
+        per_device_eval_batch_size=64,
+        auto_find_batch_size=False,
+        ddp_find_unused_parameters=False,
+        weight_decay=0.01,
+        save_total_limit=1,
+        num_train_epochs=40,
+        predict_with_generate=True,
+        bf16=True,
+        push_to_hub=False,
+        generation_config=gen_config,
+        remove_unused_columns=False,
+    )
+
+
+    trainer = Seq2SeqTrainer(
+        model,
+        args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["validation"],
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+    )
+
+    # uncomment to load training from checkpoint
+    # checkpoint_path = 'default_40_1/checkpoint-5600'
+    # trainer.train(resume_from_checkpoint=checkpoint_path)
+
+    trainer.train()
+
+# execute training
+for fold in [1,2,3,4,5]:
+    print(fold)
+    train(fold)
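
Sanity-check sketch (not part of the patch): the span-extraction ids 32100-32103 used in test/*/inference.py only line up if the thing/property markers are the first tokens added on top of the 32100-entry t5-small tokenizer vocabulary, and if the embedding matrix is resized after adding them, as train.py does above. The marker strings follow the token list used in this patch:

    from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM

    tokenizer = T5TokenizerFast.from_pretrained("t5-small")
    tokenizer.add_special_tokens({"additional_special_tokens":
        ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>"]})
    print(tokenizer.convert_tokens_to_ids("<THING_START>"))  # expected: 32100

    model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
    model.resize_token_embeddings(len(tokenizer))
    print(model.get_input_embeddings().weight.shape[0])  # now matches len(tokenizer): 32104
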