diff --git a/post_process/README.md b/post_process/README.md
new file mode 100644
index 0000000..5a135d8
--- /dev/null
+++ b/post_process/README.md
@@ -0,0 +1,9 @@
+# Post-Processing
+
+## What is this folder
+
+This folder contains the files for post-processing.
+
+Each post-processing method lives in its own folder. This modularizes the
+post-processing methods, making it easier to test different methods and
+reducing coupling between stages.
diff --git a/post_process/classification/train.py b/post_process/classification/train.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/mapping/.gitignore b/post_process/selection/.gitignore
similarity index 100%
rename from test/mapping/.gitignore
rename to post_process/selection/.gitignore
diff --git a/test/selection/output.txt b/post_process/selection/output.txt
similarity index 100%
rename from test/selection/output.txt
rename to post_process/selection/output.txt
diff --git a/test/selection/predict.py b/post_process/selection/predict.py
similarity index 76%
rename from test/selection/predict.py
rename to post_process/selection/predict.py
index 2338cf9..55c88f3 100644
--- a/test/selection/predict.py
+++ b/post_process/selection/predict.py
@@ -1,14 +1,13 @@
 import pandas as pd
 import os
 import glob
-from inference import Inference
 
 # directory for checkpoints
 checkpoint_directory = '../../train/baseline'
 
-def infer_and_select(fold):
+def select(fold):
     # import test data
-    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    data_path = f"../../train/mapping/exports/result_group_{fold}.csv"
     df = pd.read_csv(data_path, skipinitialspace=True)
 
     # get target data
@@ -18,37 +17,11 @@ def infer_and_select(fold):
 
     train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
 
-    ##########################################
-    # run inference
-    # checkpoint
-    # Use glob to find matching paths
-    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
-    # Use glob to find matching paths
-    # path is usually checkpoint_fold_1/checkpoint-<step>
-    # we are guaranteed to save only 1 checkpoint from training
-    pattern = 'checkpoint-*'
-    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
-
-    infer = Inference(checkpoint_path)
-    infer.prepare_dataloader(df, batch_size=256, max_length=64)
-    thing_prediction_list, property_prediction_list = infer.generate()
-
-    # add labels too
-    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
-    # Convert the list to a Pandas DataFrame
-    df_out = pd.DataFrame({
-        'p_thing': thing_prediction_list,
-        'p_property': property_prediction_list
-    })
-    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
-    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
-    df = pd.concat([df, df_out], axis=1)
-
-    ##########################################
     # Process the dataframe for selection
     # we start to cull predictions from here
-    data_master_path = f"../../data_import/exports/data_model_master_export.csv"
+    data_master_path = "../../data_import/exports/data_model_master_export.csv"
     df_master = pd.read_csv(data_master_path, skipinitialspace=True)
     data_mapping = df
     # Generate patterns
@@ -75,14 +48,14 @@ def infer_and_select(fold):
 
-    condition1 = df['MDM']
-    condition2 = df['p_MDM']
+    # condition1 = df['MDM']
+    # condition2 = df['p_MDM']
 
-    condition_correct_thing = df['p_thing'] == df['thing']
-    condition_correct_property = df['p_property'] == df['property']
-    match = sum(condition1 & condition2)
-    fn = sum(condition1 & ~condition2)
-    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)
+    # condition_correct_thing = df['p_thing'] == df['thing']
+    # condition_correct_property = df['p_property'] == df['property']
+    # match = sum(condition1 & condition2)
+    # fn = sum(condition1 & ~condition2)
+    # prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)
 
     # print("mdm match predicted mdm: ", match) # 56 - false negative
     # print("mdm but not predicted mdm: ", fn) # 56 - false negative
@@ -101,6 +74,17 @@ def infer_and_select(fold):
     import selection
     # importlib.reload(selection)
     selector = selection.Selector(input_df=df, reference_df=train_df, fold=fold)
+
+    ##########################################
+    # run inference
+    # locate the checkpoint for this fold
+    # Use glob to find matching paths
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+    # the path is usually checkpoint_fold_<n>/checkpoint-<step>
+    # we are guaranteed to save only one checkpoint during training,
+    # so taking the first glob match is safe
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
 
     tp, tn, fp, fn = selector.run_selection(checkpoint_path=checkpoint_path)
 
@@ -126,4 +110,4 @@ with open("output.txt", "w") as f:
     print('', file=f)
 
 for fold in [1,2,3,4,5]:
-    infer_and_select(fold)
+    select(fold)
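The checkpoint lookup that `select(fold)` keeps (now moved below the `Selector` construction) is a thin wrapper around `glob`: training saves exactly one `checkpoint-<step>` directory per fold, so the first match is unambiguous. A minimal, self-contained sketch of that pattern, using the `checkpoint_directory` value from the diff above; the helper name `find_checkpoint` is illustrative, not part of the codebase:

```python
import glob
import os

# same relative location as in predict.py above
checkpoint_directory = '../../train/baseline'

def find_checkpoint(fold: int) -> str:
    """Return the single trainer checkpoint saved for a fold.

    Training keeps exactly one checkpoint per fold, so the first glob
    match (checkpoint_fold_<n>/checkpoint-<step>) is unambiguous.
    """
    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
    matches = glob.glob(os.path.join(directory, 'checkpoint-*'))
    if not matches:
        raise FileNotFoundError(f"no checkpoint under {directory}")
    return matches[0]
```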
diff --git a/test/selection/selection.py b/post_process/selection/selection.py
similarity index 100%
rename from test/selection/selection.py
rename to post_process/selection/selection.py
diff --git a/test/selection/utils.py b/post_process/selection/utils.py
similarity index 100%
rename from test/selection/utils.py
rename to post_process/selection/utils.py
diff --git a/test/selection/inference.py b/test/selection/inference.py
deleted file mode 100644
index 896cb72..0000000
--- a/test/selection/inference.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import torch
-from torch.utils.data import DataLoader
-from transformers import (
-    T5TokenizerFast,
-    AutoModelForSeq2SeqLM,
-)
-import glob
-import os
-import pandas as pd
-from tqdm import tqdm
-from datasets import Dataset
-import numpy as np
-
-os.environ['TOKENIZERS_PARALLELISM'] = 'false'
-
-
-class Inference():
-    tokenizer: T5TokenizerFast
-    model: torch.nn.Module
-    dataloader: DataLoader
-
-    def __init__(self, checkpoint_path):
-        self._create_tokenizer()
-        self._load_model(checkpoint_path)
-
-
-    def _create_tokenizer(self):
-        # %%
-        # load tokenizer
-        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
-        # Define additional special tokens
-        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "", "", "SIG", "UNIT", "DATA_TYPE"]
-        # Add the additional special tokens to the tokenizer
-        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
-
-    def _load_model(self, checkpoint_path: str):
-        # load model
-        # Define the directory and the pattern
-        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
-        model = torch.compile(model)
-        # set model to eval
-        self.model = model.eval()
-
-
-
-
-    def prepare_dataloader(self, input_df, batch_size, max_length):
-        """
-        *arguments*
-        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
-        - batch_size: the batch size of dataloader output
-        - max_length: length of tokenizer output
-        """
-        print("preparing dataloader")
-        # convert each dataframe row into a dictionary
-        # outputs a list of dictionaries
-        def _process_df(df):
-            output_list = [{
-                'input': f"{row['tag_description']}",
-                'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
-            } for _, row in df.iterrows()]
-
-            return output_list
-
-        def _preprocess_function(example):
-            input = example['input']
-            target = example['output']
-            # text_target sets the corresponding label to inputs
-            # there is no need to create a separate 'labels'
-            model_inputs = self.tokenizer(
-                input,
-                text_target=target,
-                max_length=max_length,
-                return_tensors="pt",
-                padding='max_length',
-                truncation=True,
-            )
-            return model_inputs
-
-        test_dataset = Dataset.from_list(_process_df(input_df))
-
-
-        # map maps function to each "row" in the dataset
-        # aka the data in the immediate nesting
-        datasets = test_dataset.map(
-            _preprocess_function,
-            batched=True,
-            num_proc=1,
-            remove_columns=test_dataset.column_names,
-        )
-        # datasets = _preprocess_function(test_dataset)
-        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
-
-        # create dataloader
-        self.dataloader = DataLoader(datasets, batch_size=batch_size)
-
-
-    def generate(self):
-        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
-        MAX_GENERATE_LENGTH = 128
-
-        pred_generations = []
-        pred_labels = []
-
-        print("start generation")
-        for batch in tqdm(self.dataloader):
-            # Inference in batches
-            input_ids = batch['input_ids']
-            attention_mask = batch['attention_mask']
-            # save labels too
-            pred_labels.extend(batch['labels'])
-
-
-            # Move to GPU if available
-            input_ids = input_ids.to(device)
-            attention_mask = attention_mask.to(device)
-            self.model.to(device)
-
-            # Perform inference
-            with torch.no_grad():
-                outputs = self.model.generate(input_ids,
-                        attention_mask=attention_mask,
-                        max_length=MAX_GENERATE_LENGTH)
-
-            # Decode the output and print the results
-            pred_generations.extend(outputs.to("cpu"))
-
-
-
-        # %%
-        # extract sequence and decode
-        def extract_seq(tokens, start_value, end_value):
-            if start_value not in tokens or end_value not in tokens:
-                return None  # Or handle this case according to your requirements
-            start_id = np.where(tokens == start_value)[0][0]
-            end_id = np.where(tokens == end_value)[0][0]
-
-            return tokens[start_id+1:end_id]
-
-
-        def process_tensor_output(tokens):
-            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
-            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
-            p_thing = None
-            p_property = None
-            if (thing_seq is not None):
-                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
-            if (property_seq is not None):
-                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
-            return p_thing, p_property
-
-        # decode prediction labels
-        def decode_preds(tokens_list):
-            thing_prediction_list = []
-            property_prediction_list = []
-            for tokens in tokens_list:
-                p_thing, p_property = process_tensor_output(tokens)
-                thing_prediction_list.append(p_thing)
-                property_prediction_list.append(p_property)
-            return thing_prediction_list, property_prediction_list
-
-        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
-        return thing_prediction_list, property_prediction_list
-
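The deleted module's decoding step is worth noting, since the same logic presumably lives on in the near-identical `train/mapping/inference.py`: predictions are recovered by slicing the generated token ids between sentinel pairs (`<THING_START>`/`<THING_END>` at ids 32100/32101, `<PROPERTY_START>`/`<PROPERTY_END>` at 32102/32103). A self-contained sketch of that span extraction, with toy token ids:

```python
import numpy as np

def extract_seq(tokens, start_value, end_value):
    # return the ids strictly between the first start/end sentinel pair,
    # or None when either sentinel is missing from the sequence
    if start_value not in tokens or end_value not in tokens:
        return None
    start_id = np.where(tokens == start_value)[0][0]
    end_id = np.where(tokens == end_value)[0][0]
    return tokens[start_id + 1:end_id]

# toy sequence: 32100/32101 bracket the thing span, 32102/32103 the property span
tokens = np.array([0, 32100, 101, 102, 32101, 32102, 201, 32103, 1])
print(extract_seq(tokens, 32100, 32101))  # [101 102]
print(extract_seq(tokens, 32102, 32103))  # [201]
print(extract_seq(tokens, 32104, 32105))  # None (sentinels absent)
```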
diff --git a/train/README.md b/train/README.md
new file mode 100644
index 0000000..8690a35
--- /dev/null
+++ b/train/README.md
@@ -0,0 +1,12 @@
+# Train
+
+## What is this folder
+
+This folder contains the code for training and mapping evaluation.
+
+Each subfolder contains a training variation.
+
+After training, each subfolder holds the checkpoint files for each fold.
+
+The `mapping` directory contains the code to run the model on test data and
+produce the csv outputs.
\ No newline at end of file
diff --git a/test/selection/.gitignore b/train/mapping/.gitignore
similarity index 57%
rename from test/selection/.gitignore
rename to train/mapping/.gitignore
index bee8a64..e9ebfc9 100644
--- a/test/selection/.gitignore
+++ b/train/mapping/.gitignore
@@ -1 +1,2 @@
 __pycache__
+exports/
diff --git a/test/mapping/inference.py b/train/mapping/inference.py
similarity index 100%
rename from test/mapping/inference.py
rename to train/mapping/inference.py
diff --git a/test/mapping/output.txt b/train/mapping/output.txt
similarity index 100%
rename from test/mapping/output.txt
rename to train/mapping/output.txt
diff --git a/test/mapping/predict.py b/train/mapping/predict.py
similarity index 97%
rename from test/mapping/predict.py
rename to train/mapping/predict.py
index bf87cb0..fe54e2e 100644
--- a/test/mapping/predict.py
+++ b/train/mapping/predict.py
@@ -47,7 +47,7 @@ def infer_and_select(fold):
     df = pd.concat([df, df_out], axis=1)
 
     # we can save the t5 generation output here
-    # df.to_parquet(f"exports/fold_{fold}/t5_output.parquet")
+    df.to_csv(f"exports/result_group_{fold}.csv")
 
     # here we want to evaluate mapping accuracy within the valid in mdm data only
     in_mdm = df['MDM']
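The `to_csv` line above is what wires the renamed stages together: `train/mapping/predict.py` writes `exports/result_group_{fold}.csv`, and `post_process/selection/predict.py` (earlier in this diff) reads that same file via `../../train/mapping/exports/result_group_{fold}.csv`. A minimal sketch of the round trip under that assumption; note that passing `index=False` on the write side (an addition not in the diff) would keep pandas from surfacing a stray `Unnamed: 0` index column on the read side:

```python
import pandas as pd

fold = 1  # illustrative fold number

# producer side (train/mapping/predict.py): persist the T5 generations;
# index=False is a suggested addition to avoid writing the row index
df_out = pd.DataFrame({'p_thing': ['pump'], 'p_property': ['speed']})
df_out.to_csv(f"result_group_{fold}.csv", index=False)

# consumer side (post_process/selection/predict.py): read them back for selection;
# the real path is ../../train/mapping/exports/result_group_{fold}.csv
df_in = pd.read_csv(f"result_group_{fold}.csv", skipinitialspace=True)
print(df_in.columns.tolist())  # ['p_thing', 'p_property']
```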
diff --git a/translation/README.md b/translation/README.md
new file mode 100644
index 0000000..35b1a31
--- /dev/null
+++ b/translation/README.md
@@ -0,0 +1,3 @@
+# translation
+
+These files are from the GRS paper; they are not used in this pipeline.
\ No newline at end of file