Chore: moved selection to post_process, mapping to test

2024-10-31 16:35:28 +09:00 · 2024-10-31 16:35:28 +09:00 · 18e4a5f7df
parent 16374b9ab8
commit 18e4a5f7df
14 changed files with 48 additions and 203 deletions
--- a/post_process/README.md
+++ b/post_process/README.md
@ -0,0 +1,9 @@
 # Post-Processing
 ## What is this folder
 This folder contains the files for post-processing.
 Each post-processing method lives in its own folder to keep the methods
 modular. This makes it easier to test different methods and reduces coupling
 between stages.
--- a/post_process/classification/train.py
+++ b/post_process/classification/train.py
--- a/post_process/selection/.gitignore
+++ b/post_process/selection/.gitignore
--- a/post_process/selection/output.txt
+++ b/post_process/selection/output.txt
--- a/post_process/selection/predict.py
+++ b/post_process/selection/predict.py
@ -1,14 +1,13 @@
 import pandas as pd
 import os
 import glob
 from inference import Inference
 # directory for checkpoints
 checkpoint_directory =  '../../train/baseline'
-def infer_and_select(fold):
+def select(fold):
    # import test data
-    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    data_path = f"../../train/mapping/exports/result_group_{fold}.csv"
    df = pd.read_csv(data_path, skipinitialspace=True)
    # get target data
@ -18,37 +17,11 @@ def infer_and_select(fold):
    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
    ##########################################
    # run inference
    # checkpoint
    # Use glob to find matching paths
    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # we are guaranteed to save only 1 checkpoint from training
    pattern = 'checkpoint-*'
    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
    infer = Inference(checkpoint_path)
    infer.prepare_dataloader(df, batch_size=256, max_length=64)
    thing_prediction_list, property_prediction_list = infer.generate()
    # add labels too
    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
    # Convert the list to a Pandas DataFrame
    df_out = pd.DataFrame({
        'p_thing': thing_prediction_list, 
        'p_property': property_prediction_list
    })
    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
    df = pd.concat([df, df_out], axis=1)
    ##########################################
    # Process the dataframe for selection
    # we start to cull predictions from here
-    data_master_path = f"../../data_import/exports/data_model_master_export.csv"
+    data_master_path = "../../data_import/exports/data_model_master_export.csv"
    df_master = pd.read_csv(data_master_path, skipinitialspace=True)
    data_mapping = df
    # Generate patterns    
@ -75,14 +48,14 @@ def infer_and_select(fold):
-    condition1 = df['MDM']
+    # condition1 = df['MDM']
-    condition2 = df['p_MDM']
+    # condition2 = df['p_MDM']
-    condition_correct_thing = df['p_thing'] == df['thing']
+    # condition_correct_thing = df['p_thing'] == df['thing']
-    condition_correct_property = df['p_property'] == df['property']
+    # condition_correct_property = df['p_property'] == df['property']
-    match = sum(condition1 & condition2)
+    # match = sum(condition1 & condition2)
-    fn = sum(condition1 & ~condition2)
+    # fn = sum(condition1 & ~condition2)
-    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)
+    # prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)
    # print("mdm match predicted mdm: ", match)  # 56 - false negative
    # print("mdm but not predicted mdm: ", fn)  # 56 - false negative
@ -101,6 +74,17 @@ def infer_and_select(fold):
    import selection
    # importlib.reload(selection)
    selector = selection.Selector(input_df=df, reference_df=train_df, fold=fold)
    ##########################################
    # run inference
    # checkpoint
    # Use glob to find matching paths
    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # we are guaranteed to save only 1 checkpoint from training
    pattern = 'checkpoint-*'
    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
    tp, tn, fp, fn = selector.run_selection(checkpoint_path=checkpoint_path)
@ -126,4 +110,4 @@ with open("output.txt", "w") as f:
    print('', file=f)
 for fold in [1,2,3,4,5]:
-    infer_and_select(fold)
+    select(fold)
--- a/post_process/selection/selection.py
+++ b/post_process/selection/selection.py
--- a/post_process/selection/utils.py
+++ b/post_process/selection/utils.py
--- a/test/selection/inference.py
+++ b/test/selection/inference.py
@ -1,164 +0,0 @@
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    T5TokenizerFast,
    AutoModelForSeq2SeqLM,
 )
 import glob
 import os
 import pandas as pd
 from tqdm import tqdm
 from datasets import Dataset
 import numpy as np
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 class Inference():
    """Generate thing/property predictions from a fine-tuned T5 checkpoint.

    Usage: construct with a checkpoint path, call ``prepare_dataloader`` with
    the input dataframe, then call ``generate`` to obtain the decoded
    predictions.
    """
    tokenizer: T5TokenizerFast
    model: torch.nn.Module
    dataloader: DataLoader

    def __init__(self, checkpoint_path):
        """Create the tokenizer and load the model stored at *checkpoint_path*."""
        self._create_tokenizer()
        self._load_model(checkpoint_path)

    def _create_tokenizer(self):
        """Load the ``t5-small`` tokenizer and register the extra special tokens."""
        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
        # NOTE(review): the last three tokens have no angle brackets, unlike
        # the others; kept as-is -- confirm this matches the tokenizer used
        # at training time.
        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    def _load_model(self, checkpoint_path: str):
        """Load the seq2seq model from *checkpoint_path*, compile it, set eval mode."""
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
        model = torch.compile(model)
        self.model = model.eval()

    def prepare_dataloader(self, input_df, batch_size, max_length):
        """
        Tokenize *input_df* and store the resulting DataLoader on ``self.dataloader``.

        *arguments*
        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
        - batch_size: the batch size of dataloader output
        - max_length: length of tokenizer output
        """
        print("preparing dataloader")

        def _process_df(df):
            # one {'input', 'output'} dictionary per dataframe row
            return [{
                    'input': f"<DESC>{row['tag_description']}<DESC>",
                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
            } for _, row in df.iterrows()]

        def _preprocess_function(example):
            source_text = example['input']
            target_text = example['output']
            # text_target makes the tokenizer emit the matching 'labels',
            # so there is no need to build them separately
            return self.tokenizer(
                source_text,
                text_target=target_text,
                max_length=max_length,
                return_tensors="pt",
                padding='max_length',
                truncation=True,
            )

        test_dataset = Dataset.from_list(_process_df(input_df))
        # batched map: the tokenizer receives lists of inputs/outputs;
        # the raw text columns are dropped from the result
        datasets = test_dataset.map(
            _preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=test_dataset.column_names,
        )
        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        self.dataloader = DataLoader(datasets, batch_size=batch_size)

    def generate(self, device=None):
        """
        Run generation over ``self.dataloader`` and decode the predictions.

        *arguments*
        - device: optional torch device (or device string) to run on. When
          omitted, 'cuda:1' is used if at least two GPUs are visible,
          'cuda:0' if exactly one is, and 'cpu' otherwise.

        *returns*
        - (thing_prediction_list, property_prediction_list): one entry per
          generated sequence; an entry is None when its start or end marker
          token is absent from the generated tokens.
        """
        if device is None:
            if not torch.cuda.is_available():
                device = 'cpu'
            elif torch.cuda.device_count() > 1:
                device = 'cuda:1'
            else:
                # single-GPU host: 'cuda:1' does not exist there
                device = 'cuda:0'
        device = torch.device(device)

        MAX_GENERATE_LENGTH = 128
        # ids of the added marker tokens
        THING_START_ID, THING_END_ID = 32100, 32101  # <THING_START>, <THING_END>
        PROPERTY_START_ID, PROPERTY_END_ID = 32102, 32103  # <PROPERTY_START>, <PROPERTY_END>

        pred_generations = []
        print("start generation")
        # the model only needs to be moved once, not once per batch
        self.model.to(device)
        with torch.no_grad():
            for batch in tqdm(self.dataloader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                outputs = self.model.generate(input_ids,
                                        attention_mask=attention_mask,
                                        max_length=MAX_GENERATE_LENGTH)
                # keep one CPU tensor per generated sequence
                pred_generations.extend(outputs.to("cpu"))

        def extract_seq(tokens, start_value, end_value):
            # tokens strictly between the first start marker and the first
            # end marker; None when either marker is missing
            start_hits = np.where(tokens == start_value)[0]
            end_hits = np.where(tokens == end_value)[0]
            if len(start_hits) == 0 or len(end_hits) == 0:
                return None
            return tokens[start_hits[0]+1:end_hits[0]]

        def decode_span(tokens, start_value, end_value):
            span = extract_seq(tokens, start_value, end_value)
            if span is None:
                return None
            return self.tokenizer.decode(span, skip_special_tokens=False)

        thing_prediction_list = []
        property_prediction_list = []
        for tokens in pred_generations:
            thing_prediction_list.append(decode_span(tokens, THING_START_ID, THING_END_ID))
            property_prediction_list.append(decode_span(tokens, PROPERTY_START_ID, PROPERTY_END_ID))
        return thing_prediction_list, property_prediction_list
--- a/train/README.md
+++ b/train/README.md
@ -0,0 +1,12 @@
 # Train
 ## What is this folder
 This folder contains the code for training and mapping evaluation.
 Each folder contains a training variation.
 After training, each folder contains the checkpoint files for each fold.
 The `mapping` directory contains the code to run the model on test data and
 to produce the CSV outputs.
--- a/test/selection/.gitignore
+++ b/test/selection/.gitignore
@ -1 +1,2 @@
 __pycache__
 exports/
--- a/train/mapping/inference.py
+++ b/train/mapping/inference.py
--- a/train/mapping/output.txt
+++ b/train/mapping/output.txt
--- a/train/mapping/predict.py
+++ b/train/mapping/predict.py
@ -47,7 +47,7 @@ def infer_and_select(fold):
    df = pd.concat([df, df_out], axis=1)
    # we can save the t5 generation output here
-    # df.to_parquet(f"exports/fold_{fold}/t5_output.parquet")
+    df.to_csv(f"exports/result_group_{fold}.csv")
    # here we want to evaluate mapping accuracy within the valid in mdm data only
    in_mdm = df['MDM']
--- a/translation/README.md
+++ b/translation/README.md
@ -0,0 +1,3 @@
 # translation
 These files are from the GRS paper. This code will not be used.
		`@ -0,0 +1,3 @@`
							`# translation`

							`These files were from the GRS paper. These codes will not be used.`