Chore: moved selection to post_process, mapping to test
parent 16374b9ab8
commit 18e4a5f7df
@@ -0,0 +1,9 @@
+# Post-Processing
+
+## What is this folder
+
+This folder contains the files for post-processing.
+
+We divide each processing method into its respective folder to modularize the
+post-processing methods. This makes it easier to test different methods and
+reduces coupling between stages.
@@ -1,14 +1,13 @@
 import pandas as pd
 import os
 import glob
-from inference import Inference

 # directory for checkpoints
 checkpoint_directory = '../../train/baseline'

-def infer_and_select(fold):
+def select(fold):
     # import test data
-    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    data_path = f"../../train/mapping/exports/result_group_{fold}.csv"
     df = pd.read_csv(data_path, skipinitialspace=True)

     # get target data
@@ -18,37 +17,11 @@ def infer_and_select(fold):
     train_df['thing_property'] = train_df['thing'] + " " + train_df['property']


-    ##########################################
-    # run inference
-    # checkpoint
-    # Use glob to find matching paths
-    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
-    # Use glob to find matching paths
-    # path is usually checkpoint_fold_1/checkpoint-<step number>
-    # we are guaranteed to save only 1 checkpoint from training
-    pattern = 'checkpoint-*'
-    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
-
-    infer = Inference(checkpoint_path)
-    infer.prepare_dataloader(df, batch_size=256, max_length=64)
-    thing_prediction_list, property_prediction_list = infer.generate()
-
-    # add labels too
-    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
-    # Convert the list to a Pandas DataFrame
-    df_out = pd.DataFrame({
-        'p_thing': thing_prediction_list,
-        'p_property': property_prediction_list
-    })
-    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
-    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
-    df = pd.concat([df, df_out], axis=1)
-
     ##########################################
     # Process the dataframe for selection

     # we start to cull predictions from here
-    data_master_path = f"../../data_import/exports/data_model_master_export.csv"
+    data_master_path = "../../data_import/exports/data_model_master_export.csv"
     df_master = pd.read_csv(data_master_path, skipinitialspace=True)
     data_mapping = df
     # Generate patterns
@@ -75,14 +48,14 @@ def infer_and_select(fold):



-    condition1 = df['MDM']
-    condition2 = df['p_MDM']
+    # condition1 = df['MDM']
+    # condition2 = df['p_MDM']

-    condition_correct_thing = df['p_thing'] == df['thing']
-    condition_correct_property = df['p_property'] == df['property']
-    match = sum(condition1 & condition2)
-    fn = sum(condition1 & ~condition2)
-    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)
+    # condition_correct_thing = df['p_thing'] == df['thing']
+    # condition_correct_property = df['p_property'] == df['property']
+    # match = sum(condition1 & condition2)
+    # fn = sum(condition1 & ~condition2)
+    # prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)

     # print("mdm match predicted mdm: ", match) # 56 - false negative
     # print("mdm but not predicted mdm: ", fn) # 56 - false negative
@@ -101,6 +74,17 @@ def infer_and_select(fold):
     import selection
     # importlib.reload(selection)
     selector = selection.Selector(input_df=df, reference_df=train_df, fold=fold)

+    ##########################################
+    # run inference
+    # checkpoint
+    # Use glob to find matching paths
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+    # Use glob to find matching paths
+    # path is usually checkpoint_fold_1/checkpoint-<step number>
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+    tp, tn, fp, fn = selector.run_selection(checkpoint_path=checkpoint_path)

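Note: `run_selection` returns confusion-matrix counts (tp, tn, fp, fn). A minimal sketch of turning those counts into summary metrics, assuming they are plain integers; `summarize_counts` is an illustrative helper, not repository code:

```python
# Sketch only: summarize the (tp, tn, fp, fn) counts returned by
# selector.run_selection(). Illustrative helper, not repository code.
def summarize_counts(tp: int, tn: int, fp: int, fn: int) -> dict:
    total = tp + tn + fp + fn
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return {
        'accuracy': (tp + tn) / total if total else 0.0,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
```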
@@ -126,4 +110,4 @@ with open("output.txt", "w") as f:
     print('', file=f)

 for fold in [1,2,3,4,5]:
-    infer_and_select(fold)
+    select(fold)
@@ -1,164 +0,0 @@
-import torch
-from torch.utils.data import DataLoader
-from transformers import (
-    T5TokenizerFast,
-    AutoModelForSeq2SeqLM,
-)
-import glob
-import os
-import pandas as pd
-from tqdm import tqdm
-from datasets import Dataset
-import numpy as np
-
-os.environ['TOKENIZERS_PARALLELISM'] = 'false'
-
-
-class Inference():
-    tokenizer: T5TokenizerFast
-    model: torch.nn.Module
-    dataloader: DataLoader
-
-    def __init__(self, checkpoint_path):
-        self._create_tokenizer()
-        self._load_model(checkpoint_path)
-
-
-    def _create_tokenizer(self):
-        # %%
-        # load tokenizer
-        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
-        # Define additional special tokens
-        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
-        # Add the additional special tokens to the tokenizer
-        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
-
-    def _load_model(self, checkpoint_path: str):
-        # load model
-        # Define the directory and the pattern
-        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
-        model = torch.compile(model)
-        # set model to eval
-        self.model = model.eval()
-
-
-    def prepare_dataloader(self, input_df, batch_size, max_length):
-        """
-        *arguments*
-        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
-        - batch_size: the batch size of dataloader output
-        - max_length: length of tokenizer output
-        """
-        print("preparing dataloader")
-        # convert each dataframe row into a dictionary
-        # outputs a list of dictionaries
-        def _process_df(df):
-            output_list = [{
-                'input': f"<DESC>{row['tag_description']}<DESC>",
-                'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
-            } for _, row in df.iterrows()]
-
-            return output_list
-
-        def _preprocess_function(example):
-            input = example['input']
-            target = example['output']
-            # text_target sets the corresponding label to inputs
-            # there is no need to create a separate 'labels'
-            model_inputs = self.tokenizer(
-                input,
-                text_target=target,
-                max_length=max_length,
-                return_tensors="pt",
-                padding='max_length',
-                truncation=True,
-            )
-            return model_inputs
-
-        test_dataset = Dataset.from_list(_process_df(input_df))
-
-        # map maps function to each "row" in the dataset
-        # aka the data in the immediate nesting
-        datasets = test_dataset.map(
-            _preprocess_function,
-            batched=True,
-            num_proc=1,
-            remove_columns=test_dataset.column_names,
-        )
-        # datasets = _preprocess_function(test_dataset)
-        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
-
-        # create dataloader
-        self.dataloader = DataLoader(datasets, batch_size=batch_size)
-
-
-    def generate(self):
-        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
-        MAX_GENERATE_LENGTH = 128
-
-        pred_generations = []
-        pred_labels = []
-
-        print("start generation")
-        for batch in tqdm(self.dataloader):
-            # Inference in batches
-            input_ids = batch['input_ids']
-            attention_mask = batch['attention_mask']
-            # save labels too
-            pred_labels.extend(batch['labels'])
-
-            # Move to GPU if available
-            input_ids = input_ids.to(device)
-            attention_mask = attention_mask.to(device)
-            self.model.to(device)
-
-            # Perform inference
-            with torch.no_grad():
-                outputs = self.model.generate(input_ids,
-                                              attention_mask=attention_mask,
-                                              max_length=MAX_GENERATE_LENGTH)
-
-            # Decode the output and print the results
-            pred_generations.extend(outputs.to("cpu"))
-
-        # %%
-        # extract sequence and decode
-        def extract_seq(tokens, start_value, end_value):
-            if start_value not in tokens or end_value not in tokens:
-                return None  # Or handle this case according to your requirements
-            start_id = np.where(tokens == start_value)[0][0]
-            end_id = np.where(tokens == end_value)[0][0]
-
-            return tokens[start_id+1:end_id]
-
-        def process_tensor_output(tokens):
-            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
-            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
-            p_thing = None
-            p_property = None
-            if (thing_seq is not None):
-                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
-            if (property_seq is not None):
-                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
-            return p_thing, p_property
-
-        # decode prediction labels
-        def decode_preds(tokens_list):
-            thing_prediction_list = []
-            property_prediction_list = []
-            for tokens in tokens_list:
-                p_thing, p_property = process_tensor_output(tokens)
-                thing_prediction_list.append(p_thing)
-                property_prediction_list.append(p_property)
-            return thing_prediction_list, property_prediction_list
-
-        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
-        return thing_prediction_list, property_prediction_list
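For reference, a minimal sketch of how the `Inference` class removed above was driven by its caller; the method calls and arguments come from the diff, while the checkpoint path and dataframe values are illustrative:

```python
# Sketch only: end-to-end use of the removed Inference class.
# The checkpoint path and row values below are illustrative.
import pandas as pd

df = pd.DataFrame({
    'tag_description': ['GEN ENG EXH GAS OUT TEMP'],  # hypothetical input text
    'thing': ['GeneratorEngine'],                     # hypothetical target label
    'property': ['ExhaustGasOutletTemp'],             # hypothetical target label
})

infer = Inference('../../train/baseline/checkpoint_fold_1/checkpoint-1000')
infer.prepare_dataloader(df, batch_size=256, max_length=64)
# one decoded <THING_*> / <PROPERTY_*> span per input row; None when a
# marker pair is missing from the generated sequence
p_thing_list, p_property_list = infer.generate()
```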
@@ -0,0 +1,12 @@
+# Train
+
+## What is this folder
+
+This folder contains the code for training and mapping evaluation.
+
+Each folder contains a training variation.
+
+After training, each folder contains the checkpoint files for each fold.
+
+The `mapping` directory contains the code to run the model on test data and
+produce the csv outputs.
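Taken together with the checkpoint-globbing code above, the layout this README implies looks roughly like the following; fold and step numbers are illustrative:

```
train/
  baseline/                  # one folder per training variation
    checkpoint_fold_1/
      checkpoint-1000/       # the single checkpoint saved for this fold
    checkpoint_fold_2/
      checkpoint-2000/
  mapping/
    exports/
      result_group_1.csv     # csv outputs consumed by post_process
```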
@@ -1 +1,2 @@
 __pycache__
+exports/
@@ -47,7 +47,7 @@ def infer_and_select(fold):
     df = pd.concat([df, df_out], axis=1)

     # we can save the t5 generation output here
-    # df.to_parquet(f"exports/fold_{fold}/t5_output.parquet")
+    df.to_csv(f"exports/result_group_{fold}.csv")

     # here we want to evaluate mapping accuracy within the valid in mdm data only
     in_mdm = df['MDM']
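The accuracy computation itself sits below this hunk and is not shown; a plausible sketch of evaluating mapping accuracy within the in-MDM rows only, using column names from the diff (the exact repository expression may differ):

```python
# Sketch only: mapping accuracy restricted to rows where df['MDM'] is True.
# Column names come from the hunk above; the exact repo code may differ.
in_mdm = df['MDM']
correct = (df['p_thing'] == df['thing']) & (df['p_property'] == df['property'])
accuracy_in_mdm = (correct & in_mdm).sum() / in_mdm.sum()
print(f"mapping accuracy within MDM: {accuracy_in_mdm:.4f}")
```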
@@ -0,0 +1,3 @@
+# translation
+
+These files are from the GRS paper. This code will not be used.