Chore: moved selection to post_process, mapping to test

This commit is contained in:
Richard Wong 2024-10-31 16:35:28 +09:00
parent 16374b9ab8
commit 18e4a5f7df
14 changed files with 48 additions and 203 deletions

9
post_process/README.md Normal file
View File

@ -0,0 +1,9 @@
# Post-Processing
## What is this folder
This folder contains the files for post-processing.
Each post-processing method lives in its own folder to keep the methods
modular. This makes it easier to test different methods and reduces coupling
between stages.

View File

View File

@ -1,14 +1,13 @@
import pandas as pd import pandas as pd
import os import os
import glob import glob
from inference import Inference
# directory for checkpoints # directory for checkpoints
checkpoint_directory = '../../train/baseline' checkpoint_directory = '../../train/baseline'
def infer_and_select(fold): def select(fold):
# import test data # import test data
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" data_path = f"../../train/mapping/exports/result_group_{fold}.csv"
df = pd.read_csv(data_path, skipinitialspace=True) df = pd.read_csv(data_path, skipinitialspace=True)
# get target data # get target data
@ -18,37 +17,11 @@ def infer_and_select(fold):
train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
##########################################
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
infer = Inference(checkpoint_path)
infer.prepare_dataloader(df, batch_size=256, max_length=64)
thing_prediction_list, property_prediction_list = infer.generate()
# add labels too
# thing_actual_list, property_actual_list = decode_preds(pred_labels)
# Convert the list to a Pandas DataFrame
df_out = pd.DataFrame({
'p_thing': thing_prediction_list,
'p_property': property_prediction_list
})
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
df = pd.concat([df, df_out], axis=1)
########################################## ##########################################
# Process the dataframe for selection # Process the dataframe for selection
# we start to cull predictions from here # we start to cull predictions from here
data_master_path = f"../../data_import/exports/data_model_master_export.csv" data_master_path = "../../data_import/exports/data_model_master_export.csv"
df_master = pd.read_csv(data_master_path, skipinitialspace=True) df_master = pd.read_csv(data_master_path, skipinitialspace=True)
data_mapping = df data_mapping = df
# Generate patterns # Generate patterns
@ -75,14 +48,14 @@ def infer_and_select(fold):
condition1 = df['MDM'] # condition1 = df['MDM']
condition2 = df['p_MDM'] # condition2 = df['p_MDM']
condition_correct_thing = df['p_thing'] == df['thing'] # condition_correct_thing = df['p_thing'] == df['thing']
condition_correct_property = df['p_property'] == df['property'] # condition_correct_property = df['p_property'] == df['property']
match = sum(condition1 & condition2) # match = sum(condition1 & condition2)
fn = sum(condition1 & ~condition2) # fn = sum(condition1 & ~condition2)
prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1) # prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)
# print("mdm match predicted mdm: ", match) # 56 - false negative # print("mdm match predicted mdm: ", match) # 56 - false negative
# print("mdm but not predicted mdm: ", fn) # 56 - false negative # print("mdm but not predicted mdm: ", fn) # 56 - false negative
@ -101,6 +74,17 @@ def infer_and_select(fold):
import selection import selection
# importlib.reload(selection) # importlib.reload(selection)
selector = selection.Selector(input_df=df, reference_df=train_df, fold=fold) selector = selection.Selector(input_df=df, reference_df=train_df, fold=fold)
##########################################
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
tp, tn, fp, fn = selector.run_selection(checkpoint_path=checkpoint_path) tp, tn, fp, fn = selector.run_selection(checkpoint_path=checkpoint_path)
@ -126,4 +110,4 @@ with open("output.txt", "w") as f:
print('', file=f) print('', file=f)
for fold in [1,2,3,4,5]: for fold in [1,2,3,4,5]:
infer_and_select(fold) select(fold)

View File

@ -1,164 +0,0 @@
import torch
from torch.utils.data import DataLoader
from transformers import (
T5TokenizerFast,
AutoModelForSeq2SeqLM,
)
import glob
import os
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
import numpy as np
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
class Inference():
    """Run a seq2seq checkpoint over a dataframe and decode thing/property predictions.

    Typical use: construct with a checkpoint path, call ``prepare_dataloader``
    with the input dataframe, then call ``generate`` to obtain the predicted
    thing and property strings.
    """
    tokenizer: T5TokenizerFast
    model: torch.nn.Module
    dataloader: DataLoader

    def __init__(self, checkpoint_path):
        """Build the tokenizer and load the model stored at ``checkpoint_path``."""
        self._create_tokenizer()
        self._load_model(checkpoint_path)

    def _create_tokenizer(self):
        """Load the ``t5-small`` tokenizer and register the extra special tokens."""
        # load tokenizer
        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
        # Define additional special tokens
        # NOTE(review): "SIG", "UNIT" and "DATA_TYPE" carry no angle brackets,
        # unlike the other markers. They are kept verbatim because the order
        # and spelling of this list determine the tokenization the decoder
        # below relies on -- confirm against the training code before changing.
        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
        # Add the additional special tokens to the tokenizer
        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    def _load_model(self, checkpoint_path: str):
        """Load the seq2seq model from ``checkpoint_path``, compile it and set eval mode."""
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
        model = torch.compile(model)
        # set model to eval
        self.model = model.eval()

    def prepare_dataloader(self, input_df, batch_size, max_length):
        """
        Tokenize ``input_df`` and store the resulting DataLoader on ``self.dataloader``.

        *arguments*
        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
        - batch_size: the batch size of dataloader output
        - max_length: length of tokenizer output
        """
        print("preparing dataloader")

        # convert each dataframe row into a dictionary
        # outputs a list of dictionaries
        def _process_df(df):
            return [{
                'input': f"<DESC>{row['tag_description']}<DESC>",
                'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
            } for _, row in df.iterrows()]

        def _preprocess_function(example):
            source_text = example['input']
            target_text = example['output']
            # text_target sets the corresponding label to inputs
            # there is no need to create a separate 'labels'
            return self.tokenizer(
                source_text,
                text_target=target_text,
                max_length=max_length,
                return_tensors="pt",
                padding='max_length',
                truncation=True,
            )

        test_dataset = Dataset.from_list(_process_df(input_df))

        # map applies the function to batches of "rows" in the dataset
        datasets = test_dataset.map(
            _preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=test_dataset.column_names,
        )
        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

        # create dataloader
        self.dataloader = DataLoader(datasets, batch_size=batch_size)

    def generate(self, device=None):
        """Generate predictions for every batch in ``self.dataloader`` and decode them.

        *arguments*
        - device: optional torch device (or device string) to run on. When
          omitted, the second GPU ('cuda:1') is used if more than one GPU is
          visible, otherwise 'cuda:0', otherwise the CPU.

        *returns*
        - (thing_prediction_list, property_prediction_list): two lists in
          dataloader iteration order. An entry is None when the generated
          sequence lacks the corresponding start or end marker token.
        """
        if device is None:
            if torch.cuda.is_available():
                # 'cuda:1' was previously hard-coded, which raises on hosts
                # that expose a single GPU; fall back to 'cuda:0' there.
                gpu_index = 1 if torch.cuda.device_count() > 1 else 0
                device = torch.device(f'cuda:{gpu_index}')
            else:
                device = torch.device('cpu')
        else:
            device = torch.device(device)

        MAX_GENERATE_LENGTH = 128

        # moving the model is loop-invariant, so do it once up front
        self.model.to(device)

        pred_generations = []
        print("start generation")
        for batch in tqdm(self.dataloader):
            # Inference in batches; move inputs to the selected device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Perform inference
            with torch.no_grad():
                outputs = self.model.generate(input_ids,
                                              attention_mask=attention_mask,
                                              max_length=MAX_GENERATE_LENGTH)

            # keep one token sequence per input row, on the CPU for decoding
            pred_generations.extend(outputs.to("cpu"))

        # extract sequence and decode
        def extract_seq(tokens, start_value, end_value):
            # both markers must be present, otherwise there is nothing to slice
            if start_value not in tokens or end_value not in tokens:
                return None
            # first occurrence of each marker; the slice excludes both markers
            start_id = np.where(tokens == start_value)[0][0]
            end_id = np.where(tokens == end_value)[0][0]
            return tokens[start_id+1:end_id]

        def process_tensor_output(tokens):
            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
            p_thing = None
            p_property = None
            if thing_seq is not None:
                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
            if property_seq is not None:
                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
            return p_thing, p_property

        # decode every generated sequence into its (thing, property) pair
        thing_prediction_list = []
        property_prediction_list = []
        for tokens in pred_generations:
            p_thing, p_property = process_tensor_output(tokens)
            thing_prediction_list.append(p_thing)
            property_prediction_list.append(p_property)
        return thing_prediction_list, property_prediction_list

12
train/README.md Normal file
View File

@ -0,0 +1,12 @@
# Train
## What is this folder
This folder contains the code for training and mapping evaluation.
Each folder contains a training variation.
After training, each folder contains the checkpoint files for each fold.
The `mapping` directory contains the code to run the model on test data and
produce the CSV outputs.

View File

@ -1 +1,2 @@
__pycache__ __pycache__
exports/

View File

@ -47,7 +47,7 @@ def infer_and_select(fold):
df = pd.concat([df, df_out], axis=1) df = pd.concat([df, df_out], axis=1)
# we can save the t5 generation output here # we can save the t5 generation output here
# df.to_parquet(f"exports/fold_{fold}/t5_output.parquet") df.to_csv(f"exports/result_group_{fold}.csv")
# here we want to evaluate mapping accuracy within the valid in mdm data only # here we want to evaluate mapping accuracy within the valid in mdm data only
in_mdm = df['MDM'] in_mdm = df['MDM']

3
translation/README.md Normal file
View File

@ -0,0 +1,3 @@
# translation
These files are from the GRS paper. This code will not be used.