Chore: moved selection to post_process, mapping to test
parent 16374b9ab8
commit 18e4a5f7df
@@ -0,0 +1,9 @@
+# Post-Processing
+
+## What is this folder
+
+This folder contains the files for post-processing.
+
+We divide each processing method into its respective folder to modularize the
+post-processing methods. This makes it easier to test different methods and
+reduces coupling between stages.
@@ -1,14 +1,13 @@
 import pandas as pd
 import os
 import glob
-from inference import Inference

 # directory for checkpoints
 checkpoint_directory = '../../train/baseline'

-def infer_and_select(fold):
+def select(fold):
     # import test data
-    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    data_path = f"../../train/mapping/exports/result_group_{fold}.csv"
     df = pd.read_csv(data_path, skipinitialspace=True)

     # get target data
@@ -18,37 +17,11 @@ def infer_and_select(fold):
     train_df['thing_property'] = train_df['thing'] + " " + train_df['property']


-    ##########################################
-    # run inference
-    # checkpoint
-    # Use glob to find matching paths
-    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
-    # Use glob to find matching paths
-    # path is usually checkpoint_fold_1/checkpoint-<step number>
-    # we are guaranteed to save only 1 checkpoint from training
-    pattern = 'checkpoint-*'
-    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
-
-    infer = Inference(checkpoint_path)
-    infer.prepare_dataloader(df, batch_size=256, max_length=64)
-    thing_prediction_list, property_prediction_list = infer.generate()
-
-    # add labels too
-    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
-    # Convert the list to a Pandas DataFrame
-    df_out = pd.DataFrame({
-        'p_thing': thing_prediction_list,
-        'p_property': property_prediction_list
-    })
-    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
-    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
-    df = pd.concat([df, df_out], axis=1)
-
     ##########################################
     # Process the dataframe for selection

     # we start to cull predictions from here
-    data_master_path = f"../../data_import/exports/data_model_master_export.csv"
+    data_master_path = "../../data_import/exports/data_model_master_export.csv"
     df_master = pd.read_csv(data_master_path, skipinitialspace=True)
     data_mapping = df
     # Generate patterns
@@ -75,14 +48,14 @@ def infer_and_select(fold):



-    condition1 = df['MDM']
-    condition2 = df['p_MDM']
+    # condition1 = df['MDM']
+    # condition2 = df['p_MDM']

-    condition_correct_thing = df['p_thing'] == df['thing']
-    condition_correct_property = df['p_property'] == df['property']
-    match = sum(condition1 & condition2)
-    fn = sum(condition1 & ~condition2)
-    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)
+    # condition_correct_thing = df['p_thing'] == df['thing']
+    # condition_correct_property = df['p_property'] == df['property']
+    # match = sum(condition1 & condition2)
+    # fn = sum(condition1 & ~condition2)
+    # prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)

     # print("mdm match predicted mdm: ", match) # 56 - false negative
     # print("mdm but not predicted mdm: ", fn) # 56 - false negative
@@ -101,6 +74,17 @@ def infer_and_select(fold):
     import selection
     # importlib.reload(selection)
     selector = selection.Selector(input_df=df, reference_df=train_df, fold=fold)

+    ##########################################
+    # run inference
+    # checkpoint
+    # Use glob to find matching paths
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+    # Use glob to find matching paths
+    # path is usually checkpoint_fold_1/checkpoint-<step number>
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+    tp, tn, fp, fn = selector.run_selection(checkpoint_path=checkpoint_path)

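Note: `run_selection` returns confusion-matrix counts (tp, tn, fp, fn). A minimal sketch of turning those counts into summary metrics, assuming they are plain integers; `summarize_counts` is an illustrative helper, not repository code:

```python
# Sketch only: summarize the (tp, tn, fp, fn) counts returned by
# selector.run_selection(). Illustrative helper, not repository code.
def summarize_counts(tp: int, tn: int, fp: int, fn: int) -> dict:
    total = tp + tn + fp + fn
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return {
        'accuracy': (tp + tn) / total if total else 0.0,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
```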
@@ -126,4 +110,4 @@ with open("output.txt", "w") as f:
     print('', file=f)

 for fold in [1,2,3,4,5]:
-    infer_and_select(fold)
+    select(fold)
@@ -1,164 +0,0 @@
-import torch
-from torch.utils.data import DataLoader
-from transformers import (
-    T5TokenizerFast,
-    AutoModelForSeq2SeqLM,
-)
-import glob
-import os
-import pandas as pd
-from tqdm import tqdm
-from datasets import Dataset
-import numpy as np
-
-os.environ['TOKENIZERS_PARALLELISM'] = 'false'
-
-
-class Inference():
-    tokenizer: T5TokenizerFast
-    model: torch.nn.Module
-    dataloader: DataLoader
-
-    def __init__(self, checkpoint_path):
-        self._create_tokenizer()
-        self._load_model(checkpoint_path)
-
-
-    def _create_tokenizer(self):
-        # %%
-        # load tokenizer
-        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
-        # Define additional special tokens
-        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
-        # Add the additional special tokens to the tokenizer
-        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
-
-    def _load_model(self, checkpoint_path: str):
-        # load model
-        # Define the directory and the pattern
-        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
-        model = torch.compile(model)
-        # set model to eval
-        self.model = model.eval()
-
-
-    def prepare_dataloader(self, input_df, batch_size, max_length):
-        """
-        *arguments*
-        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
-        - batch_size: the batch size of dataloader output
-        - max_length: length of tokenizer output
-        """
-        print("preparing dataloader")
-        # convert each dataframe row into a dictionary
-        # outputs a list of dictionaries
-        def _process_df(df):
-            output_list = [{
-                'input': f"<DESC>{row['tag_description']}<DESC>",
-                'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
-            } for _, row in df.iterrows()]
-
-            return output_list
-
-        def _preprocess_function(example):
-            input = example['input']
-            target = example['output']
-            # text_target sets the corresponding label to inputs
-            # there is no need to create a separate 'labels'
-            model_inputs = self.tokenizer(
-                input,
-                text_target=target,
-                max_length=max_length,
-                return_tensors="pt",
-                padding='max_length',
-                truncation=True,
-            )
-            return model_inputs
-
-        test_dataset = Dataset.from_list(_process_df(input_df))
-
-        # map maps function to each "row" in the dataset
-        # aka the data in the immediate nesting
-        datasets = test_dataset.map(
-            _preprocess_function,
-            batched=True,
-            num_proc=1,
-            remove_columns=test_dataset.column_names,
-        )
-        # datasets = _preprocess_function(test_dataset)
-        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
-
-        # create dataloader
-        self.dataloader = DataLoader(datasets, batch_size=batch_size)
-
-
-    def generate(self):
-        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
-        MAX_GENERATE_LENGTH = 128
-
-        pred_generations = []
-        pred_labels = []
-
-        print("start generation")
-        for batch in tqdm(self.dataloader):
-            # Inference in batches
-            input_ids = batch['input_ids']
-            attention_mask = batch['attention_mask']
-            # save labels too
-            pred_labels.extend(batch['labels'])
-
-            # Move to GPU if available
-            input_ids = input_ids.to(device)
-            attention_mask = attention_mask.to(device)
-            self.model.to(device)
-
-            # Perform inference
-            with torch.no_grad():
-                outputs = self.model.generate(input_ids,
-                                              attention_mask=attention_mask,
-                                              max_length=MAX_GENERATE_LENGTH)
-
-            # Decode the output and print the results
-            pred_generations.extend(outputs.to("cpu"))
-
-        # %%
-        # extract sequence and decode
-        def extract_seq(tokens, start_value, end_value):
-            if start_value not in tokens or end_value not in tokens:
-                return None  # Or handle this case according to your requirements
-            start_id = np.where(tokens == start_value)[0][0]
-            end_id = np.where(tokens == end_value)[0][0]
-
-            return tokens[start_id+1:end_id]
-
-        def process_tensor_output(tokens):
-            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>
-            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
-            p_thing = None
-            p_property = None
-            if (thing_seq is not None):
-                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
-            if (property_seq is not None):
-                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
-            return p_thing, p_property
-
-        # decode prediction labels
-        def decode_preds(tokens_list):
-            thing_prediction_list = []
-            property_prediction_list = []
-            for tokens in tokens_list:
-                p_thing, p_property = process_tensor_output(tokens)
-                thing_prediction_list.append(p_thing)
-                property_prediction_list.append(p_property)
-            return thing_prediction_list, property_prediction_list
-
-        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
-        return thing_prediction_list, property_prediction_list
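For reference, a minimal sketch of how the `Inference` class removed above was driven by its caller; the method calls and arguments come from the diff, while the checkpoint path and dataframe values are illustrative:

```python
# Sketch only: end-to-end use of the removed Inference class.
# The checkpoint path and row values below are illustrative.
import pandas as pd

df = pd.DataFrame({
    'tag_description': ['GEN ENG EXH GAS OUT TEMP'],  # hypothetical input text
    'thing': ['GeneratorEngine'],                     # hypothetical target label
    'property': ['ExhaustGasOutletTemp'],             # hypothetical target label
})

infer = Inference('../../train/baseline/checkpoint_fold_1/checkpoint-1000')
infer.prepare_dataloader(df, batch_size=256, max_length=64)
# one decoded <THING_*> / <PROPERTY_*> span per input row; None when a
# marker pair is missing from the generated sequence
p_thing_list, p_property_list = infer.generate()
```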
@@ -0,0 +1,12 @@
+# Train
+
+## What is this folder
+
+This folder contains the code for training and mapping evaluation.
+
+Each folder contains a training variation.
+
+After training, each folder contains the checkpoint files for each fold.
+
+The `mapping` directory contains the code to run the model on test data and
+produce the csv outputs.
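Taken together with the checkpoint-globbing code above, the layout this README implies looks roughly like the following; fold and step numbers are illustrative:

```
train/
  baseline/                  # one folder per training variation
    checkpoint_fold_1/
      checkpoint-1000/       # the single checkpoint saved for this fold
    checkpoint_fold_2/
      checkpoint-2000/
  mapping/
    exports/
      result_group_1.csv     # csv outputs consumed by post_process
```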
@@ -1 +1,2 @@
 __pycache__
+exports/
@@ -47,7 +47,7 @@ def infer_and_select(fold):
     df = pd.concat([df, df_out], axis=1)

     # we can save the t5 generation output here
-    # df.to_parquet(f"exports/fold_{fold}/t5_output.parquet")
+    df.to_csv(f"exports/result_group_{fold}.csv")

     # here we want to evaluate mapping accuracy within the valid in mdm data only
     in_mdm = df['MDM']
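The accuracy computation itself sits below this hunk and is not shown; a plausible sketch of evaluating mapping accuracy within the in-MDM rows only, using column names from the diff (the exact repository expression may differ):

```python
# Sketch only: mapping accuracy restricted to rows where df['MDM'] is True.
# Column names come from the hunk above; the exact repo code may differ.
in_mdm = df['MDM']
correct = (df['p_thing'] == df['thing']) & (df['p_property'] == df['property'])
accuracy_in_mdm = (correct & in_mdm).sum() / in_mdm.sum()
print(f"mapping accuracy within MDM: {accuracy_in_mdm:.4f}")
```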
@@ -0,0 +1,3 @@
+# translation
+
+These files are from the GRS paper. This code will not be used.