Chore: moved selection to post_process, mapping to test
This commit is contained in:
parent
16374b9ab8
commit
18e4a5f7df
|
@ -0,0 +1,9 @@
|
||||||
|
# Post-Processing
|
||||||
|
|
||||||
|
## What is this folder
|
||||||
|
|
||||||
|
This folder contains the files for post-processing.
|
||||||
|
|
||||||
|
We place each processing method in its own folder to modularize the
|
||||||
|
post-processing methods. This helps to make it easier to test different methods
|
||||||
|
and reduce coupling between stages.
|
|
@ -1,14 +1,13 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import os
|
import os
|
||||||
import glob
|
import glob
|
||||||
from inference import Inference
|
|
||||||
|
|
||||||
# directory for checkpoints
|
# directory for checkpoints
|
||||||
checkpoint_directory = '../../train/baseline'
|
checkpoint_directory = '../../train/baseline'
|
||||||
|
|
||||||
def infer_and_select(fold):
|
def select(fold):
|
||||||
# import test data
|
# import test data
|
||||||
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
|
data_path = f"../../train/mapping/exports/result_group_{fold}.csv"
|
||||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
|
||||||
# get target data
|
# get target data
|
||||||
|
@ -18,37 +17,11 @@ def infer_and_select(fold):
|
||||||
train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
|
train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
|
||||||
|
|
||||||
|
|
||||||
##########################################
|
|
||||||
# run inference
|
|
||||||
# checkpoint
|
|
||||||
# Use glob to find matching paths
|
|
||||||
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
|
|
||||||
# Use glob to find matching paths
|
|
||||||
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
|
||||||
# we are guaranteed to save only 1 checkpoint from training
|
|
||||||
pattern = 'checkpoint-*'
|
|
||||||
checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
|
|
||||||
|
|
||||||
infer = Inference(checkpoint_path)
|
|
||||||
infer.prepare_dataloader(df, batch_size=256, max_length=64)
|
|
||||||
thing_prediction_list, property_prediction_list = infer.generate()
|
|
||||||
|
|
||||||
# add labels too
|
|
||||||
# thing_actual_list, property_actual_list = decode_preds(pred_labels)
|
|
||||||
# Convert the list to a Pandas DataFrame
|
|
||||||
df_out = pd.DataFrame({
|
|
||||||
'p_thing': thing_prediction_list,
|
|
||||||
'p_property': property_prediction_list
|
|
||||||
})
|
|
||||||
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
|
|
||||||
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
|
|
||||||
df = pd.concat([df, df_out], axis=1)
|
|
||||||
|
|
||||||
##########################################
|
##########################################
|
||||||
# Process the dataframe for selection
|
# Process the dataframe for selection
|
||||||
|
|
||||||
# we start to cull predictions from here
|
# we start to cull predictions from here
|
||||||
data_master_path = f"../../data_import/exports/data_model_master_export.csv"
|
data_master_path = "../../data_import/exports/data_model_master_export.csv"
|
||||||
df_master = pd.read_csv(data_master_path, skipinitialspace=True)
|
df_master = pd.read_csv(data_master_path, skipinitialspace=True)
|
||||||
data_mapping = df
|
data_mapping = df
|
||||||
# Generate patterns
|
# Generate patterns
|
||||||
|
@ -75,14 +48,14 @@ def infer_and_select(fold):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
condition1 = df['MDM']
|
# condition1 = df['MDM']
|
||||||
condition2 = df['p_MDM']
|
# condition2 = df['p_MDM']
|
||||||
|
|
||||||
condition_correct_thing = df['p_thing'] == df['thing']
|
# condition_correct_thing = df['p_thing'] == df['thing']
|
||||||
condition_correct_property = df['p_property'] == df['property']
|
# condition_correct_property = df['p_property'] == df['property']
|
||||||
match = sum(condition1 & condition2)
|
# match = sum(condition1 & condition2)
|
||||||
fn = sum(condition1 & ~condition2)
|
# fn = sum(condition1 & ~condition2)
|
||||||
prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)
|
# prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)
|
||||||
|
|
||||||
# print("mdm match predicted mdm: ", match) # 56 - false negative
|
# print("mdm match predicted mdm: ", match) # 56 - false negative
|
||||||
# print("mdm but not predicted mdm: ", fn) # 56 - false negative
|
# print("mdm but not predicted mdm: ", fn) # 56 - false negative
|
||||||
|
@ -101,6 +74,17 @@ def infer_and_select(fold):
|
||||||
import selection
|
import selection
|
||||||
# importlib.reload(selection)
|
# importlib.reload(selection)
|
||||||
selector = selection.Selector(input_df=df, reference_df=train_df, fold=fold)
|
selector = selection.Selector(input_df=df, reference_df=train_df, fold=fold)
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
# run inference
|
||||||
|
# checkpoint
|
||||||
|
# Use glob to find matching paths
|
||||||
|
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
|
||||||
|
# Use glob to find matching paths
|
||||||
|
# path is usually checkpoint_fold_1/checkpoint-<step number>
|
||||||
|
# we are guaranteed to save only 1 checkpoint from training
|
||||||
|
pattern = 'checkpoint-*'
|
||||||
|
checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
|
||||||
tp, tn, fp, fn = selector.run_selection(checkpoint_path=checkpoint_path)
|
tp, tn, fp, fn = selector.run_selection(checkpoint_path=checkpoint_path)
|
||||||
|
|
||||||
|
|
||||||
|
@ -126,4 +110,4 @@ with open("output.txt", "w") as f:
|
||||||
print('', file=f)
|
print('', file=f)
|
||||||
|
|
||||||
for fold in [1,2,3,4,5]:
|
for fold in [1,2,3,4,5]:
|
||||||
infer_and_select(fold)
|
select(fold)
|
|
@ -1,164 +0,0 @@
|
||||||
import torch
|
|
||||||
from torch.utils.data import DataLoader
|
|
||||||
from transformers import (
|
|
||||||
T5TokenizerFast,
|
|
||||||
AutoModelForSeq2SeqLM,
|
|
||||||
)
|
|
||||||
import glob
|
|
||||||
import os
|
|
||||||
import pandas as pd
|
|
||||||
from tqdm import tqdm
|
|
||||||
from datasets import Dataset
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
|
||||||
|
|
||||||
|
|
||||||
class Inference():
    """Generate thing/property predictions from a fine-tuned seq2seq checkpoint.

    Typical use:
        infer = Inference(checkpoint_path)
        infer.prepare_dataloader(df, batch_size=256, max_length=64)
        things, properties = infer.generate()
    """

    # Quoted so that defining the class does not evaluate these names.
    tokenizer: "T5TokenizerFast"
    model: "torch.nn.Module"
    dataloader: "DataLoader"

    # Token ids of the span delimiters in generated sequences:
    # 32100 = <THING_START>, 32101 = <THING_END>
    _THING_SPAN = (32100, 32101)
    # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
    _PROPERTY_SPAN = (32102, 32103)
    # Upper bound passed to model.generate() as max_length.
    _MAX_GENERATE_LENGTH = 128

    def __init__(self, checkpoint_path):
        """Build the tokenizer and load the model stored at *checkpoint_path*."""
        self._create_tokenizer()
        self._load_model(checkpoint_path)

    def _create_tokenizer(self):
        """Load the t5-small tokenizer and register the extra special tokens."""
        self.tokenizer = T5TokenizerFast.from_pretrained(
            "t5-small",
            return_tensors="pt",
            clean_up_tokenization_spaces=True,
        )
        # NOTE(review): "SIG", "UNIT" and "DATA_TYPE" have no angle brackets,
        # unlike the other tokens. They are left untouched here because the
        # list presumably has to match the one used at training time - confirm.
        additional_special_tokens = [
            "<THING_START>", "<THING_END>",
            "<PROPERTY_START>", "<PROPERTY_END>",
            "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE",
        ]
        self.tokenizer.add_special_tokens(
            {"additional_special_tokens": additional_special_tokens}
        )

    def _load_model(self, checkpoint_path: str):
        """Load the seq2seq model from *checkpoint_path*, compile it, set eval mode."""
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
        model = torch.compile(model)
        self.model = model.eval()

    def prepare_dataloader(self, input_df, batch_size, max_length):
        """Tokenize *input_df* and store a DataLoader over it in ``self.dataloader``.

        *arguments*
        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
        - batch_size: the batch size of dataloader output
        - max_length: length of tokenizer output
        """
        print("preparing dataloader")

        def _process_df(df):
            # convert each dataframe row into a dictionary;
            # outputs a list of dictionaries
            return [
                {
                    'input': f"<DESC>{row['tag_description']}<DESC>",
                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
                }
                for _, row in df.iterrows()
            ]

        def _preprocess_function(example):
            source_text = example['input']
            target_text = example['output']
            # text_target sets the corresponding label to inputs,
            # so there is no need to create a separate 'labels'
            return self.tokenizer(
                source_text,
                text_target=target_text,
                max_length=max_length,
                return_tensors="pt",
                padding='max_length',
                truncation=True,
            )

        test_dataset = Dataset.from_list(_process_df(input_df))

        # map applies the function to batches of rows in the dataset
        datasets = test_dataset.map(
            _preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=test_dataset.column_names,
        )
        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

        self.dataloader = DataLoader(datasets, batch_size=batch_size)

    @staticmethod
    def _select_device():
        """Return the device used for generation.

        Uses 'cuda:1' when at least two CUDA devices are present, 'cuda:0'
        when there is exactly one, and the CPU otherwise.
        """
        if not torch.cuda.is_available():
            return torch.device('cpu')
        index = 1 if torch.cuda.device_count() > 1 else 0
        return torch.device(f'cuda:{index}')

    @staticmethod
    def _extract_seq(tokens, start_value, end_value):
        """Return the tokens strictly between the first start and first end marker.

        Returns None when either marker is absent from *tokens*.
        """
        if start_value not in tokens or end_value not in tokens:
            return None
        start_id = np.where(tokens == start_value)[0][0]
        end_id = np.where(tokens == end_value)[0][0]
        return tokens[start_id + 1:end_id]

    def _decode_prediction(self, tokens):
        """Decode one generated sequence into ``(p_thing, p_property)``.

        An element is None when its delimiter pair is missing from *tokens*.
        """
        thing_seq = self._extract_seq(tokens, *self._THING_SPAN)
        property_seq = self._extract_seq(tokens, *self._PROPERTY_SPAN)
        p_thing = None
        p_property = None
        if thing_seq is not None:
            p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
        if property_seq is not None:
            p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
        return p_thing, p_property

    def generate(self):
        """Run generation over ``self.dataloader`` and decode the outputs.

        Returns a pair ``(thing_prediction_list, property_prediction_list)``
        with one entry per generated sequence; an entry is None when the
        corresponding delimiters were not generated.
        """
        device = self._select_device()
        # The model only needs to be moved once, not once per batch.
        self.model.to(device)

        pred_generations = []

        print("start generation")
        for batch in tqdm(self.dataloader):
            # Inference in batches, on the selected device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            with torch.no_grad():
                outputs = self.model.generate(input_ids,
                                              attention_mask=attention_mask,
                                              max_length=self._MAX_GENERATE_LENGTH)

            # keep each generated sequence on the CPU for decoding
            pred_generations.extend(outputs.to("cpu"))

        thing_prediction_list = []
        property_prediction_list = []
        for tokens in pred_generations:
            p_thing, p_property = self._decode_prediction(tokens)
            thing_prediction_list.append(p_thing)
            property_prediction_list.append(p_property)

        return thing_prediction_list, property_prediction_list
|
|
||||||
|
|
|
@ -0,0 +1,12 @@
|
||||||
|
# Train
|
||||||
|
|
||||||
|
## What is this folder
|
||||||
|
|
||||||
|
This folder contains the code for training and mapping evaluation.
|
||||||
|
|
||||||
|
Each folder contains a training variation.
|
||||||
|
|
||||||
|
After training, each folder contains the checkpoint files for each fold.
|
||||||
|
|
||||||
|
The `mapping` directory contains the code to run the model on test data and also
|
||||||
|
produce the csv outputs.
|
|
@ -1 +1,2 @@
|
||||||
__pycache__
|
__pycache__
|
||||||
|
exports/
|
|
@ -47,7 +47,7 @@ def infer_and_select(fold):
|
||||||
df = pd.concat([df, df_out], axis=1)
|
df = pd.concat([df, df_out], axis=1)
|
||||||
|
|
||||||
# we can save the t5 generation output here
|
# we can save the t5 generation output here
|
||||||
# df.to_parquet(f"exports/fold_{fold}/t5_output.parquet")
|
df.to_csv(f"exports/result_group_{fold}.csv")
|
||||||
|
|
||||||
# here we want to evaluate mapping accuracy within the valid in mdm data only
|
# here we want to evaluate mapping accuracy within the valid in mdm data only
|
||||||
in_mdm = df['MDM']
|
in_mdm = df['MDM']
|
|
@ -0,0 +1,3 @@
|
||||||
|
# translation
|
||||||
|
|
||||||
|
These files are from the GRS paper. This code will not be used.
|
Loading…
Reference in New Issue