Feat: implement hybrid fine-tuning of encoder and decoder networks

separately
This commit is contained in:
Richard Wong 2024-12-10 23:40:10 +09:00
parent 446ed1429c
commit b01ca4f395
14 changed files with 924 additions and 26 deletions

1
.gitignore vendored
View File

@ -0,0 +1 @@
*.zip

View File

@ -1,4 +1,5 @@
models models
raw_data.csv raw_data.csv
train_all.csv train_all.csv
__pycache__ __pycache__
*.pt

View File

@ -31,8 +31,9 @@ class BertEmbedder:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint) model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu" # device = "cpu"
model.to(self.device) self.model = model.to(self.device)
self.model = model.eval() self.model = self.model.eval()
# self.model = torch.compile(self.model)
def make_embedding(self, batch_size=128): def make_embedding(self, batch_size=128):
@ -75,10 +76,11 @@ class T5Embedder:
self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu" # device = "cpu"
model.to(self.device) model.to(self.device)
self.model = model.eval() self.model = model.eval()
self.model = torch.compile(self.model)
@ -251,10 +253,26 @@ def run_deduplication(
# generate your embeddings # generate your embeddings
checkpoint_path = 'models/bert_model' checkpoint_path = 'models/bert_model'
# cache embeddings
file_path = "train_embeds.pt"
if os.path.exists(file_path):
# Load the tensor if the file exists
tensor = torch.load(file_path, weights_only=True)
print("Loaded tensor")
else:
# Create and save the tensor if the file doesn't exist
print('generate train embeddings')
train_embedder = Embedder(input_df=train_df, batch_size=batch_size)
tensor = train_embedder.make_embedding(checkpoint_path)
torch.save(tensor, file_path, weights_only=True)
print("Tensor saved to file.")
train_embeds = tensor
# if we can, we can cache the train embeddings and load directly
# we can generate the train embeddings once and re-use for every ship # we can generate the train embeddings once and re-use for every ship
print('generate train embeddings')
train_embedder = Embedder(input_df=train_df, batch_size=batch_size)
train_embeds = train_embedder.make_embedding(checkpoint_path)
# generate new embeddings for each ship # generate new embeddings for each ship
print('generate test embeddings') print('generate test embeddings')

View File

@ -3,11 +3,14 @@ from torch.utils.data import DataLoader
from transformers import ( from transformers import (
T5TokenizerFast, T5TokenizerFast,
AutoModelForSeq2SeqLM, AutoModelForSeq2SeqLM,
StoppingCriteria,
StoppingCriteriaList,
) )
import os import os
from tqdm import tqdm from tqdm import tqdm
from datasets import Dataset from datasets import Dataset
import numpy as np import numpy as np
from torch.nn.utils.rnn import pad_sequence
os.environ['TOKENIZERS_PARALLELISM'] = 'false' os.environ['TOKENIZERS_PARALLELISM'] = 'false'
@ -35,9 +38,11 @@ class Mapper():
# load model # load model
# Define the directory and the pattern # Define the directory and the pattern
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path) model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
model = torch.compile(model)
# set model to eval # set model to eval
self.model = model.eval() self.model = model.eval()
self.model.cuda()
# self.model = torch.compile(self.model)
@ -59,8 +64,8 @@ class Mapper():
desc = f"<DESC>{row['tag_description']}<DESC>" desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>" unit = f"<UNIT>{row['unit']}<UNIT>"
element = { element = {
'input' : f"{desc}{unit}", 'input' : f"{desc}{unit}"
'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>", # 'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
} }
output_list.append(element) output_list.append(element)
@ -68,16 +73,16 @@ class Mapper():
def _preprocess_function(example): def _preprocess_function(example):
input = example['input'] input = example['input']
target = example['output'] # target = example['output']
# text_target sets the corresponding label to inputs # text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels' # there is no need to create a separate 'labels'
model_inputs = self.tokenizer( model_inputs = self.tokenizer(
input, input,
text_target=target, # text_target=target,
max_length=max_length, # max_length=max_length,
return_tensors="pt", padding=True,
padding='max_length',
truncation=True, truncation=True,
return_tensors="pt",
) )
return model_inputs return model_inputs
@ -93,19 +98,55 @@ class Mapper():
remove_columns=test_dataset.column_names, remove_columns=test_dataset.column_names,
) )
# datasets = _preprocess_function(test_dataset) # datasets = _preprocess_function(test_dataset)
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) datasets.set_format(type='torch', columns=['input_ids', 'attention_mask'])
def _custom_collate_fn(batch):
# Extract data and targets separately if needed
inputs = [item['input_ids'] for item in batch]
attention_masks = [item['attention_mask'] for item in batch]
# Pad data to the same length
padded_inputs = pad_sequence(inputs, batch_first=True)
padded_attention_masks = pad_sequence(attention_masks, batch_first=True)
return {'input_ids': padded_inputs, 'attention_mask': padded_attention_masks}
# create dataloader # create dataloader
self.dataloader = DataLoader(datasets, batch_size=batch_size) self.dataloader = DataLoader(
datasets,
batch_size=batch_size,
collate_fn=_custom_collate_fn
)
def generate(self): def generate(self):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_GENERATE_LENGTH = 128 MAX_GENERATE_LENGTH = 120
pred_generations = [] pred_generations = []
pred_labels = [] # pred_labels = []
self.model.cuda() # self.model already assigned to device
# self.model.cuda()
# introduce early stopping so that it doesn't have to generate max length
class StopOnEndToken(StoppingCriteria):
def __init__(self, end_token_id):
self.end_token_id = end_token_id
def __call__(self, input_ids, scores, **kwargs):
# Check if the last token in any sequence is the end token
batch_stopped = input_ids[:, -1] == self.end_token_id
# only stop if all have reached end token
if batch_stopped.all():
return True # Stop generation for the entire batch
return False
# Define the end token ID (e.g., the ID for <eos>)
end_token_id = 32103 # property end token
# Use the stopping criteria
stopping_criteria = StoppingCriteriaList([StopOnEndToken(end_token_id)])
print("start generation") print("start generation")
for batch in tqdm(self.dataloader): for batch in tqdm(self.dataloader):
@ -113,7 +154,7 @@ class Mapper():
input_ids = batch['input_ids'] input_ids = batch['input_ids']
attention_mask = batch['attention_mask'] attention_mask = batch['attention_mask']
# save labels too # save labels too
pred_labels.extend(batch['labels']) # pred_labels.extend(batch['labels'])
# Move to GPU if available # Move to GPU if available
@ -126,7 +167,11 @@ class Mapper():
with torch.no_grad(): with torch.no_grad():
outputs = self.model.generate(input_ids, outputs = self.model.generate(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
max_length=MAX_GENERATE_LENGTH) max_length=MAX_GENERATE_LENGTH,
# eos_token_id=32103,
# early_stopping=True,
use_cache=True,
stopping_criteria=stopping_criteria)
# Decode the output and print the results # Decode the output and print the results
pred_generations.extend(outputs.to("cpu")) pred_generations.extend(outputs.to("cpu"))

View File

@ -7,8 +7,8 @@ from preprocess import Abbreviator
from deduplication import run_deduplication from deduplication import run_deduplication
# global config # global config
BATCH_SIZE = 1024 BATCH_SIZE = 512
SHIPS_LIST = [1000,1001,1003] SHIPS_LIST = [1000,1001,1003,1004]
# %% # %%
# START: we import the raw data csv and extract only a few ships from it to simulate incoming json # START: we import the raw data csv and extract only a few ships from it to simulate incoming json
@ -17,7 +17,8 @@ full_df = pd.read_csv(data_path, skipinitialspace=True)
# subset ships only to that found in SHIPS_LIST # subset ships only to that found in SHIPS_LIST
df = full_df[full_df['ships_idx'].isin(SHIPS_LIST)].reset_index(drop=True) df = full_df[full_df['ships_idx'].isin(SHIPS_LIST)].reset_index(drop=True)
num_rows = 2000 # test parameters
num_rows = len(df) - 1
df = df[:num_rows] df = df[:num_rows]
print(len(df)) print(len(df))
@ -58,7 +59,7 @@ df = run_deduplication(
test_df=df, test_df=df,
train_df=train_df, train_df=train_df,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
threshold=0.9, threshold=0.85,
diagnostic=True) diagnostic=True)
# %% # %%

View File

@ -0,0 +1,2 @@
checkpoint*
tensorboard-log

View File

@ -0,0 +1 @@
__pycache__

View File

@ -0,0 +1,125 @@
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import (
T5PreTrainedModel,
T5Model
)
from transformers.modeling_outputs import (
SequenceClassifierOutput,
)
def mean_pooling(encoder_outputs, attention_mask):
"""
Perform mean pooling over encoder outputs, considering the attention mask.
"""
hidden_states = encoder_outputs.last_hidden_state # Shape: (batch_size, seq_length, hidden_size)
mask = attention_mask.unsqueeze(-1) # Shape: (batch_size, seq_length, 1)
masked_hidden_states = hidden_states * mask # Zero out padding tokens
sum_hidden_states = masked_hidden_states.sum(dim=1) # Sum over sequence length
sum_mask = mask.sum(dim=1) # Sum the mask (number of non-padding tokens)
return sum_hidden_states / sum_mask # Mean pooling
class T5EncoderForSequenceClassification(T5PreTrainedModel):
def __init__(self, checkpoint, tokenizer, config, num_labels):
super().__init__(config)
self.num_labels = num_labels
self.config = config
# we force the loading of a pre-trained model here
self.t5 = T5Model.from_pretrained(checkpoint)
self.t5.resize_token_embeddings(len(tokenizer))
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, self.num_labels)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# encoder_outputs = self.t5.encoder(
# input_ids,
# attention_mask=attention_mask,
# head_mask=head_mask,
# inputs_embeds=inputs_embeds,
# output_attentions=output_attentions,
# output_hidden_states=output_hidden_states,
# return_dict=return_dict,
# )
encoder_outputs = self.t5.encoder(input_ids, attention_mask=attention_mask)
# last_hidden_state = encoder_outputs.last_hidden_state
# use mean of hidden state
# pooled_output = mean_pooling(encoder_outputs, attention_mask)
# Use the hidden state of the first token as the sequence representation
pooled_output = encoder_outputs.last_hidden_state[:, 0, :] # Shape: (batch_size, hidden_size)
# pooled_output = encoder_outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + encoder_outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)

View File

@ -0,0 +1,2 @@
__pycache__
exports/

View File

@ -0,0 +1,168 @@
import torch
from torch.utils.data import DataLoader
from transformers import (
T5TokenizerFast,
AutoModelForSeq2SeqLM,
)
import os
from tqdm import tqdm
from datasets import Dataset
import numpy as np
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
class Inference():
tokenizer: T5TokenizerFast
model: torch.nn.Module
dataloader: DataLoader
def __init__(self, checkpoint_path):
self._create_tokenizer()
self._load_model(checkpoint_path)
def _create_tokenizer(self):
# %%
# load tokenizer
self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
# Add the additional special tokens to the tokenizer
self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
def _load_model(self, checkpoint_path: str):
# load model
# Define the directory and the pattern
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
model = torch.compile(model)
# set model to eval
self.model = model.eval()
def prepare_dataloader(self, input_df, batch_size, max_length):
"""
*arguments*
- input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
- batch_size: the batch size of dataloader output
- max_length: length of tokenizer output
"""
print("preparing dataloader")
# convert each dataframe row into a dictionary
# outputs a list of dictionaries
def _process_df(df):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
element = {
'input' : f"{desc}{unit}",
'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
}
output_list.append(element)
return output_list
def _preprocess_function(example):
input = example['input']
target = example['output']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = self.tokenizer(
input,
text_target=target,
max_length=max_length,
return_tensors="pt",
padding="max_length",
truncation=True,
)
return model_inputs
test_dataset = Dataset.from_list(_process_df(input_df))
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
datasets = test_dataset.map(
_preprocess_function,
batched=True,
num_proc=1,
remove_columns=test_dataset.column_names,
)
# datasets = _preprocess_function(test_dataset)
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# create dataloader
self.dataloader = DataLoader(datasets, batch_size=batch_size)
def generate(self):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_GENERATE_LENGTH = 128
pred_generations = []
pred_labels = []
print("start generation")
for batch in tqdm(self.dataloader):
# Inference in batches
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
# save labels too
pred_labels.extend(batch['labels'])
# Move to GPU if available
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
self.model.to(device)
# Perform inference
with torch.no_grad():
outputs = self.model.generate(input_ids,
attention_mask=attention_mask,
max_length=MAX_GENERATE_LENGTH)
# Decode the output and print the results
pred_generations.extend(outputs.to("cpu"))
# %%
# extract sequence and decode
def extract_seq(tokens, start_value, end_value):
if start_value not in tokens or end_value not in tokens:
return None # Or handle this case according to your requirements
start_id = np.where(tokens == start_value)[0][0]
end_id = np.where(tokens == end_value)[0][0]
return tokens[start_id+1:end_id]
def process_tensor_output(tokens):
thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END>
property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
p_thing = None
p_property = None
if (thing_seq is not None):
p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
if (property_seq is not None):
p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
return p_thing, p_property
# decode prediction labels
def decode_preds(tokens_list):
thing_prediction_list = []
property_prediction_list = []
for tokens in tokens_list:
p_thing, p_property = process_tensor_output(tokens)
thing_prediction_list.append(p_thing)
property_prediction_list.append(p_property)
return thing_prediction_list, property_prediction_list
thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
return thing_prediction_list, property_prediction_list

View File

@ -0,0 +1,6 @@
Accuracy for fold 1: 0.9427354472314246
Accuracy for fold 2: 0.8859813084112149
Accuracy for fold 3: 0.9683734939759037
Accuracy for fold 4: 0.9762131303520457
Accuracy for fold 5: 0.907924874026569

View File

@ -0,0 +1,73 @@
import pandas as pd
import os
import glob
from inference import Inference
checkpoint_directory = '../'
BATCH_SIZE = 512
def infer_and_select(fold):
print(f"Inference for fold {fold}")
# import test data
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
# get target data
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
# processing to help with selection later
train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
##########################################
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
infer = Inference(checkpoint_path)
infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
thing_prediction_list, property_prediction_list = infer.generate()
# add labels too
# thing_actual_list, property_actual_list = decode_preds(pred_labels)
# Convert the list to a Pandas DataFrame
df_out = pd.DataFrame({
'p_thing': thing_prediction_list,
'p_property': property_prediction_list
})
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
df = pd.concat([df, df_out], axis=1)
# we can save the t5 generation output here
df.to_csv(f"exports/result_group_{fold}.csv", index=False)
# here we want to evaluate mapping accuracy within the valid in mdm data only
in_mdm = df['MDM']
condition_correct_thing = df['p_thing'] == df['thing']
condition_correct_property = df['p_property'] == df['property']
prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm)
pred_correct_proportion = prediction_mdm_correct/sum(in_mdm)
# write output to file output.txt
with open("output.txt", "a") as f:
print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
###########################################
# Execute for all folds
# reset file before writing to it
with open("output.txt", "w") as f:
print('', file=f)
for fold in [1,2,3,4,5]:
infer_and_select(fold)

View File

@ -0,0 +1,227 @@
# %%
# from datasets import load_from_disk
import os
import glob
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from custom_t5.modeling_t5 import T5EncoderForSequenceClassification
from safetensors.torch import load_file
from transformers import (
T5Config,
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments,
T5ForConditionalGeneration,
T5Model
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
element = {
'input' : f"{desc}{unit}",
'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
}
output_list.append(element)
return output_list
def create_split_dataset(fold):
# train
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
# valid
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
validation_df = pd.read_csv(data_path, skipinitialspace=True)
combined_data = DatasetDict({
'train': Dataset.from_list(process_df_to_dict(train_df)),
'validation' : Dataset.from_list(process_df_to_dict(validation_df)),
})
return combined_data
# function to perform training for a given fold
def train(fold):
save_path = f'checkpoint_fold_{fold}b'
split_datasets = create_split_dataset(fold)
# prepare tokenizer
model_checkpoint = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
max_length = 120
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['input']
target = example['output']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
text_target=target,
max_length=max_length,
truncation=True,
padding="max_length"
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns=split_datasets["train"].column_names,
)
# https://github.com/huggingface/transformers/pull/28414
# model_checkpoint = "google/t5-efficient-tiny"
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
directory = os.path.join(".", f'checkpoint_fold_{fold}a')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
# t5_classify = T5Model.from_pretrained(prev_checkpoint)
# Load the checkpoint
checkpoint_path = f"{prev_checkpoint}/model.safetensors"
checkpoint = load_file(checkpoint_path)
# Filter out weights related to the classification head
t5_weights = {key: value for key, value in checkpoint.items() if "classifier" not in key}
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
model.load_state_dict(state_dict=t5_weights, strict=False)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
# Freeze the encoder
for param in model.encoder.parameters():
param.requires_grad = False
# Freeze the shared embedding layer
for param in model.shared.parameters():
param.requires_grad = False
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")
def compute_metrics(eval_preds):
preds, labels = eval_preds
# In case the model returns more than the prediction logits
if isinstance(preds, tuple):
preds = preds[0]
decoded_preds = tokenizer.batch_decode(preds,
skip_special_tokens=False)
# Replace -100s in the labels as we can't decode them
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels,
skip_special_tokens=False)
# Remove <PAD> tokens from decoded predictions and labels
decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
# Some simple post-processing
# decoded_preds = [pred.strip() for pred in decoded_preds]
# decoded_labels = [[label.strip()] for label in decoded_labels]
# print(decoded_preds, decoded_labels)
result = metric.compute(predictions=decoded_preds, references=decoded_labels)
return {"bleu": result["score"]}
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 128
# compile
# model = torch.compile(model, backend="inductor", dynamic=True)
# Trainer
args = Seq2SeqTrainingArguments(
f"{save_path}",
# eval_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log",
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,
generation_config=gen_config,
remove_unused_columns=False,
)
trainer = Seq2SeqTrainer(
model,
args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# uncomment to load training from checkpoint
# checkpoint_path = 'default_40_1/checkpoint-5600'
# trainer.train(resume_from_checkpoint=checkpoint_path)
trainer.train()
# execute training
for fold in [1,2,3,4,5]:
print(fold)
train(fold)

View File

@ -0,0 +1,228 @@
# %%
# from datasets import load_from_disk
import os
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from custom_t5.modeling_t5 import T5EncoderForSequenceClassification
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
Trainer,
EarlyStoppingCallback,
TrainingArguments,
T5Config,
)
import evaluate
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
# we need to create the mdm_list
# import the full mdm-only file
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(list((set(full_df['pattern']))))
# # rather than use pattern, we use the real thing and property
# thing_property = full_df['thing'] + full_df['property']
# thing_property = thing_property.to_list()
# mdm_list = sorted(list(set(thing_property)))
# %%
id2label = {}
label2id = {}
for idx, val in enumerate(mdm_list):
id2label[idx] = val
label2id[val] = idx
# %%
# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df, mdm_list):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
# pattern = f"{row['thing'] + row['property']}"
pattern = f"{row['thing_pattern'] + ' ' + row['property_pattern']}"
try:
index = mdm_list.index(pattern)
except ValueError:
print("Error: value not found in MDM list")
index = -1
element = {
'text' : f"{desc}{unit}",
'label': index,
}
output_list.append(element)
return output_list
def create_split_dataset(fold, mdm_list):
# train
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
# valid
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
validation_df = pd.read_csv(data_path, skipinitialspace=True)
combined_data = DatasetDict({
'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)),
'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)),
})
return combined_data
# %%
# function to perform training for a given fold
def train(fold):
save_path = f'checkpoint_fold_{fold}a'
split_datasets = create_split_dataset(fold, mdm_list)
# prepare tokenizer
# model_checkpoint = "distilbert/distilbert-base-uncased"
# model_checkpoint = 'google-bert/bert-base-cased'
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
# Add the additional special tokens to the tokenizer
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
max_length = 120
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
truncation=True,
padding="max_length"
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
# %% temp
# tokenized_datasets['train'].rename_columns()
# %%
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# %%
# compute metrics
metric = evaluate.load("accuracy")
def compute_metrics(eval_preds):
preds, labels = eval_preds
preds = np.argmax(preds, axis=1)
return metric.compute(predictions=preds, references=labels)
# %%
# create id2label and label2id
# %%
# model = AutoModelForSequenceClassification.from_pretrained(
# model_checkpoint,
# num_labels=len(mdm_list),
# id2label=id2label,
# label2id=label2id)
model = T5EncoderForSequenceClassification(
checkpoint=model_checkpoint,
tokenizer=tokenizer,
config=T5Config.from_pretrained(model_checkpoint),
num_labels=len(mdm_list)
)
# important! after extending tokens vocab
# model.t5.resize_token_embeddings(len(tokenizer))
# model = torch.compile(model, backend="inductor", dynamic=True)
# %%
# Trainer
training_args = TrainingArguments(
output_dir=f"{save_path}",
# eval_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log",
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
per_device_train_batch_size=128,
per_device_eval_batch_size=128,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=80,
bf16=True,
push_to_hub=False,
remove_unused_columns=False,
)
trainer = Trainer(
model,
training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# uncomment to load training from checkpoint
# checkpoint_path = 'default_40_1/checkpoint-5600'
# trainer.train(resume_from_checkpoint=checkpoint_path)
trainer.train()
# execute training
for fold in [1,2,3,4,5]:
print(fold)
train(fold)
# %%