193 lines
7.4 KiB
Python
193 lines
7.4 KiB
Python
# %%
|
|
# Prepare dataloader for jax training
|
|
from datasets import Dataset, DatasetDict, Value, Sequence, load_from_disk
|
|
from transformers import FlaxT5ForConditionalGeneration
|
|
from datasets import ClassLabel, Value, Sequence
|
|
from ml_collections import ConfigDict
|
|
import numpy as np
|
|
import jax.numpy as jnp
|
|
import jax
|
|
import math
|
|
from typing import Optional, List, Tuple, Callable, cast
|
|
|
|
|
|
# Machine-specific absolute path of the dataset saved on disk. In this file it
# is only referenced by the commented-out `load_from_disk(file_path)` snippets.
file_path = "/home/richard/Projects/learn_t5/simple_model/combined_data_t5_retrieval"
|
|
# file_path = 'combined_data'
|
|
# split_datasets = load_from_disk(file_path)
|
|
# training_size = len(split_datasets['train'])
|
|
|
|
from transformers import T5TokenizerFast
|
|
|
|
# class takes in a dataset
|
|
class DataPrepare():
    """Tokenize a raw seq2seq dataset for Flax T5 and serve it as NumPy batches.

    On construction the object loads the ``t5-base`` fast tokenizer, registers
    the extra special tokens, looks up the Flax T5 ``shift_tokens_right``
    helper and immediately tokenizes ``raw_dataset`` into
    ``self.train_dataset`` (via ``make_dataset``).

    Args:
        raw_dataset: dataset exposing ``'input'`` and ``'output'`` columns
            (the only columns read by ``preprocess_function``), ``.map`` and
            ``.column_names``.
        config: object exposing the attributes ``max_length``,
            ``pad_token_id`` and ``decoder_start_token_id``.
    """

    def __init__(self, raw_dataset, config):
        self.raw_dataset: Dataset = raw_dataset
        # Filled in by make_dataset() at the end of this constructor.
        self.train_dataset: Optional[Dataset] = None
        self.size: int = len(raw_dataset)
        self.config: ConfigDict = config
        self.tokenizer = T5TokenizerFast.from_pretrained("t5-base", return_tensors="np", clean_up_tokenization_spaces=False)

        # Define additional special tokens.
        # NOTE(review): "SIG", "UNIT" and "DATA_TYPE" are registered without
        # angle brackets, unlike the other markers - confirm this matches the
        # dataset's markup before changing it.
        additional_special_tokens = [
            "<THING_START>", "<THING_END>",
            "<PROPERTY_START>", "<PROPERTY_END>",
            "<NAME>", "<DESC>",
            "SIG", "UNIT", "DATA_TYPE",
        ]
        # Add the additional special tokens to the tokenizer
        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

        # In Flax, for seq2seq models we need to pass `decoder_input_ids`;
        # as the Flax models don't accept `labels`, we prepare the
        # decoder_input_ids ourselves. For that we dynamically import the
        # `shift_tokens_right` function from the model's own module.
        # The module name is read from the class itself, so no model weights
        # have to be loaded just to locate the helper.
        model_module = __import__(
            FlaxT5ForConditionalGeneration.__module__,
            fromlist=["shift_tokens_right"],
        )
        self.shift_tokens_right_fn = model_module.shift_tokens_right

        self.make_dataset()

    def preprocess_function(self, example):
        """Tokenize one batch of examples into model-ready arrays.

        Called through ``Dataset.map(..., batched=True)`` in ``make_dataset``,
        so ``example`` is a batch: a mapping whose ``'input'`` and ``'output'``
        entries are read as the source and target texts.

        Padding is set to ``"max_length"`` because jitted functions need
        fixed-length inputs.

        Returns:
            The tokenizer output for the inputs, extended with ``labels``,
            ``decoder_input_ids`` and ``decoder_attention_mask``.
        """
        # Same tokenization settings for sources and targets.
        tokenize_kwargs = dict(
            max_length=self.config.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="np",
        )

        model_inputs = self.tokenizer(example['input'], **tokenize_kwargs)
        # Targets are tokenized separately (text_target=...) because we also
        # need their attention mask.
        labels = self.tokenizer(text_target=example['output'], **tokenize_kwargs)

        model_inputs['input_ids'] = np.asarray(model_inputs['input_ids'])
        model_inputs['attention_mask'] = np.asarray(model_inputs['attention_mask'])

        # for loss computation
        model_inputs["labels"] = labels["input_ids"]

        # decoder input ids: this is actually the "model output" shifted right
        decoder_input_ids = self.shift_tokens_right_fn(
            labels["input_ids"],
            self.config.pad_token_id,
            self.config.decoder_start_token_id,
        )
        # required by the model
        model_inputs["decoder_input_ids"] = np.asarray(decoder_input_ids)

        # We need decoder_attention_mask so we can ignore pad tokens in loss
        model_inputs["decoder_attention_mask"] = np.asarray(labels["attention_mask"])

        return model_inputs

    def make_dataset(self):
        """Tokenize ``self.raw_dataset`` and store it in ``self.train_dataset``."""
        train_dataset = self.raw_dataset.map(
            self.preprocess_function,
            batched=True,
            num_proc=1,
            # if we do not remove, we keep the original data
            remove_columns=self.raw_dataset.column_names,
        )

        # NOTE: switching the dataset to numpy format and casting the columns
        # to compact uint16 features was tried here previously and is
        # currently disabled.

        # assign the dataset to train_dataset
        self.train_dataset = train_dataset

    def data_loader(self, rng: jax.random.PRNGKey, batch_size: int, shuffle: bool = False, drop_last=True):
        """Yield batches of ``batch_size`` examples from ``self.train_dataset``.

        Args:
            rng: JAX PRNG key; only used when ``shuffle`` is true.
            batch_size: number of examples per batch, must be positive.
            shuffle: if true, visit the examples in a random permutation.
            drop_last: if true, an incomplete final batch is skipped. If
                false, every batch has ``batch_size`` examples except the
                final one, which may hold from 1 to ``batch_size`` examples.

        Yields:
            dict mapping each column name to a NumPy array for that batch.

        Raises:
            RuntimeError: if ``self.train_dataset`` has not been built.
            ValueError: if ``batch_size`` is not positive.
        """
        if self.train_dataset is None:
            raise RuntimeError("train_dataset is not built; call make_dataset() first")
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        dataset: Dataset = self.train_dataset
        num_examples = len(dataset)

        if shuffle:
            order = np.asarray(jax.random.permutation(rng, num_examples))
        else:
            order = np.arange(num_examples)

        if drop_last:
            # Skip incomplete batch.
            usable = (num_examples // batch_size) * batch_size
        else:
            usable = num_examples

        # Slice in fixed strides so only the final batch can be short; this
        # also handles an empty dataset (the loop simply does not run).
        for start in range(0, usable, batch_size):
            idx = order[start:start + batch_size]
            batch = dataset[idx]
            yield {k: np.array(v) for k, v in batch.items()}
|
|
|
|
|
|
# # testing out the class
|
|
# # %%
|
|
# # init object
|
|
# # e.g. Config
|
|
# data_config = ConfigDict(
|
|
# dict(
|
|
# max_length=86,
|
|
# pad_token_id=0,
|
|
# decoder_start_token_id=0
|
|
# )
|
|
# )
|
|
#
|
|
# from datasets import load_from_disk
|
|
# split_datasets = load_from_disk(file_path)
|
|
# dataprep = DataPrepare(split_datasets['train'], data_config)
|
|
#
|
|
# # %%
|
|
# seed = 117
|
|
# rng = jax.random.PRNGKey(seed)
|
|
# train_loader = dataprep.data_loader(rng, batch_size=32)
|
|
#
|
|
#
|
|
#
|
|
# # %%
|
|
# batch = next(iter(train_loader))
|
|
# batch['input_ids'].shape
|
|
# # %%
|
|
#
|
|
# sentence = "<THING_START><THING_END><PROPERTY_START><PROPERTY_END><NAME><DESC><DESC><UNIT>"
|
|
# tokens = dataprep.tokenizer.tokenize(sentence)
|
|
# print("Tokens:", tokens)
|
|
# # Get the IDs (integer indices) of specific tokens
|
|
# token_ids = [dataprep.tokenizer.convert_tokens_to_ids(token) for token in tokens]
|
|
# print("Token IDs:", token_ids)
|
|
#
|
|
#
|
|
# # %%
|