# T5 training for combined, concatenated outputs (thing + property)

Refer to `t5_train_tp.py` and `guide_for_tp.md` for a faster training workflow.

In [1]:
# import data and load dataset
from datasets import load_from_disk
import json
from transformers import AutoTokenizer

# --- Run settings -----------------------------------------------------------
model_name = "t5-base"
train_epochs = 80

# mode.json holds the settings for this run: load it, record the model name and
# epoch count chosen above, then write the updated dictionary back to the file.
with open("mode.json", "r") as json_file:
    mode_dict = json.load(json_file)

mode_dict.update({"model": model_name, "train_epochs": train_epochs})

with open("mode.json", "w") as json_file:
    json.dump(mode_dict, json_file)

# Values read from mode.json. fold_group is None when the key is absent;
# mode falls back to the placeholder string 'default_value'.
fold_group = mode_dict.get("fold_group")
mode = mode_dict.get("mode", "default_value")

print(f"The mode has been set to: {mode}")

# --- Dataset ----------------------------------------------------------------
# The saved combined dataset lives under combined_data/<mode>/<fold_group>
file_path = f'combined_data/{mode}/{fold_group}'
split_datasets = load_from_disk(file_path)


    
# Tokenizer that belongs to the checkpoint selected by model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Marker tokens registered as additional special tokens, written as START/END pairs.
# (An earlier variant of this list contained only the THING and PROPERTY pairs.)
additional_special_tokens = [
    "<THING_START>", "<THING_END>",
    "<PROPERTY_START>", "<PROPERTY_END>",
    "<TN_START>", "<TN_END>",
    "<TD_START>", "<TD_END>",
    "<MIN_START>", "<MIN_END>",
    "<MAX_START>", "<MAX_END>",
    "<UNIT_START>", "<UNIT_END>",
]
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

# Maximum token length; preprocess_function passes this to the tokenizer with truncation enabled
max_length = 64

def preprocess_function(examples):
    """Tokenize one batch of records for sequence-to-sequence training.

    Args:
        examples: batch mapping whose 'translation' entry is a sequence of
            records, each providing an "input" text and a "thing_property" text.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with the
        "input" texts as sources and the "thing_property" texts as
        ``text_target``, truncated to ``max_length``.
    """
    records = examples['translation']
    source_texts = [record["input"] for record in records]
    target_texts = [record["thing_property"] for record in records]
    # Passing text_target makes the tokenizer produce the labels alongside the
    # inputs, so no separate 'labels' step is needed.
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )

# Apply preprocess_function to every split of the DatasetDict, one batch at a time.
# The original columns (names taken from the "train" split) are removed so that
# only the tokenizer outputs remain.
tokenized_datasets = split_datasets.map(
    function=preprocess_function,
    remove_columns=split_datasets["train"].column_names,
    batched=True,
)

The mode has been set to: tn_td_unit


Map:   0%|          | 0/6125 [00:00<?, ? examples/s]

Map:   0%|          | 0/14719 [00:00<?, ? examples/s]

Map:   0%|          | 0/2042 [00:00<?, ? examples/s]

In [2]:
import torch
import os
import json

# Load the pre-trained seq2seq checkpoint named by model_name
from transformers import AutoModelForSeq2SeqLM
model_checkpoint = model_name
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# The tokenizer was extended with additional special tokens, so every token id it
# can emit must have a row in the model's embedding matrix. Grow the matrix when it
# is too small; never shrink it, so a checkpoint whose embedding matrix is already
# larger than the tokenizer vocabulary keeps its original shape.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Data collator: batches the tokenized examples for the seq2seq model
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Evaluation metric used by compute_metrics
import evaluate
metric = evaluate.load("sacrebleu")
import numpy as np


def compute_metrics(eval_preds):
    """Score predicted sequences against reference labels with the loaded metric.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            itself be a tuple, in which case only its first element is used.

    Returns:
        dict with a single key ``"bleu"`` holding the metric's ``"score"`` value.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is an ignore/padding sentinel, not a vocabulary id, so it cannot be decoded.
    # Labels carry it at ignored positions; predictions can carry it as well when
    # generated batches are padded to a common length. Map both to the pad token id.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # NOTE(review): the marker tokens added to the tokenizer are registered as special
    # tokens, so skip_special_tokens=True presumably strips them before scoring —
    # confirm that scoring without the markers is intended.
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: the metric is given one reference list per prediction
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

from transformers import Seq2SeqTrainingArguments

# Environment variables that disable GPU p2p mode, for multi-GPU training on
# machines without p2p support. Not required for single-GPU training.
import os
os.environ.update({'NCCL_P2P_DISABLE': '1', 'NCCL_IB_DISABLE': '1'})

args = Seq2SeqTrainingArguments(
    # output directory name encodes fold group, model, mode and epoch count
    output_dir=f"train_{fold_group}_{model_name}_{mode}_{train_epochs}",
    # schedule and optimisation
    num_train_epochs=train_epochs,
    learning_rate=2e-5,
    weight_decay=0.01,
    bf16=True,
    # batch sizes
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    auto_find_batch_size=True,
    # no evaluation during training; a checkpoint is saved each epoch, keeping one
    evaluation_strategy="no",
    predict_with_generate=True,
    save_strategy="epoch",
    save_total_limit=1,
    # logging_dir / logging_strategy are not set here (previously tried:
    # logging_dir="tensorboard-log", logging_strategy="epoch")
    # distributed and hub settings
    ddp_find_unused_parameters=False,
    push_to_hub=False,
)

from transformers import Seq2SeqTrainer

# Assemble the trainer from the model, training arguments, tokenized splits,
# collator, tokenizer and metric function defined earlier in the notebook
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run training; the call is the cell's last expression, so its return value is displayed
trainer.train()



Step,Training Loss
500,2.8123
1000,0.6993
1500,0.4409
2000,0.3321
2500,0.2765
3000,0.2459
3500,0.2293




TrainOutput(global_step=3840, training_loss=0.6754856963952383, metrics={'train_runtime': 2559.4201, 'train_samples_per_second': 191.45, 'train_steps_per_second': 1.5, 'total_flos': 3.156037495934976e+16, 'train_loss': 0.6754856963952383, 'epoch': 80.0})