{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# t5 training for combined concatenated outputs (thing + property) \n",
|
|
"\n",
|
|
"refer to `t5_train_tp.py` and `guide_for_tp.md` for faster training workflow"
|
|
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "90f850a9e8324109808e45e40f0eea47",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/6260 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "34e221d3425d414a9fb749a3ee28ad81",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/12969 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7c5504c54cba4520aa34d5a6a078a31d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/2087 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
" warnings.warn('Was asked to gather along dimension 0, but all '\n"
]
},
{
"data": {
"text/html": [
|
|
"\n",
|
|
" <div>\n",
|
|
" \n",
|
|
" <progress value='1800' max='3920' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
|
" [1800/3920 13:48 < 16:16, 2.17 it/s, Epoch 36/80]\n",
|
|
" </div>\n",
|
|
" <table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: left;\">\n",
|
|
" <th>Step</th>\n",
|
|
" <th>Training Loss</th>\n",
|
|
" <th>Validation Loss</th>\n",
|
|
" <th>Bleu</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <td>200</td>\n",
|
|
" <td>2.654300</td>\n",
|
|
" <td>0.112380</td>\n",
|
|
" <td>26.397731</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>400</td>\n",
|
|
" <td>0.106600</td>\n",
|
|
" <td>0.035335</td>\n",
|
|
" <td>87.137364</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>600</td>\n",
|
|
" <td>0.044600</td>\n",
|
|
" <td>0.022964</td>\n",
|
|
" <td>89.884682</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>800</td>\n",
|
|
" <td>0.026300</td>\n",
|
|
" <td>0.018220</td>\n",
|
|
" <td>86.274312</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>1000</td>\n",
|
|
" <td>0.017300</td>\n",
|
|
" <td>0.016252</td>\n",
|
|
" <td>86.389477</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>1200</td>\n",
|
|
" <td>0.012400</td>\n",
|
|
" <td>0.015651</td>\n",
|
|
" <td>94.416285</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>1400</td>\n",
|
|
" <td>0.011500</td>\n",
|
|
" <td>0.014833</td>\n",
|
|
" <td>91.596509</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>1600</td>\n",
|
|
" <td>0.008800</td>\n",
|
|
" <td>0.015168</td>\n",
|
|
" <td>91.629519</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.006900</td>\n",
|
|
" <td>0.015042</td>\n",
|
|
" <td>95.375351</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table><p>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/transformers/generation/utils.py:1141: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.\n",
" warnings.warn(\n",
"Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.\n",
"Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}\n",
"/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
" warnings.warn('Was asked to gather along dimension 0, but all '\n",
"There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
|
|
"from datasets import load_from_disk\n",
|
|
"import json\n",
|
|
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback\n",
|
|
"import evaluate\n",
|
|
"import numpy as np\n",
|
|
"import os\n",
|
|
"\n",
|
|
"model_name = \"facebook/bart-base\"\n",
|
|
"train_epochs = 80\n",
|
|
"\n",
|
|
"# Load mode configuration\n",
|
|
"with open(\"mode.json\", \"r\") as json_file:\n",
|
|
" mode_dict = json.load(json_file)\n",
|
|
"\n",
|
|
"mode_dict.update({\"model\": model_name, \"train_epochs\": train_epochs})\n",
|
|
"fold_group = mode_dict.get(\"fold_group\")\n",
|
|
"\n",
|
|
"with open(\"mode.json\", \"w\") as json_file:\n",
|
|
" json.dump(mode_dict, json_file)\n",
|
|
"\n",
|
|
"mode = mode_dict.get(\"mode\", \"default_value\")\n",
|
|
"file_path = f'combined_data/{mode}/{fold_group}'\n",
|
|
"split_datasets = load_from_disk(file_path)\n",
|
|
"\n",
|
|
"# Load tokenizer and add special tokens\n",
|
|
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"additional_special_tokens = [\n",
"    \"<THING_START>\", \"<THING_END>\", \"<PROPERTY_START>\", \"<PROPERTY_END>\",\n",
"    \"<TN_START>\", \"<TN_END>\", \"<TD_START>\", \"<TD_END>\",\n",
"    \"<MIN_START>\", \"<MIN_END>\", \"<MAX_START>\", \"<MAX_END>\",\n",
"    \"<UNIT_START>\", \"<UNIT_END>\"\n",
"]\n",
"tokenizer.add_special_tokens({\"additional_special_tokens\": additional_special_tokens})\n",
"\n",
"# Preprocess function for tokenization\n",
"def preprocess_function(examples):\n",
"    inputs = [ex[\"input\"] for ex in examples['translation']]\n",
"    targets = [ex[\"thing_property\"] for ex in examples['translation']]\n",
"    return tokenizer(inputs, text_target=targets, max_length=64, truncation=True)\n",
"\n",
"tokenized_datasets = split_datasets.map(\n",
"    preprocess_function, batched=True, remove_columns=split_datasets[\"train\"].column_names\n",
")\n",
"\n",
"# Load model and resize token embeddings\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
"model.resize_token_embeddings(len(tokenizer))\n",
"\n",
"# Data collator for padding and batching\n",
"data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
"\n",
"# Load evaluation metric\n",
"metric = evaluate.load(\"sacrebleu\")\n",
"\n",
"# Compute metrics function\n",
"def compute_metrics(eval_preds):\n",
"    preds, labels = eval_preds\n",
"    preds = preds[0] if isinstance(preds, tuple) else preds\n",
"    \n",
"    # Decode predictions and labels\n",
"    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
"    labels = np.where(labels != -100, labels, tokenizer.pad_token_id) # Replace padding tokens\n",
"    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
"    \n",
"    # Post-process decoding\n",
"    decoded_preds = [pred.strip() for pred in decoded_preds]\n",
"    decoded_labels = [[label.strip()] for label in decoded_labels]\n",
"    \n",
"    result = metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
"    return {\"bleu\": result[\"score\"]}\n",
"\n",
"args = Seq2SeqTrainingArguments(\n",
"    f\"train_{fold_group}_{model_name}_{mode}_{train_epochs}\",\n",
"    save_strategy=\"steps\",\n",
"    learning_rate=1e-5,\n",
"    per_device_train_batch_size=32,\n",
"    per_device_eval_batch_size=64,\n",
"    auto_find_batch_size=True,\n",
"    ddp_find_unused_parameters=False,\n",
"    weight_decay=0.01,\n",
"    save_total_limit=1,\n",
"    num_train_epochs=train_epochs,\n",
"    predict_with_generate=True,\n",
"    bf16=True,\n",
"    push_to_hub=False,\n",
"    evaluation_strategy=\"steps\",\n",
"    eval_steps=200,\n",
"    save_steps=200,\n",
"    logging_steps=200,\n",
"    load_best_model_at_end=True,\n",
"    lr_scheduler_type=\"linear\",\n",
"    warmup_steps=100,\n",
")\n",
"\n",
"# Define the EarlyStoppingCallback\n",
"early_stopping_callback = EarlyStoppingCallback(\n",
"    early_stopping_patience=2\n",
")\n",
"\n",
"trainer = Seq2SeqTrainer(\n",
"    model,\n",
"    args,\n",
"    train_dataset=tokenized_datasets[\"train\"],\n",
"    eval_dataset=tokenized_datasets[\"validation\"],\n",
"    data_collator=data_collator,\n",
"    tokenizer=tokenizer,\n",
"    compute_metrics=compute_metrics,\n",
"    callbacks=[early_stopping_callback]\n",
")\n",
"\n",
"trainer.train()\n",
"os._exit(0)\n",
"\n"
]
},
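{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check below: a minimal sketch (not executed in this run) that reloads a saved checkpoint and decodes one input. The checkpoint path and the sample string are placeholders, not values produced above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch: load a fine-tuned seq2seq checkpoint and generate a single prediction.\n",
"# CHECKPOINT_DIR and SAMPLE_INPUT are placeholders; point them at a real checkpoint-XXXX folder\n",
"# written by the trainer above and at an input string in the training data format.\n",
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"\n",
"CHECKPOINT_DIR = \"path/to/checkpoint\"\n",
"SAMPLE_INPUT = \"example input text\"\n",
"\n",
"tok = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)\n",
"mdl = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT_DIR)\n",
"\n",
"batch = tok(SAMPLE_INPUT, return_tensors=\"pt\", max_length=64, truncation=True)\n",
"out = mdl.generate(**batch, max_new_tokens=64, num_beams=4)\n",
"print(tok.decode(out[0], skip_special_tokens=False))\n"
]
}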
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}