From 3d2266cf65d26221ec6e007bd6ab5b7325d852cb Mon Sep 17 00:00:00 2001
From: hhs0625
Date: Mon, 26 Aug 2024 19:51:11 +0900
Subject: [PATCH] [TASK] init

---
 data_import/1.select_db.py | 59 +++
 data_import/2.make_csv.py | 38 ++
 data_import/plot_count.ipynb | 88 ++++
 .../no_preprocess/copy_raw_data.py | 9 +
 .../1.add_tag_name.ipynb | 133 +++++
 .../2.seperate_number.ipynb | 100 ++++
 .../rule_base_replacement/3.replacement.ipynb | 123 +++++
 data_preprocess/split_data.ipynb | 441 ++++++++++++++++
 evaluation/check_accuracy.ipynb | 97 ++++
 .../tfidf_class/1.make_sdl_class_document.py | 47 ++
 .../tfidf_class/2.classify_by_tfidf.ipynb | 134 +++++
 post_process/tfidf_class/3.refine.ipynb | 144 ++++++
 .../tfidf_class/4.selection_by_tfidf.py | 114 +++++
 translation/t5/1.data_process_concat.ipynb | 198 ++++++++
 translation/t5/2.t5_train.ipynb | 477 ++++++++++++++++++
 .../t5/3.produce_test_predictions.ipynb | 447 ++++++++++++++++
 16 files changed, 2649 insertions(+)
 create mode 100644 data_import/1.select_db.py
 create mode 100644 data_import/2.make_csv.py
 create mode 100644 data_import/plot_count.ipynb
 create mode 100644 data_preprocess/no_preprocess/copy_raw_data.py
 create mode 100644 data_preprocess/rule_base_replacement/1.add_tag_name.ipynb
 create mode 100644 data_preprocess/rule_base_replacement/2.seperate_number.ipynb
 create mode 100644 data_preprocess/rule_base_replacement/3.replacement.ipynb
 create mode 100644 data_preprocess/split_data.ipynb
 create mode 100644 evaluation/check_accuracy.ipynb
 create mode 100644 post_process/tfidf_class/1.make_sdl_class_document.py
 create mode 100644 post_process/tfidf_class/2.classify_by_tfidf.ipynb
 create mode 100644 post_process/tfidf_class/3.refine.ipynb
 create mode 100644 post_process/tfidf_class/4.selection_by_tfidf.py
 create mode 100644 translation/t5/1.data_process_concat.ipynb
 create mode 100644 translation/t5/2.t5_train.ipynb
 create mode 100644 translation/t5/3.produce_test_predictions.ipynb

diff --git a/data_import/1.select_db.py b/data_import/1.select_db.py
new file mode 100644
index 0000000..eefac28
--- /dev/null
+++ b/data_import/1.select_db.py
@@ -0,0 +1,59 @@
+import psycopg2
+import pandas as pd
+
+# Function to read the db connection info
+def read_db_connection_info(filename="db_connection_info.txt"):
+    connection_info = {}
+    try:
+        with open(filename, 'r') as file:
+            for line in file:
+                key, value = line.strip().split('=')
+                connection_info[key] = value
+    except Exception as e:
+        print(f"Failed to read database connection info: {e}")
+        raise
+    return connection_info
+
+# Load the connection info
+connection_info = read_db_connection_info()
+conn = None  # ensure 'conn' is defined for the finally block even if connect fails
+try:
+    # Connect to the database
+    conn = psycopg2.connect(
+        host=connection_info["host"],
+        user=connection_info["user"],
+        password=connection_info["password"],
+        dbname=connection_info["database"],
+        port=connection_info["port"]
+    )
+    # This ensures that resources are cleaned up properly
+    with conn:
+        with conn.cursor() as cursor:
+            # Export data_mapping table
+            query_mapping = """
+            SELECT * FROM data_mapping
+            WHERE ships_idx BETWEEN 1000 AND 1999
+            """
+            cursor.execute(query_mapping)
+            results_mapping = cursor.fetchall()
+            columns_mapping = [desc[0] for desc in cursor.description]
+            df_mapping = pd.DataFrame(results_mapping, columns=columns_mapping)
+            df_mapping.to_csv('data_import/data_mapping.csv', index=False, encoding='utf-8-sig')
+
+            # Export data_model_master table
+            query_master = """
+            SELECT * FROM data_model_master
+            """
+            cursor.execute(query_master)
+            results_master = cursor.fetchall()
+            
columns_master = [desc[0] for desc in cursor.description]
+            df_master = pd.DataFrame(results_master, columns=columns_master)
+            df_master.to_csv('data_import/data_model_master_export.csv', index=False, encoding='utf-8-sig')
+
+    print("Data exported successfully to 'data_import/data_mapping.csv' and 'data_import/data_model_master_export.csv'")
+
+except (Exception, psycopg2.DatabaseError) as error:
+    print(f"An error occurred: {error}")
+finally:
+    if conn is not None:
+        conn.close()
diff --git a/data_import/2.make_csv.py b/data_import/2.make_csv.py
new file mode 100644
index 0000000..1ce8787
--- /dev/null
+++ b/data_import/2.make_csv.py
@@ -0,0 +1,38 @@
+import pandas as pd
+import re
+
+# Load the data_mapping CSV file
+data_mapping_file_path = 'data_import/data_mapping.csv' # Adjust this path to your actual file location
+data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)
+df_master = pd.read_csv('data_import/data_model_master_export.csv')
+
+# Generate patterns
+data_mapping['thing_pattern'] = data_mapping['thing'].str.replace(r'\d', '#', regex=True)
+data_mapping['property_pattern'] = data_mapping['property'].str.replace(r'\d', '#', regex=True)
+data_mapping['pattern'] = data_mapping['thing_pattern'] + " " + data_mapping['property_pattern']
+df_master['master_pattern'] = df_master['thing'] + " " + df_master['property']
+
+# Create a set of unique patterns from master for fast lookup
+master_patterns = set(df_master['master_pattern'])
+
+# Check whether each pattern in data_mapping exists in df_master, and record the result in the "MDM" field
+data_mapping['MDM'] = data_mapping['pattern'].apply(lambda x: x in master_patterns)
+
+# Remove specified fields
+fields_to_remove = ['equip_type_code', 'tx_period', 'tx_type', 'on_change_yn', 'scaling_const', 'description', 'updated_time', 'status_code', 'is_timeout']
+merged_data = data_mapping.drop(columns=fields_to_remove)
+
+# Save the updated DataFrame to a new CSV file
+output_file_path = 'data_import/raw_data.csv'
+merged_data.to_csv(output_file_path, index=False, encoding='utf-8-sig')
+
+print(f"Updated data saved to {output_file_path}")
+
+# Filter the DataFrame where MDM is TRUE
+data_mapping_mdm_true = merged_data[merged_data['MDM']]
+
+# Save the filtered DataFrame to a new CSV file
+mdm_true_output_file_path = 'data_import/data_mapping_mdm.csv'
+data_mapping_mdm_true.to_csv(mdm_true_output_file_path, index=False, encoding='utf-8-sig')
+
+print(f"MDM TRUE data saved to {mdm_true_output_file_path}")
diff --git a/data_import/plot_count.ipynb b/data_import/plot_count.ipynb
new file mode 100644
index 0000000..029be47
--- /dev/null
+++ b/data_import/plot_count.ipynb
@@ -0,0 +1,88 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA28AAAK7CAYAAABlF7dxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABzZ0lEQVR4nO3deXhU9dn/8c9MMtkTlmHHKDEIKFtRxCrQQgUUl4qIj1aRzYXa6k9ksaI+MFiVIrg8jcuDSw0qojXoo0FcYhQ3FEGwKgpoCBGCgAFCEgJhkjm/P2imiZmsM2dOTvJ+XVeu6+Sc73IPc3OSO9+Z7zgMwzAEAAAAAGjWnFYHAAAAAACoH8UbAAAAANgAxRsAAAAA2ADFGwAAAADYAMUbAAAAANgAxRsAAAAA2ADFGwAAAADYAMUbAAAAANhApNUB2J3P59Pu3buVmJgoh8NhdTgAAAAALGIYhoqLi9WtWzc5naFfJ6N4C9Lu3buVnJxsdRgAAAAAmomdO3fqhBNOCPm4FG9BSkxMlCTl5uaqffv2FkeDlsTr9eqdd97RmDFj5HK5rA4HLQz5BbOQWzAT+QWzhCq3ioqKlJyc7K8RQo3iLUiVL5VMTExUUlKSxdGgJfF6vYqLi1NSUhI/oBBy5BfMQm7BTOQXzBLq3DLr7VQUbyEyZUKO5CsI65yZaSvCOl+LMMBjdQQAAABAk7DbJAAAAADYAMUbAAAAANgAxRsAAAAA2ADFGwAAAADYAMUbAAAAANiArXabTE9P19SpU+ttl5WVpVGjRgW8lpOTo/vvv1/vvPOOfvrpJyUmJur000/XDTfcoMsuu6zpsWWkyu12N7l/03jCPB8AAAAAq9iqeKvkdDrVsWPHWq9HR0cHPL969WpdfvnlKi0tlSQlJSXpwIEDeuedd/TOO+9o6tSpevrpp037XAYAAAAAaCpbvmwyOTlZe/bsqfVr+PDhNfrk5ubqv/7rv1RaWqqhQ4dq69atOnTokA4dOqR58+ZJkp555hktXrw43A8HAAAAAOply+KtKebNm6fDhw+rS5cuWrVqlXr16iVJSkhI0IIFC3TDDTdIku69914dPHjQylABAAAAoIZWUbwdPnxYK1eulCTdeOONatu2bY02c+fOlSQVFRXp//7v/8IYHQAAAADUr1UUbx9//LGOHDkiSRo7dmzANj169NCpp54qSXrnnXfCFhsAAAAANIQtNyz5+eefdcYZZ2jr1q2qqKhQ165ddc455+i6667TiBEjarT/5ptv/Mf9+vWrddx+/frpu+++0+bNmxsd05QJOZKvoNH9GiIzbYUp49raAI/VEQAAAABhZcvirbS0VBs3blS7du10+PBh5ebmKjc3V8uXL9fUqVP1xBNPKDLyPw9t9+7dkqR27dopNja21nG7d+9erX0gZWVlKisr839fVFQkSXJFVkiqCOZh1crraxULpI3j9Vodgem8/36M3lbwWBF+5BfMQm7BTOQXzBKq3DI7N21VvHXr1k3z58/X+PHj1bt3b0VHR6uiokLr1q3T/Pnz9e677+qZZ55RfHy80tLS/P2Ki4slSXFxcXWOX3m9sn0gCxcu1IIFC2qcv3zajnrHb6rVeQNNGdfW8lZbHUHYZGVlWR0CWjDyC2Yht2Am8gtmCTa3Kj+SzCwOwzAMU2cIE5/Pp/Hjx+u1116T0+nUli1bdMopp0iSbrjhBj355JPq3r27du3aVesYd955p+677z5FRUVVW12rKtDKW3Jysi4dtVZSm5A+pkovLc4wZVxb6zvX6ghM5/V6lZWVpdGjR8vlclkdDloY8gtmIbdgJvILZglVbhUVFalDhw46dOiQkpKSQhjhcbZaeauL0+nUkiVL9Nprr8nn8ykzM1MzZ86UJCUmJkqqvxKuvF7ZPpDo6OiAHwLuLY+QfBFNDb9OLqfPlHFtrRXdsF0uFz+gYBryC2Yht2Am8gtmCTa3zM7LFlO8SVLPnj3VoUMHFRQUaPv27f7z3bp1kyQdPHhQR44cqfV9b/n5+dXaN0Z6RqrcbncTom4Ij0njAgAAALCLVrETRtUdJqvuPPlLldf69u1rekwAAAAA0BgtqnjLyclRQcHx7fpTUlL854cNG+ZfbXvrrbcC9s3Ly9N3330nSRozZozJkQIAAABA49imeKtvXxXDMDRnzhxJx9//dtFFF/mvxcfH67LLLpMkPf744zp06FCN/osWLZJ0/P1u48aNC1HUAAAAABAatine8vLyNGTIEC1dulTbt2/3F3M+n0+fffaZxo4dq1dffVWSNH36dPXu3bta/7vvvlvx8fH66aefdPHFF+v777+XJB0+fFh33323/vd//1eSdNddd6ldu3ZhfGQAAAAAUD9bbViyfv16rV+/XtLxXR8TExNVXFxcbev+qVOn6u9//3uNvikpKfrnP/+pyy+/XB999JF69eqlNm3aqKSkRBUVFf6+lat3AAAAANCc2KZ469y5s9LS0vTpp5/qyy+/1M8//6yDBw8qJiZGKSkpOuecczRt2jQNHTq01jEuuOACffXVV1q0aJGysrL0008/qV27dho0aJCmT5/uf2llU0yZkCP5Cprcv7Ey01aEbS5YxOeUNFDavFDi4yIQauQXzEJuwUzkF35pgMfqCMLKNsVbbGysbrrpJt10001BjZOamqonnngiRFEBAAAAQHjY5j1vAAAAANCaUbwBAAAAgA1QvAEAAACADVC8AQAAAIAN2GbDkuYuPSNVbrc7jDN6wjgXLOH1Snmrpb5zJZfL6mjQ0pBfMAu5BTORX2jlWHkDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG2C3yRCZMiFH8hVYMndm2gpL5oXJfE5JA6XNCyWnz+poUJcBHqsjAAAArQArbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgAxRvAAAAAGAD7DYZIukZqXK73RbN7rFoXpjK65XyVkt950oul9XRAAAAwGKsvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADVC8AQAAAIANsNtkiEyZkCP5CqwOo5rMtBVWhyAN8FgdAQAAANAisPIGAAAAADZA8QYAAAAANkDxBgAAAAA2QPEGAAAAADZA8QYAAAAANsBukyGSnpEqt9ttdRi/4LE6AAAAAAAhwsobAAAAANgAxRsAAAAA2ADFGwAAAADYAMUbAAAAANgAxRsAAAAA2AC7TYbIlAk5kq8gbPNlpq0I21ymG+CxOgIAAACg2WPlDQAAAABsgOINAAAAAGyA4g0AAAAAbIDiDQAAAABsgOINAAAAAGzA9rtN/u1vf9PcuXP93xuGUaNNenq6pk6dWu9YWVlZGjVqVJPiSM9IldvtblLfpvGEcS4AAAAAVrN18bZ161YtWLCgwe2dTqc6duxY6/Xo6OhQhAUAAAAAIWfb4s3n82natGk6evSozj77bH366af19klOTtaOHTvMDw4AAAAAQsy273lLS0vT2rVrdfXVV2vMmDFWhwMAAAAAprJl8Zabm6s777xTbrdbDz30kN
XhAAAAAIDpbPmyyeuvv16HDx/WY489Vud72AAAAACgpbBd8fbkk08qOztbo0aN0qRJkxrV9+eff9YZZ5yhrVu3qqKiQl27dtU555yj6667TiNGjAgqrikTciRfQVBjNEVm2oqwz4kw8TklDZQ2L5ScPqujabkGeKyOAAAAoEFs9bLJ/Px8zZkzR7GxsVq6dGmj+5eWlmrjxo2KioqSz+dTbm6uli9frpEjR2ratGkqLy83IWoAAAAACJ6tVt6mT5+uQ4cOadGiRTr55JMb3K9bt26aP3++xo8fr969eys6OloVFRVat26d5s+fr3fffVfPPPOM4uPjlZaWVudYZWVlKisr839fVFQkSXJFVkiqaNLjCobXZ6v6G41Q+dzyHJvM67U6Akt4//24va308cM85BbMRH7BLKHKLbNz02EE+lTrZuj555/XNddco1/96ldav369IiP/U3d6PB7/57019uH4fD6NHz9er732mpxOp7Zs2aJTTjml1vZV56rqhRdeUFxcXKPmBgAAANBylJaW6qqrrtKhQ4eUlJQU8vFtUbzt3btXffv2VWFhoT777DMNHjy42vVgijdJ+uGHH/wF2wMPPKCZM2fW2jbQyltycrIuHbVWUptGzx2slxZnhH1OhIfX51TWzv4anfy1XLznzTx951odgSW8Xq+ysrI0evRouVwuq8NBC0JuwUzkF8wSqtwqKipShw4dTCvebPGyydtvv1379+/XjTfeqD59+qikpKTa9WPHjvmPK69FRUUpKiqqQeP37NlTHTp0UEFBgbZv315n2+joaEVHR9c47y2PkHwRDZovlPilvuVzOX08z2Zq5T/8XS4XvwDBFOQWzER+wSzB5pbZeWmL4i03N1eS9Pjjj+vxxx+vs21iYqIk6ZZbbtHDDz9sdmh+6RmpcrvdYZvvPzwWzImw8HqlvNXHV4b4AQUAANDqsROCpJycHBUUHN/mPyUlxeJoAAAAAKAmWxRva9askWEYtX7Nnz/f37byXOWqW33vgTMMQ3PmzJEkOZ1OXXTRRaY9DgAAAABoKlsUb8HIy8vTkCFDtHTpUm3fvt1fzPl8Pn322WcaO3asXn31VUnHP4qgd+/eVoYLAAAAAAHZ4j1vwVq/fr3Wr18v6fiGI4mJiSouLq62a+TUqVP197//3aoQAQAAAKBOLb5469y5s9LS0vTpp5/qyy+/1M8//6yDBw8qJiZGKSkpOuecczRt2jQNHTrU6lABAAAAoFYtonjzeDzyeDwBr8XGxuqmm27STTfdZGoMUybkSL4CU+eoS2baCsvmhkl8TkkDpc0LJT4qAKHWmPwa4AlHRAAAoB4t/j1vAAAAANASULwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA20iN0mm4P0jFS53W4LI/BYODdM4fVKeaulvnMll8vqaNDSkF8AANgOK28AAAAAYAMUbwAAAABgAxRvAAAAAGADFG8AAAAAYANsWBIiUybkSL4CU+fITFth6vhoZnxOSQOlzQslp8/qaNDSkF9oqgEeqyMAgFaLlTcAAAAAsAGKNwAAAACwAYo3AAAAALABijcAAAAAsAGKNwAAAACwAXabDJH0jFS53W6TZ/GYPD6aFa9Xylst9Z0ruVxWR4OWhvwCAMB2WHkDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG2C3yRCZMiFH8hWEbb7MtBVhmwsW8TklDZQ2L5ScPquj+Y8BHqsjAAAAaJVYeQMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAbYLfJEEnPSJXb7Q7jjJ4wzgVLeL1S3mqp71zJ5bI6GgAAAFiMlTcAAAAAsAGKNwAAAACwAYo3AAAAALABijcAAAAAsAGKNwAAAACwAXabDJEpE3IkX4EpY2emrWhYwwEeU+YHAAAAYD1W3gAAAADABijeAAAAAMAGKN4AAAAAwAYo3gAAAADABijeAAAAAMAG2G0yRNIzUuV2u00a3WPSuAAAAADswvYrb3/729/kcDj8X3UpLi6Wx+NR//79lZCQoDZt2ujMM8/UAw88oGPHjoUpYgAAAABoPFuvvG3dulULFixoUNu8vDyNGDFCO3bskCTFxcWprKxMGzZs0IYNG7R8+XJlZ2erXbt2JkYMAAAAAE1j25U3n8+nadOm6ejRozr77LPrbFteXq6LL75YO3bsUNeuXZWVlaXDhw+rtLRUL774ohITE7Vp0yZNnDgxTNEDAAAAQOPYtnhLS0vT2rVrdfXVV2vMmDF1tl22bJm+/vprSdLKlSs1atQoSZLT6dQVV1yhpUuXSpJWr16t7OxscwMHAAAAgCawZfGWm5urO++8U263Ww899FC97ZctWyZJGjlyZMBVuiuvvFIpKSmSpGeffTa0wQIAAABACNjyPW/XX3+9Dh8+rMcee0wdO3ass21paak++eQTSdLYsWMDtnE4HDr//PP1+OOP65133mlSTFMm5Ei+gib1DSQzbUXIxmqQAZ7wzgcAAACgUWy38vbkk08qOztbo0aN0qRJk+pt/91338nn80mS+vXrV2u7ymt79uzRgQMHQhMsAAAAAISIrVbe8vPzNWfOHMXGxvrfp1af3bt3+4+7d+9ea7uq13bv3q327dsHbFdWVqaysjL/90VFRZIkV2SFpIoGxdQQXl+Y62qvN7zzoV7efz8nXp4bmID8glnILZiJ/IJZQpVbZuemrYq36dOn69ChQ1q0aJFOPvnkBvUpLi72H8fFxdXaruq1qn1+aeHChQE/nuDyaTvqHL+xVucNDNlYDZK3OrzzocGysrKsDgEtGPkFs5BbMBP5BbMEm1ulpaUhiiQw2xRvzz//vN544w396le/0syZMy2LY+7cudXmLyoqUnJysl7+Rw9JbUI2z0uLM0I2VoP0nRve+VAvr9errKwsjR49Wi6Xy+pw0MKQXzALuQUzkV8wS6hyq/JVeWaxRfG2d+9ezZgxQxEREXryyScVGdnwsBMTE/3HdVXCVa9V7fNL0dHRio6OrnHeWx4h+SIaHFd9XE5fyMZq2ITcAJsrl8vFDyiYhvyCWcgtmIn8glmCzS2z89IWxdvtt9+u/fv368Ybb1SfPn1UUlJS7fqxY8f8x5XXoqKiFBUVpW7duvmv5efna8CAAQHnyM/P9x9X7dNQ6Rmpcrvdje5XO08IxwIAAABgd7bYbTI3N1eS9PjjjysxMbHG18KFC/1tK8/ddtttkqRTTz1VTufxh/nNN9/UOkfltS5dutS6WQkAAAAAWMUWxVsw4uLiNHToUEnSW2+9FbCNYRh6++23JUljxowJW2wAAAAA0FC2KN7WrFkjwzBq/Zo/f76/beW5hx9+2H9u8uTJkqT3339f69atqzH+yy+/rO3bt0tSgz47DgAAAADCzRbFW7AmT56s/v37yzAMXXbZZcrOzpYk+Xw+vfzyy7r++uslSWPHjtW5555rZagAAAAAEJAtNiwJVmRkpF5//XWNHDlSO3bs0KhRoxQXFyefz6ejR49KkgYNGqTly
5dbHCkAAAAABNYqijdJ6tGjh7766istWbJEr7zyinJzc+VyudS3b1/94Q9/0M0336yoqKgmjz9lQo7kKwhhxPXLTFsR1vkQZj6npIHS5oVSuD86Ai0f+VXTAI/VEQAAUKcW8bJJj8fjf69bXRITE7VgwQJ9/fXXKikpUVFRkTZs2KBZs2YFVbgBAAAAgNlaRPEGAAAAAC0dxRsAAAAA2ADFGwAAAADYAMUbAAAAANhAq9lt0mzpGalyu91hntUT5vkQVl6vlLda6jtXcrmsjgYtDfkFAIDtsPIGAAAAADZA8QYAAAAANkDxBgAAAAA2QPEGAAAAADZA8QYAAAAANsBukyEyZUKO5CsI+biZaStCPmZIDPBYHQEAAADQqrDyBgAAAAA2QPEGAAAAADZA8QYAAAAANkDxBgAAAAA2QPEGAAAAADbAbpMhkp6RKrfbbcLIHhPGBAAAAGA3rLwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADbDbZIhMmZAj+QpMnSMzbUX9jQZ4TI0BAAAAgDVYeQMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG2DDkhBJz0iV2+02eRaPyeMDAAAAaK5YeQMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAbYLfJEJkyIUfyFYRlrsy0FcENMMATkjgAAAAAhA8rbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgAxRvAAAAAGAD7DYZIukZqXK73WGazROmeQAAAAA0F7Zaedu4caMWLFig3//+9+rTp4/cbrdcLpfcbreGDh2qe++9VwcOHAjY1+PxyOFw1Pv1ww8/hPlRAQAAAED9bLXy9o9//EOPPvqo//uYmBjFxsbqwIEDWrt2rdauXauHH35Yr7/+us4+++yAY7hcLrVv377WOSIjbfVPAgAAAKCVsFWlMmTIEPXo0UPDhg1Tnz591LZtW0lSSUmJXnnlFc2ePVs///yzxo0bp23btqlNmzY1xjjnnHO0Zs2a8AYOAAAAAEGyVfE2adKkgOcTEhI0adIkdenSReedd5727dunVatW6eqrrw5zhAAAAABgDlu9560+v/71r/3Hu3btsjASAAAAAAgtW6281eejjz7yH6empoZ17ikTciRfgalzZKatCH6QAZ7gxwAAAAAQdrZfeSsrK9OOHTv0yCOP6JprrpEk9ezZUxdffHHA9ps3b1a/fv0UFxenhIQE9e7dW9dff702bdoUzrABAAAAoFFsu/IWExOjsrKyGueHDh2qF154QdHR0QH7FRQU6MCBA2rbtq2Kioq0bds2bdu2TU8//bTuuOMO3XPPPXXOW1ZWVm3eoqIiSZIrskJSRdMfUAN4fSGotb3e4MdAWHj//Vx5ec5gAvILZiG3YCbyC2YJVW6ZnZsOwzAMU2cwSY8ePXT06FGVlJTo8OHDkqSRI0fq/vvv1+DBg2u0X758uXbv3q1LLrlEKSkpcrlcOnbsmNasWaM77rhDX3zxhSRpyZIlmjVrVq3zejweLViwoMb5F154QXFxcSF6dAAAAADsprS0VFdddZUOHTqkpKSkkI9v2+Ktqn379um5557Tvffeq8LCQt111126++67G9z/6NGj+s1vfqP169crISFBu3btCvgxA1Lglbfk5GRdOmqtpMB9QuWlxRnBD9J3bvBjICy8Xq+ysrI0evRouVwuq8NBC0N+wSzkFsxEfsEsocqtoqIidejQwbTizbYvm6yqU6dOmjVrloYPH66zzz5bf/3rXzVkyBBddNFFDeofExOj++67T6NHj1ZJSYmys7M1fvz4gG2jo6MDviTTWx4h+SKCehz1cTl9IRiEG53duFwufkDBNOQXzEJuwUzkF8wSbG6ZnZctonirNGTIEA0bNkwffvihnnjiiQYXb5J09tln+4+3b9/e6LnTM1Lldrsb3a9xPCaPDwAAAKC5sv1uk7/UvXt3SdIPP/xgcSQAAAAAEDotrnirXDVLTExsVL/PPvvMf5ySkhLSmAAAAAAgWLYp3ioqKlTf3irZ2dn6/PPPJUkjRozwn6+vX1lZme68805JUnx8vM4999zgggUAAACAELNN8bZz504NGjRIS5cu1fbt26sVZDt37tTf/vY3XXLJJTIMQ+3bt9ett97qv/7hhx9q1KhReu6557Rr1y7/ea/Xq+zsbA0fPlzr1q2TJM2bN09t27YN2+MCAAAAgIaw1YYl//rXv/THP/5RkhQVFaWkpCQdOXLE/zlv0vGXPK5cuVJdunTxnzMMQ9nZ2crOzpYkxcbGKj4+XocOHfJ/kJ7T6dTtt9+u2267LYyPCAAAAAAaxjbFW7du3fTyyy9rzZo1WrdunXbv3q2CggJFREToxBNP1MCBA3XJJZfoqquuUmxsbLW+/fv315IlS/Tpp5/q66+/VkFBgQoLCxUXF6fTTjtNw4cP1w033KD+/fs3Ob4pE3IkX0GwD7NOmWkrGtZwgMfUOAAAAACEn22Kt6ioKE2YMEETJkxodF+3261Zs2aZEBUAAAAAhIdt3vMGAAAAAK0ZxRsAAAAA2ADFGwAAAADYAMUbAAAAANiAbTYsae7SM1LldrtNnsVj8vgAAAAAmitW3gAAAADABijeAAAAAMAGKN4AAAAAwAYo3gAAAADABijeAAAAAMAG2G0yRKZMyJF8BWGZKzNtRVjmgcV8TkkDpc0LJafP6mhCZ4DH6ggAAABsiZU3AAAAALABijcAAAAAsAGKNwAAAACwAYo3AAAAALABijcAAAAAsAF2mwyR9IxUud3uMM3mCdM8sJTXK+WtlvrOlVwuq6MBAACAxVh5AwAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABtgt8kQmTIhR/IVmDpHZtoKU8dHM+NzShoobV4oOX1WR4OWxsr8GuAJ73wAALQQrLwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADbDbZIikZ6TK7XabPIvH5PHRrHi9Ut5qqe9cyeWyOhq0NOQXAAC2w8obAAAAANgAxRsAAAAA2ADFGwAAAADYAMUbAAAAANgAxRsAAAAA2AC7TYbIlAk5kq8g7PNmpq0Iz0QDPOGZBwAAAEBArLwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADdhqt8mNGzcqMzNTX3zxhbZt26aff/5ZRUVFSkpKUp8+fXTBBRfoxhtvVPv27WsdY+/evbr//vu1atUq/fjjj4qNjVXfvn01efJkXXvttXI4HE2KLT0jVW63u6kPLQgeC+YEAAAAEG62Kt7+8Y9/6NFHH/V/HxMTo9jYWB04cEBr167V2rVr9fDDD+v111/X2WefXaP/F198ofPOO0/79++XJCUkJKi4uFgff/yxPv74Y2VkZOj1119XVFRU2B4TAAAAADSErV42OWTIEC1evFiffvqpDh48qCNHjqioqEjFxcVatmyZOnbsqIKCAo0bN06HDh2q1vfQoUO66KKL
tH//fvXp00fr169XcXGxDh8+rEceeUQul0tvv/22ZsyYYc2DAwAAAIA62Kp4mzRpkmbPnq1f//rXatu2rf98QkKCJk2apOeff16StG/fPq1atapa3yVLlmjPnj2KjY3V6tWrNXjwYElSVFSU/vznP2vBggWSpCeeeELbtm0LzwMCAAAAgAayVfFWn1//+tf+4127dlW79uyzz0qSrrzySqWkpNToe/PNNyshIUEVFRVavny5uYECAAAAQCPZ6j1v9fnoo4/8x6mpqf7jrVu36scff5QkjR07NmDfhIQEDR8+XG+++abeeecd/0pcQ02ZkCP5CpoQdXAy01aEfU6Eic8paaC0eaHk9FkdDZpqgMfqCAAAQAth+5W3srIy7dixQ4888oiuueYaSVLPnj118cUX+9t88803/uN+/frVOlbltW+//dakaAEAAACgaWy78hYTE6OysrIa54cOHaoXXnhB0dHR/nO7d+/2H3fv3r3WMSuvFRUVqaSkRAkJCTXalJWVVZu3qKhIkuSKrJBU0ejHESyvz/b1N2pR+dzyHNuc12t1BAF5/x2Xt5nGB/sit2Am8gtmCVVumZ2bti3eunTpoqNHj6qkpESHDx+WJI0cOVL333+/TjzxxGpti4uL/cdxcXG1jln1WnFxccDibeHChQFfUnn5tB11jm2W1XkDwz4nwitrZ3+rQ0Aw8lZbHUGdsrKyrA4BLRS5BTORXzBLsLlVWloaokgCs23xtmPHDv/xvn379Nxzz+nee+/VkCFDdNddd+nuu+82Zd65c+dq5syZ/u+LioqUnJysl//RQ1IbU+asy0uLM8I+J8LD63Mqa2d/jU7+Wi7e82ZffedaHUFAXq9XWVlZGj16tFwul9XhoAUht2Am8gtmCVVuVb4qzyy2Ld6q6tSpk2bNmqXhw4fr7LPP1l//+lcNGTJEF110kSQpMTHR37a0tFRJSUkBx6laKVftU1V0dHS1l2RW8pZHSL6IYB5Gk/BLfcvncvp4nu2smf9y4XK5+AUIpiC3YCbyC2YJNrfMzssWUbxVGjJkiIYNG6YPP/xQTzzxhL9469atm79Nfn5+rcVbfn6+JCkpKSngSybrkp6RKrfb3cTIg+GxYE6Ehdd7/CV3fec2+wIAAAAA5mtxOyFUbjryww8/+M9V3WGy6s6Tv1R57bTTTjMpOgAAAABomhZXvG3fvl1S9Zc99urVy7+JyVtvvRWw3+HDh/2fEzdmzBiTowQAAACAxrFN8VZRUSHDMOpsk52drc8//1ySNGLECP95h8OhSZMmSZJefPHFapudVHr00UdVUlKiiIgIXX311SGLGwAAAABCwTbF286dOzVo0CAtXbpU27dvr1bI7dy5U3/72990ySWXyDAMtW/fXrfeemu1/rNnz1aXLl1UWlqqCy+8UF988YUk6dixY3r88cf13//935KkG264Qb169QrfAwMAAACABrDVhiX/+te/9Mc//lGSFBUVpaSkJB05csT/OW+SlJKSopUrV6pLly7V+rZp00arVq3Seeedp2+//VaDBw9WYmKijh496v8wvTFjxuihhx4K3wMCAAAAgAayTfHWrVs3vfzyy1qzZo3WrVun3bt3q6CgQBERETrxxBM1cOBAXXLJJbrqqqsUGxsbcIwzzjhDmzdv1qJFi7Rq1Srt3LlT8fHx6tevnyZPnqxp06bJ6WzaYuSUCTmSryCYhxhQZtqKkI8Jm/A5JQ2UNi+U6vuogAGecEQEAAAAC9mmeIuKitKECRM0YcKEoMbp3LmzHnzwQT344IMhigwAAAAAzGeb97wBAAAAQGtG8QYAAAAANkDxBgAAAAA2QPEGAAAAADZgmw1Lmrv0jFS53W4TRvaYMCZsweuV8lZLfedKLpfV0QAAAMBirLwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADbDbZIhMmZAj+QpCPm5m2oqQj1mnAZ7wzgcAAACgQVh5AwAAAAAboHgDAAAAABsw/WWTX3/9td599105nU6dd9556tOnj9lTAgAAAECLE/TK23vvvaff/e53uuOOO2pce/DBBzVo0CDNnj1bM2fOVP/+/ZWWlhbslAAAAADQ6gRdvL388sv64IMP1KNHj2rnt23bpr/85S/y+XyKiopSbGysKioqdOutt2rTpk3BTgsAAAAArUrQL5tcu3atJGns2LHVzj/11FOqqKjQb3/7W61atUpRUVG6+uqrlZGRoccee0xPPvlksFM3K+kZqXK73SaM7DFhTAAAAAB2E/TK2759+xQREaETTjih2vm33npLDodD8+bNU3x8vFwulxYuXChJ+vDDD4OdFgAAAABalaCLtwMHDigpKUkOh8N/rri4WJs3b1Z8fLx++9vf+s+npqYqJiZGu3btCnZaAAAAAGhVgi7eYmJidOjQIRmG4T+3du1aGYahs846S05n9SliY2ODnRIAAAAAWp2gi7eePXvK5/Ppgw8+8J975ZVX5HA4NGzYsGptjx07pkOHDqlz587BTgsAAAAArUrQG5ZceOGF2rRpk6699lrdd999+umnn5Seni5JGj9+fLW2mzZtks/n04knnhjstAAAAADQqgRdvM2cOVPLli1Tbm6urrrqKkmSYRi64oor1L9//2ptX3vttYArci3BlAk5kq8grHNmpq0I63ymG+CxOgIAAACg2Qq6eGvbtq3Wrl2r+fPn69NPP1Xbtm110UUXac6cOdXaHTt2TP/4xz9kGIZGjhwZ7LQAAAAA0KoEXbxJUvfu3fXUU0/V2SYqKkp79uwJxXQAAAAA0OoEvWEJAAAAAMB8IVl5+6W8vDzt27dPktSpUyeddNJJZkwDAAAAAK1GyFbedu/erZtvvlmdOnXSySefrF//+tf69a9/rZNPPlkdO3bUzTffzIdzAwAAAEAThWTl7Z133tEVV1yhoqKiah/WXWn//v167LHH9Nxzz+nFF1/U+eefH4ppm5X0jFS53e4wz+oJ83wAAAAArBJ08bZ161aNGzdOR48eVfv27fXHP/5Rv/vd79S9e3dJUn5+vt5//30tXbpUBQUFGj9+vDZt2qTevXsHHTwAAAAAtBZBF29//etfdfToUQ0YMEBZWVnq2LFjteu9e/fW7373O91yyy0aNWqUvv76a91zzz167rnngp0aAAAAAFqNoN/zlp2dLYfDoaeeeqpG4VZVhw4d9OSTT8owDL377rvBTgsAAAAArUrQxVthYaESEhI0ePDgetueeeaZSkhIUGFhYbDTAgAAAECrEnTx1rVrV1VUVDS4vc/nU9euXYOdFgAAAABalaDf83bBBRfo8ccf13vvvaff/e53dbbNzs5WaWmpLrroomCnbXamTMiRfAVhnTMzbUVY5zPdAI/VEQAAAADNVtArb//93/+tTp066dprr9W2bdtqbff999/r+uuvV9euXXXXXXcFOy0AAAAAtCoh+aiAhQsX6tZbb9XAgQN1+eWXB/yogJdfflkxMTF66KGHtGXLFm3ZsqXGWL/5zW+CDQcAAAAAWqSgi7cRI0bI4XD4v1+
+fLmWL18esG1ZWZmmTZsW8JrD4VB5eXmw4QAAAABAixR08SZJhmE0izEAAAAAoKUKunjz+XyhiAMAAAAAUIeQrLyFy/79+/X6668rOztbGzduVF5ensrLy9WxY0cNHjxYkydP1qWXXhqwb3p6uqZOnVrvHFlZWRo1alSjY0vPSJXb7W50v+B4wjwfAAAAAKvYqnjr0qVLtffFxcTEyOVyKT8/X/n5+Xrttdc0duxYZWRkKC4uLuAYTqdTHTt2rHWO6OjokMcNAAAAAMEK+qMCwqm8vFxDhgzRY489ppycHB05ckQlJSXKzc3VtddeK0l68803NX369FrHSE5O1p49e2r9Gj58eLgeDgAAAAA0WKNW3p599llJUps2bXTJJZdUO9dYkyZNanSf9957TyNHjqxxvkePHnrqqacUGRmppUuX6vnnn9d9992n5OTkJsUGAAAAAM1No4q3KVOmyOFwqHfv3v7irfJcYzgcjiYVb4EKt6quvfZaLV26VJK0YcMGijcAAAAALUajircTTzxRDodD3bp1q3GuOYiJifEfV1RUWBgJAAAAAIRWo4q3HTt2NOicVdasWeM/7t+/f8A2P//8s8444wxt3bpVFRUV6tq1q8455xxdd911GjFiRJPnnjIhR/IVNLm/2TLTVlgdQngN8FgdAQAAABBSttqwpC6FhYVauHChJGn48OHq3bt3wHalpaXauHGjoqKi5PP5lJubq+XLl2vkyJGaNm1atd0sAQAAAKC5sNVHBdTG5/Ppmmuu0U8//aSYmBg98sgjNdp069ZN8+fP1/jx49W7d29FR0eroqJC69at0/z58/Xuu+/qmWeeUXx8vNLS0mqdq6ysTGVlZf7vi4qKJEmuyApJzfelml5fi6nTG8brtTqCoHn//Ri8LeCxoPkhv2AWcgtmIr9gllDlltm56TAMwzB1hjC4+eab/QXb008/rWnTpjWqv8/n0/jx4/Xaa6/J6XRqy5YtOuWUUwK29Xg8WrBgQY3zL7zwQq2fLQcAAACg5SstLdVVV12lQ4cOKSkpKeTjh6x4Ky4u1qpVq/TVV1/pwIEDdVadDodDTz/9dCim1ezZs/XAAw9Ikh566CHNmDGjSeP88MMP/oLtgQce0MyZMwO2C7TylpycrEtHrZXUpklzh8NLizOsDiG8+s61OoKgeb1eZWVlafTo0XK5XFaHgxaG/IJZyC2YifyCWUKVW0VFRerQoYNpxVtIXjaZnp6uW265RSUlJf5zgWpCh8MhwzBCVrzddttt/sJtyZIlTS7cJKlnz57q0KGDCgoKtH379lrbRUdHKzo6usZ5b3mE5Ito8vxmczl9VocQXi3ohu5yufgBBdOQXzALuQUzkV8wS7C5ZXZeBl28vf3227r22mtlGIZiYmJ09tlnq1u3boqMNPftdHPmzNGSJUskSffff79mzZpl6nz1Sc9IldvttjSGunmsDgAAAABAEIKusO6//34ZhqGzzz5br732mjp06BCKuOpU9aWS999/v+bMmRP0mDk5OSooOL7Vf0pKStDjAQAAAEAoBV28ffHFF3I4HEpPTw974bZkyZIGrbhVvlSzruuVBaDT6dRFF10UmmABAAAAIESC3j++vLxcCQkJte7OGEpV3+P24IMPNvilknl5eRoyZIiWLl2q7du3+9+P5/P59Nlnn2ns2LF69dVXJUnTp0+v9TPiAAAAAMAqQa+8paamauvWraqoqFBEhHkbdvz4449avHixpOOrY4sWLdKiRYtqbT979mzNnj3b//369eu1fv16Scc3HUlMTFRxcXG1nSOnTp2qv//97yY9AgAAAABouqCLt4kTJ+r222/Xm2++aerLDX0+X7XjvXv31tm+6s6XnTt3Vlpamj799FN9+eWX+vnnn3Xw4EHFxMQoJSVF55xzjqZNm6ahQ4eaFj8AAAAABCPo4m3GjBlauXKl/vSnP6l3796mvXyyR48eAT9+oCFiY2N100036aabbgpxVP8xZUKO5CswbfxAMtNWhHU+hJnPKWmgtHmh1No+6gGhN8BjdQQAACBIjSrenn322YDnr7nmGs2bN08DBw7UhAkTdNZZZykxMbHOsSZNmtSYqQEAAACgVWtU8TZlypR6d21cvny5li9fXuc4DoeD4g0AAAAAGqFRxduJJ55YZ/EGAAAAADBHo4q3HTt2mBQGAAAAAKAuQW9YguPSM1LldrvDPKsnzPMhrLxeKW+11Heu5HJZHQ0AAAAsZlrxduzYMb311lvaunWroqOjdfrpp2vYsGFmTQcAAAAALVqji7fi4mK9+uqrkqQrrrhC0dHRNdps2LBBl112mXbt2lXt/FlnnaVXXnlFXbp0aWK4AAAAANA6ORvbITs7W1OmTNHDDz8csHDbt2+fLrjgAu3atUuGYVT7WrdunX7/+9+HJHAAAAAAaE0aXbx99NFHkqSrrroq4PVFixapoOD4h1VPnjxZn3zyif71r3/p1ltvlWEY+uKLL5SRkRFEyAAAAADQ+jT6ZZOff/65HA6Hzj///IDXly9fLofDoYsvvljPPPOM//wDDzygAwcOaNmyZVq5cqUmTJjQ9KgBAAAAoJVpdPH2008/KTIyUqeddlqNa5s3b9a+ffvkcDj0//7f/6tx/ZZbbtGyZcu0adOmpkXbjE2ZkCP5CsI6Z2bairDOhzDzOSUNlDYvlJy+xvcf4Al1RAAAALBQo182uXfvXiUlJcnprNn1888/lyRFRUUF3FmyX79+cjgc2r17dxNCBQAAAIDWq9HFW0VFhYqKigJe++KLLyRJp556qqKiompcj4yMVLt27XTkyJHGTgsAAAAArVqji7dOnTqpvLxcOTk5Na59+umncjgcOvPMM2vtX1JSovj4+MZOCwAAAACtWqOLt9NPP12S9MQTT1Q7//333+vLL7+UJP32t78N2DcvL0/Hjh3TCSec0NhpAQAAAKBVa3Tx9oc//EGGYeihhx7S4sWLtXXrVmVnZ+vyyy+XYRiKj4/XxRdfHLDvhx9+KOn4e98AAAAAAA3X6N0mL7/8cj366KP68MMPdfvtt+v222/3X3M4HJo5c6YSExMD9n3ppZfkcDgCbmZid+kZqXK73WGe1RPm+RBWXq+Ut1rqO1dyuayOBgAAABZr9MqbJL322mu66KKLZBiG/0uSrrvuOs2bNy9gn++//15vvfWWJOmCCy5oYrgAAAAA0Do1euVNktq0aaPXX39dP/zwg/99bmeeeaZOOumkWvu4XC699tprcrlcOvnkk5sULAAAAAC0Vk0q3ir17NlTPXv2bFDbHj16qEePHsFMBwAAAACtVpNeNgkAAAAACC+KNwAAAACwgaBeNon/mDIhR/IVhH3ezLQVYZ8TYeJzShoobV4oOX2N7z/AE+qIAAAAYCFW3gAAAADABijeAAAAAMAGKN4AAAAAwAYo3gAAAADABijeAAAAAMAG2G0yRNIzUuV2uy2Y2WPBnAgLr1fKWy31nSu5XFZHAwAAAIux8gYAAAAANkDxBgAAAAA2QPEGAAAAADZA8QYAAAAANkDxBgAAAAA2wG6TIT
JlQo7kK7A0hsy0FTVPDvCEPQ4AAAAAocfKGwAAAADYAMUbAAAAANgAxRsAAAAA2ADFGwAAAADYAMUbAAAAANgAu02GSHpGqtxut8VReCyeHwAAAIBZbLPytn//fj3zzDOaOHGiTjvtNMXHxys6OlonnHCCxo0bp1dffbXeMYqLi+XxeNS/f38lJCSoTZs2OvPMM/XAAw/o2LFjYXgUAAAAANA0tll569Kli8rLy/3fx8TEyOVyKT8/X/n5+Xrttdc0duxYZWRkKC4urkb/vLw8jRgxQjt27JAkxcXFqaysTBs2bNCGDRu0fPlyZWdnq127duF6SAAAAADQYLZZeSsvL9eQIUP02GOPKScnR0eOHFFJSYlyc3N17bXXSpLefPNNTZ8+PWDfiy++WDt27FDXrl2VlZWlw4cPq7S0VC+++KISExO1adMmTZw4MdwPCwAAAAAaxDbF23vvvad169bpxhtv1Mknn+w/36NHDz311FP+ou3555/Xzp07q/VdtmyZvv76a0nSypUrNWrUKEmS0+nUFVdcoaVLl0qSVq9erezs7HA8HAAAAABoFNsUbyNHjqzzeuXqmyRt2LCh2rVly5b5xzj77LNr9L3yyiuVkpIiSXr22WeDDRUAAAAAQs4273mrT0xMjP+4oqLCf1xaWqpPPvlEkjR27NiAfR0Oh84//3w9/vjjeuedd5o0/5QJOZKvoEl9g5GZtiLsc4bVAI/VEQAAAADNgm1W3uqzZs0a/3H//v39x9999518Pp8kqV+/frX2r7y2Z88eHThwwJwgAQAAAKCJWsTKW2FhoRYuXChJGj58uHr37u2/tnv3bv9x9+7dax2j6rXdu3erffv2AduVlZWprKzM/31RUZEkyRVZIakiYB8zeX0tpv4OzOu1OgLLeP/92L2t+N8A5iG/YBZyC2Yiv2CWUOWW2blp++LN5/Ppmmuu0U8//aSYmBg98sgj1a4XFxf7jwN9hECga1X7/NLChQu1YMGCGucvn7ajzvHNsjpvYNjnDKu81VZHYLmsrCyrQ0ALRn7BLOQWzER+wSzB5lZpaWmIIgnM9sXbLbfcolWrVkmSHn30UQ0YMMDU+ebOnauZM2f6vy8qKlJycrJe/kcPSW1MnTuQlxZnhH3OsOo71+oILOP1epWVlaXRo0fL5XJZHQ5aGPILZiG3YCbyC2YJVW5VvirPLLYu3mbPnu1faXvooYc0bdq0Gm0SExP9x3VVwlWvVe3zS9HR0YqOjq5x3lseIfkiGhR3KLmcvrDPGVbcmOVyufgBBdOQXzALuQUzkV8wS7C5ZXZe2rZ4u+222/TAAw9IkpYsWaIZM2YEbNetWzf/cX5+fq0rc/n5+QH7NFR6Rqrcbnej+wXPY8GcAAAAAMLNlrtdzJkzR4sXL5Yk3X///Zo1a1atbU899VQ5nccf5jfffFNru8prXbp0qXWzEgAAAACwiu2Kt9mzZ2vJkiWSjhduc+bMqbN9XFychg4dKkl66623ArYxDENvv/22JGnMmDEhjBYAAAAAQsNWxdvs2bOrvVSyvsKt0uTJkyVJ77//vtatW1fj+ssvv6zt27dLkiZNmhSiaAEAAAAgdGxTvFV9j9uDDz5Y50slf2ny5Mnq37+/DMPQZZddpuzsbEnHP2bg5Zdf1vXXXy9JGjt2rM4999zQBw8AAAAAQbJF8fbjjz/63+PmdDq1aNEidenSpdavypdVVoqMjNTrr7+uHj16KD8/X6NGjVJ8fLzi4+P1X//1XyoqKtKgQYO0fPlyKx4eAAAAANTLFrtN+ny+asd79+6ts31JSUmNcz169NBXX32lJUuW6JVXXlFubq5cLpf69u2rP/zhD7r55psVFRXV5BinTMiRfAVN7v9LmWkrQjaWZQZ4rI4AAAAAaDFsUbz16NFDhmEEPU5iYqIWLFigBQsWhCAqAAAAAAgfW7xsEgAAAABaO4o3AAAAALABijcAAAAAsAGKNwAAAACwAVtsWGIH6RmpcrvdIRzRE8KxAAAAANgdK28AAAAAYAMUbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgA+w2GSJTJuRIvgLT58lMW2H6HGgmfE5JA6XNCyWnz+po0NKQXzBL1dz61TyrowGAFoWVNwAAAACwAYo3AAAAALABijcAAAAAsAGKNwAAAACwAYo3AAAAALABdpsMkfSMVLnd7jDM5AnDHGgWvF4pb7XUd67kclkdDVoa8gtmqZpbAICQYuUNAAAAAGyA4g0AAAAAbIDiDQAAAABsgOINAAAAAGyADUtCZMqEHMlXEJa5MtNWhGUeWMznlDRQ2rxQcvqsjgYtTWvMrwEeqyMAACAorLwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADbDbZIikZ6TK7XaHaTZPmOaBpbxeKW+11Heu5HJZHQ1aGvILAADbYeUNAAAAAGyA4g0AAAAAbIDiDQAAAABsgOINAAAAAGyA4g0AAAAAbIDdJkNkyoQcyVdgdRjVZKatsDoEBMPnlDRQ2rxQcvqsjgYtDfmFXxrgsToCAEA9WHkDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG2C3yRBJz0iV2+22Ooxf8FgdAILh9Up5q6W+cyWXy+po0NKQXwAA2I6tVt5KS0v15ptv6p577tH48eN10kknyeFwyOFwyOPx1NnX4/H429b19cMPP4TnwQAAAABAI9hq5e3zzz/XBRdcENQYLpdL7du3r/V6ZKSt/kkAAAAAtBK2q1TatWun008/3f916623as+ePQ3uf84552jNmjXmBQgAAAAAJrBV8TZ8+HAdOHCg2rnbb7/domgAAAAAIHxs9Z63iIgIq0MAAAAAAEvYauWtOZsyIUfyFVg2f2baCsvmhkl8TkkDpc0LJafP6mjQ0pBfMEtzz60BHqsjAIAms9XKWyhs3rxZ/fr1U1xcnBISEtS7d29df/312rRpk9WhAQAAAECtWt3KW0FBgQ4cOKC2bduqqKhI27Zt07Zt2/T000/rjjvu0D333FNn/7KyMpWVlfm/LyoqkiS5IiskVZgZep28vlZXh7d4lc8pzy3MQH7BLM0+t7xeqyNAELz/fv68PI8IsVDlltm52WqKt1NOOUX333+/LrnkEqWkpMjlcunYsWNas2aN7rjjDn3xxRe699571a5dO82aNavWcRYuXKgFCxbUOH/5tB2Ki4sz8yHUaXXeQMvmhrmydva3OgS0YOQXzNJscytvtdURIASysrKsDgEtVLC5VVpaGqJIAnMYhmGYOoPJevTooby8PM2fP7/eD+quzdGjR/Wb3/xG69evV0JCgnbt2qU2bdoEbBto5S05OVmXjlorKXCfcHhpcYZlc8McXp9TWTv7a3Ty13I1x/eNwNbIL5il2edW37lWR4AgeL1eZWVlafTo0XK5XFaHgxYkVLlVVFSkDh066NChQ0pKSgphhMe1mpW3usTExOi+++7T6NGjVVJSouzsbI0fPz5g2+joaEVHR9c47y2PkHzW7YbZL
H9AIiRcTh/PL0xDfsEszTa3+IW/RXC5XBRvMEWwuWV2XlK8/dvZZ5/tP96+fXuj+6dnpMrtdocypEbyWDg3TOH1Hn95T9+5/LKB0CO/YBZyCwBM00zfTQwAAAAAqIri7d8+++wz/3FKSoqFkQAAAABATa2ieKtvT5aysjLdeeedkqT4+Hide+654QgLAAAAABrMdsXbwYMHVVBQ4P/y+Y6/Gbq0tLTa+ZKSEn+fDz/8UKNGjdJzzz2nXbt2+c97vV5lZ2dr+PDhWrdunSRp3rx5atu2bVgfEwAAAADUx3YblgwaNEh5eXk1zi9evFiLFy/2fz958mSlp6dLOr7ylp2drezsbElSbGys4uPjdejQIf8H6TmdTt1+++267bbbzH8QAAAAANBItivemqJ///5asmSJPv30U3399dcqKChQYWGh4uLidNppp2n48OG64YYb1L9/0z9QdMqEHMlXEMKo65eZtiKs8yHMfE5JA6XNC6XK7bYHeKyMCAAAABayXfG2Y8eORvdxu92aNWtW6IMBAAAAgDCx3XveAAAAAKA1ongDAAAAABugeAMAAAAAG6B4AwAAAAAbsN2GJc1Vekaq3G53mGf1hHk+hJXXK+WtlvrOlVwuq6MBAACAxVh5AwAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABtgt8kQmTIhR/IVmD5PZtoK0+dAM+FzShoobV4oOX3WxjLAY+38AAAAYOUNAAAAAOyA4g0AAAAAbIDiDQAAAABsgOINAAAAAGyA4g0AAAAAbIDdJkMkPSNVbrc7DDN5wjAHmgWvV8pbLfWdK7lcVkcDAAAAi7HyBgAAAAA2QPEGAAAAADZA8QYAAAAANkDxBgAAAAA2QPEGAAAAADbAbpMhMmVCjuQrCNt8mWkrwjZXnQZ4rI4AAAAAaBVYeQMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAbYLfJEEnPSJXb7Q7jjJ4wzgUAAADAaqy8AQAAAIANULwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA2w22SITJmQI/kKwjpnZtqKsM6HMPM5JQ2UNi+UnL7j5wZ4rIwIAAAAFmLlDQAAAABsgOINAAAAAGyA4g0AAAAAbIDiDQAAAABsgA1LQiQ9I1VutzvMs3rCPB/CyuuV8lZLfedKLpfV0QAAAMBitll5Ky0t1Ztvvql77rlH48eP10knnSSHwyGHwyGPx9OgMfbu3atZs2apd+/eio2NVfv27TV8+HA99dRTMgzD3AcAAAAAAEGwzcrb559/rgsuuKDJ/b/44gudd9552r9/vyQpISFBxcXF+vjjj/Xxxx8rIyNDr7/+uqKiokIVMgAAAACEjG1W3iSpXbt2OvfcczVnzhytWLFCXbp0aVC/Q4cO6aKLLtL+/fvVp08frV+/XsXFxTp8+LAeeeQRuVwuvf3225oxY4a5DwAAAAAAmsg2K2/Dhw/XgQMHqp27/fbbG9R3yZIl2rNnj2JjY7V69WqlpKRIkqKiovTnP/9ZRUVFuuOOO/TEE09oxowZ6tWrV8jjBwAAAIBg2GblLSIiosl9n332WUnSlVde6S/cqrr55puVkJCgiooKLV++vMnzAAAAAIBZbLPy1lRbt27Vjz/+KEkaO3ZswDYJCQkaPny43nzzTb3zzjtasGBBo+eZMiFH8hUEFWswMtNWWDY3TOJzShoobV4oOX1WR1O3AR6rIwAAAGjxbLPy1lTffPON/7hfv361tqu89u2335oeEwAAAAA0Votfedu9e7f/uHv37rW2q7xWVFSkkpISJSQkBGxXVlamsrIy//dFRUWSJFdkhaSKEETcNF5fi6/DW53K59QWz63Xa3UEaCTvv58zL88dQozcgpnIL5glVLlldm62+OKtuLjYfxwXF1dru6rXiouLay3eFi5cGPBllZdP21Hn+GZbnTfQsrlhrqyd/a0OoX55q62OAE2UlZVldQhoocgtmIn8glmCza3S0tIQRRJYiy/eQm3u3LmaOXOm//uioiIlJyfr5X/0kNTGsrheWpxh2dwwh9fnVNbO/hqd/LVczf09b33nWh0BGsnr9SorK0ujR4+Wy+WyOhy0IOQWzER+wSyhyq3KV+WZpcUXb4mJif7j0tJSJSUlBWxXtUqu2ueXoqOjFR0dXeO8tzxC8jV9R8xgNftf7tFkLqev+T+//AC1LZfLxS9AMAW5BTORXzBLsLlldl62+OKtW7du/uP8/Pxai7f8/HxJUlJSUq0vmaxLekaq3G5304IMCY+Fc8MUXu/xlyP2nUtxBAAAgJa/22TVHSar7jz5S5XXTjvtNNNjAgAAAIDGavHFW69evXTiiSdKkt56662AbQ4fPqyPPvpIkjRmzJiwxQYAAAAADdXiizeHw6FJkyZJkl588UXt2LGjRptHH31UJSUlioiI0NVXXx3mCAEAAACgfrYq3g4ePKiCggL/l893fBOH0tLSaudLSkqq9Zs9e7a6dOmi0tJSXXjhhfriiy8kSceOHdPjjz+u//7v/5Yk3XDDDerVq1d4HxQAAAAANICtirdBgwapY8eO/q+dO3dKkhYvXlzt/E033VStX5s2bbRq1Sq53W59++23Gjx4sH9jkj/96U86duyYxowZo4ceesiKhwUAAAAA9Wrxu01WOuOMM7R582YtWrRIq1at0s6dOxUfH69+/fpp8uTJmjZtmpzOpteyUybkSL6CEEbcMJlpK8I+p+0N8FgdAQAAANBotireAr1frTE6d+6sBx98UA8++GBoAgIAAACAMLHVyyYBAAAAoLWieAMAAAAAG6B4AwAAAAAboHgDAAAAABuw1YYlzVl6RqrcbrcFM3ssmBMAAABAuLHyBgAAAAA2QPEGAAAAADZA8QYAAAAANkDxBgAAAAA2QPEGAAAAADbAbpMhMmVCjuQrMG38zLQVpo0dcgM8VkcAAAAAtDisvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADVC8AQAAAIANsNtkiKRnpMrtdps4g8fEsQEAAAA0d6y8AQAAAIANULwBAAAAgA1QvAEAAACADfCeNwAAAKCRDMOQ1+uVz+ezOhSEgNfrVWRkpI4ePSrDMORyueRwOKwOqwaKNwAAAKCBKioqVFBQoOLiYnm9XqvDQYgYhqEuXbpo586dcjgccrlcSkxMVIcOHRQREWF1eH4UbyEyZUKO5CswbfzMtBWhHXCAJ7TjAQAAtHAVFRXauXOnysrK1KZNGyUkJCgiIqJZrtCgcXw+n0pKShQfHy/DMFRSUqLCwkIdOXJEycnJzaaAo3gDAAAAGqCgoEBlZWU68cQTFRsba3U4CCGfz6djx44pNjZWTqdTCQkJatOmjX788UcVFBSoc+fOVocoiQ1LAAAAgHoZhqHi4mK1adOGwq2ViI2NVVJSkoqLi2UYhtXhSKJ4AwAAAOrl9Xrl9XqVkJBgdSgIo8TERP9z3xxQvAEAAAD1qNxVsrm89wnhUfl8N5ddRSneAAAAgAZic5LWpbk9
32xYEiLpGalyu90mzuAxcWwAAAAAzR0rbwAAAABgAxRvAAAAAGADFG8AAAAAYAO85w0AAAAIpa88VkcQGgM8pg5vGIYyMjL0wgsvaOPGjdq3b58iIiLUuXNnde3aVUOGDNHw4cN17rnnKikpyd9vypQpWrZsWbWxqn6w9sknn6xBgwbpvPPO05gxY+R0tpz1Koo3AAAAAGFVWFiocePG6YMPPvCfi4yMVFxcnH788Udt375dn3zyiR566CE988wzmjJlSo0xnE6nOnbs6P/+8OHD2rlzp3bu3KkPPvhADz/8sJKTk/XQQw/psssuC8fDMh3FW4hMmZAj+QpMGz8zbUVoBzT5LykAAABAbSZNmqQPPvhAERERmjFjhqZPn67U1FQ5nU6Vl5fr22+/1VtvvaUXXnih1jGSk5O1Y8eOaueOHTumr776Sm+88YYef/xx7dy5UxMmTNDcuXN13333mfyozNdy1hABAAAANHvff/+9MjMzJUn33HOPlixZolNOOcX/8sbIyEgNGDBAt912m7788ktdccUVDR47KipKgwcP1vz587V582aNHDlSkrRw4cI6C0G7oHgDAAAAEDZffvml//iSSy6pt31sbGyT5nG73XrllVfUvXt3SdJdd90lr9fbpLGaC4o3AAAAAJbYtWuXqeO3bdtWM2bMkCTl5ubqo48+MnU+s1G8AQAAAAibM888Uw6HQ5I0a9Ysbdu2zdT5LrzwQv9x1Q1S7IjiDQAAAEDY9OjRQ9ddd50k6euvv1afPn10+umn689//rP+8Y9/6JtvvpFhGCGbr0+fPoqKipIk5eTkhGxcK7Sq3SbT09M1derUettlZWVp1KhRjRs7I1Vut7upoTWAx8SxAQAAgPB57LHH1KVLFz344IM6fPiwNm3apE2bNvmvd+rUSVdffbX+8pe/qHPnzkHN5XA41K5dO+3du1cHDhwINnRLtcqVN6fTqc6dO9f6FR0dbXWIAAAAQIsVGRmpu+++W/n5+Xruued03XXXaeDAgf4Vsn379umhhx5Sv3799Pnnn1scbfPRqlbeKgX6TAgAAAAA4dWmTRtNnDhREydOlCQdPXpUH3/8sf7+978rMzNTBQUFuuyyy/T9998rJiamSXMYhqHCwkJJMvmVcuZrlStvAAAAAJqfmJgYjRo1Sq+//romT54s6fiOlG+99VaTx9yyZYvKysokSampqSGJ0yoUbwAAAACanRtuuMF/vHXr1iaP88Ybb/iPR4wYEUxIlqN4AwAAANDsJCQk+I+buidFYWGh/ud//kfS8VW3YcOGhSQ2q7TK97z9/PPPOuOMM7R161ZVVFSoa9euOuecc3Tdddc1uRqfMiFH8hWENtBfyExbYer4TTbAY3UEAAAAsInc3Fx5vV716tWrznbLli3zH59++umNnufAgQO6/PLL/R8Efu+99yoy0t7lT6tceSstLdXGjRsVFRUln8+n3NxcLV++XCNHjtS0adNUXl5udYgAAABAi7R582adeuqpuvDCC/Xss89W20jQ6/Vq06ZNmjp1qh588EFJ0pAhQxq8Yub1evXFF1/o7rvv1mmnnab33ntPknTXXXfpiiuuCPljCTd7l56N1K1bN82fP1/jx49X7969FR0drYqKCq1bt07z58/Xu+++q2eeeUbx8fFKS0sLOEZZWZn/DY+SVFRUJElyRVZIqjA1fq+vmdbaXq/VEbRI3n//u3r594UJyC+YhdyCmazML6/XK8Mw5PP55PP56mzrCOEHTFvJqOdxNlVERIR8Pp9Wr16t1atXS5KioqKUkJCggwcPVvuA7tNPP10rV66UJP+/e+X1nTt3qkuXLv62R44cUXFxcbX+J554oh566CGNGzeuzuetsk/lc1zJ5/PJMAx5vV5FRETU+9jMzk2HEcqPL7cxn8+n8ePH67XXXpPT6dSWLVt0yimn1Gjn8Xi0YMGCGudfeOEFxcXFhSNUAAAAhFlkZKS6dOmi5ORk/2eR1SZm29/CFJW5jva63bSxt2/frqysLH322Wf67rvvtHv3bh0+fFixsbHq0qWLBgwYoIsuukjjxo2T01l9AeNPf/qTVqyo/nYih8OhhIQEJSYmqkePHho4cKDOPfdcjRw5skb/xjh27Jh27typPXv2NOjVeaWlpbrqqqt06NAhJSUlNXne2lC8VfHDDz/4C7YHHnhAM2fOrNEm0MpbcnKyLh21VlIbU+N7aXGGqeM3Wd+5VkfQInm9XmVlZWn06NFyuVxWh4MWhvyCWcgtmMnK/Dp69Kh27typHj16NPnzxtB8GYah4uJiJSYmyuFw+M8fPXpUO3bsUHJycoOe96KiInXo0MG04q1VvWyyPj179lSHDh1UUFCg7du3B2wTHR0dcLcbb3mE5Kt/KTUYLqc5S9dB44ezqVwuF78AwTTkF8xCbsFMVuRXRUWFHA6HnE5nUCs5aJ4qXypZ+RxXcjqdcjgcDc45s/OS4i1E0jNSw/CJ7R6TxwcAAADQXPFngypycnJUUHB8u/+UlBSLowEAAACA/2g1xVt9b+0zDENz5syRdHx59KKLLgpHWAAAAADQIK2meMvLy9OQIUO0dOlSbd++3V/M+Xw+ffbZZxo7dqxeffVVSdL06dPVu3dvK8MFAAAAgGpa1Xve1q9fr/Xr10s6vvFIYmKiiouLq+0eOXXqVP3973+3KkQAAAAACKjVFG+dO3dWWlqaPv30U3355Zf6+eefdfDgQcXExCglJUXnnHOOpk2bpqFDhzZp/CkTciRfQYijbrrMtBX1NwqFAZ7wzAMAAAC0cq2meIuNjdVNN92km266yepQAAAAAKDRWs173gAAAADAzijeAAAAAMAGKN4AAAAAwAYo3gAAAADABlrNhiVmS89IldvttjqMKjxWBwAAAAAghFh5AwAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAEFYej0cOh6PGV0xMjE444QT9/ve/1z//+U8ZhlGtX3p6esB+CQkJ6tatmwYPHqzrrrtOy5Yt0+HDhy16dOZht8kQmTIhR/IVWB1GSGSmrbA6BEiSzylpoLR5oeT0WR0NWhryC2Yht2CmuvJrgMeKiAK6+GKrIwiNzMzwzNO5c2f/8aFDh5Sfn6/8/HxlZmYqPT1dr776qqKjo2v069ChgyIiIiRJx44d0969e/XTTz/piy++0NNPP62bbrpJc+bM0R133KHIyJZR9rDyBgAAAMAye/bs8X8dPnxY33zzjUaPHi1JevPNN3XXXXcF7Ld+/Xp/vwMHDqi8vFxbtmzR//7v/6p///4qKSnR/PnzNXr0aJWVlYXzIZmG4g0AAABAs+B0OtW3b1+9/vrr6tmzpyRp6dKlKi8vr7evw+FQ7969NX36dG3atEkzZ86UJK1Zs0b/7//9P1PjDheKNwAAAADNSkxMjC6//HJJUnFxsbZs2dKo/hEREXrggQd04YUXSpKefvppbdu2LeRxhhvFGwAAAIBm54QTTvAfFxUVNWkMj8cjSaqoqNDzzz8firAsRfEGAAAAoNnZsWOH/7h9+/ZNGmP
w4MHq1KmTJOmDDz4IRViWahnbrjQD6RmpcrvdVocRIh6rA4Akeb1S3mqp71zJ5bI6GrQ05BfMQm7BTORXq1FUVKTly5dLOl649erVq8ljDRw4UFlZWcrJyQlVeJZh5Q0AAABAs1BYWKjs7Gz97ne/0+7duyVJt9xyi5zOppctlat2Bw4cCEmMVmLlDQAAAIBlHA5HrdcmTpyoO++8M4zRNG8UbwAAAAAsU/VDuqOjo9WhQwcNGjRIV199tUaOHBn0+JUrbi3hLU4UbwAAAAAss2fPHlPH/+qrryRJqampps4TDrznDQAAAECLtGHDBu3du1eSNGLECGuDCQFW3kJkyoQcyVcQ1jkz01aEdT6Emc8paaC0eaHk9B0/N8BjZUQAAAC2smDBAknHP7R74sSJFkcTPFbeAAAAALQoFRUVmjVrllatWiVJuv7669WzZ0+LowoeK28AAAAAbM8wDP3www96//339eijj/rf63buuefqf/7nfyyOLjQo3gAAAADYzplnnqmIiAhJktfr1aFDh1RRUeG/npiYqL/85S/6y1/+osjIllH2tIxHAQAAAKBVKSj4z34TcXFx6tixo7p3765f/epX+s1vfqPLLrtM8fHxFkYYehRvAAAAQAhlZlodQfPn8Xjk8Xga3W/KlCmaMmVKyOOxC4q3EEnPSLXgg/88YZ4PYeX1Snmrpb5zJZfL6mgAAABgMXabBAAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABtgt8kQmTIhR/IV1N/QBJlpKyyZFybzOSUNlDYvlJw+q6MJ3gCP1REAAADYGitvAAAAAGADFG8AAAAAYAMUbwAAAEADGYZhdQgIo+b2fFO8AQAAAPVwOo//2lxRUWFxJAinyue78vm3WvOIAgAAAGjGXC6XXC6XSkpKrA4FYVRcXOx/7psDdpsMkfSMVLndbotm91g0L0zl9Up5q6W+c6VmcsMAAKC1cjgcSkxMVGFhodq0aaPY2FirQ4LJjhw5oqKiIrVt21YOh8PqcCS1wuKtuLhYDzzwgFauXKnc3FxFRESoV69euvLKK3XzzTcrKirK6hABAADQDHXo0EFHjhzRjz/+qKSkJCUmJioiIqLZ/GKPpvP5fDp27JiOHDkiwzBUXFysoqIiRUdHq0OHDlaH59eqire8vDyNGDFCO3bskCTFxcWprKxMGzZs0IYNG7R8+XJlZ2erXbt21gYKAACAZiciIkLJyckqKChQcXGxCgsLrQ4JIWIYho4cOaLY2Fg5HA65XC61bdtWHTp0UEREhNXh+bWa4q28vFwXX3yxduzYoa5du+rZZ5/VqFGj5PP59PLLL+v666/Xpk2bNHHiRL3xxhtWhwsAAIBmKCIiQp07d1anTp3k9Xrl8/msDgkh4PV69eGHH+q3v/2toqKi5HK5muWKaqsp3pYtW6avv/5akrRy5UqdffbZko7vHHPFFVfI5/Ppqquu0urVq5Wdna1zzz3XynABAADQjDkcDt5u04JERESovLxc0dHRzWZzkkBazW6Ty5YtkySNHDnSX7hVdeWVVyolJUWS9Oyzz4Y1NgAAAACoT6tYeSstLdUnn3wiSRo7dmzANg6HQ+eff74ef/xxvfPOO42eY8qEHMlXEFSclTLTVoRkHNiczylpoLR5oeQ0+SUZAzzmjg8AAICgtYqVt++++87/euR+/frV2q7y2p49e3TgwIGwxAYAAAAADdEqVt52797tP+7evXut7ape2717t9q3b1+jTVlZmcrKyvzfFxUVSZJckRWSKkIQreT1tYqaGvWozIOw5IPXa/4caFa8/37OvTz3CDFyC2Yiv2CWUOWW2bnZKoq34uJi/3FcXFyt7apeq9qnqoULF2rBggU1zl8+bUedYzfG6ryBIRkHLUPWzv7mT5K32vw50CxlZWVZHQJaKHILZiK/YJZgc6u0tDREkQTWKoq3UJo7d65mzpzp/76oqEjJycl6+R89JLUJyRwvLc4IyTiwN6/Pqayd/TU6+Wu5zH7PW9+55o6PZsfr9SorK0ujR49u1rtqwX7ILZiJ/IJZQpVbla/KM0urKN4SExP9x3VVw1WvVe1TVXR0tKKjo2uc95ZHSL7QfICf6b+ow1ZcTp/5OcEPwFbL5XLxCxBMQW7BTOQXzBJsbpmdl62ieOvWrZv/OD8/XwMGDAjYLj8/P2CfhkjPSJXb7W5agDV4QjQObM3rPf5yxr5zKa4AAADQOnabPPXUU+V0Hn+o33zzTa3tKq916dIl4GYlAAAAAGCVVrHyFhcXp6FDh+qjjz7SW2+9pTlz5tRoYxiG3n77bUnSmDFjGjy2YRiSjm9wwvI9Qsnr9aq0tFRFRUXkFkKO/IJZyC2YifyCWUKVW5XveausEUKtVRRvkjR58mR99NFHev/997Vu3TqdddZZ1a6//PLL2r59uyRp0qRJDR53//79kqSUlJTQBQsAAADAtoqLi9WmTWg2M6zKYZhVFjYz5eXlOv300/X111+re/fuWrZsmc4991z5fD6tXLlS1113nYqKijR27FitXt3wbdMLCwvVrl07/fjjj6Y8QWi9Kncy3blzp5KSkqwOBy0M+QWzkFswE/kFs4QqtwzDUHFxsbp16+Z/21YotZqVt8jISL3++usaOXKkduzYoVGjRikuLk4+n09Hjx6VJA0aNEjLly9v1LiVT0qbNm24icAUSUlJ5BZMQ37BLOQWzER+wSyhyC0zF3RaxYYllXr06KGvvvpK8+bNU79+/eRwOORyuXTGGWdoyZIl+uyzz9SuXTurwwQAAACAGlrNylulxMRELViwQAsWLLA6FAAAAABosFa18maG6OhozZ8/P+AHdwPBILdgJvILZiG3YCbyC2axS261mg1LAAAAAMDOWHkDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABugeGui4uJieTwe9e/fXwkJCWrTpo3OPPNMPfDAAzp27JjV4cEEpaWlevPNN3XPPfdo/PjxOumkk+RwOORwOOTxeBo0xt69ezVr1iz17t1bsbGxat++vYYPH66nnnpKDdk7KCcnR9OnT1dKSopiYmLUsWNHnXfeeVq5cmWD5t+4caMmTpyoE044QdHR0eratasuvfRSvffeew3qD/Ps379fzzzzjCZOnKjTTjtN8fHxio6O1gknnKBx48bp1VdfrXeMYO9LVucnzLFx40YtWLBAv//979WnTx+53W65XC653W4NHTpU9957rw4cOFDnGFbnBvcue/nb3/7m//nocDjqbMt9C7VJT0+vlke1fb377ru1jmH1vef999/XpZdeqq5du/p/pk+cOFEbN25sUP+ADDTajh07jB49ehiSDElGXFycER0d7f9+0KBBxoEDB6wOEyH2/vvv+5/jX37Nnz+/3v4bNmww3G63v09CQoIRGRnp//68884zysrKau3/xhtvGHFxcf72SUlJhtPp9H8/depUw+fz1dr/ySefrDZfmzZtDIfD0ajHAPNUfW4kGTExMUZ8fHy1c2PHjjUOHz4csH+w9yWr8xPm+fOf/1
wjtxITE6ud69Chg7F27dqA/a3ODe5d9rJlyxYjJiamWn7VhvsW6vLMM88Ykgyn02l07ty51q8PP/wwYH+r7z3z58/3t3U4HEabNm3830dGRhpPPvlkk/5dKN4ayev1Gv379zckGV27djWysrIMwzCMiooK48UXX/T/QLzgggssjhSh9v777xvt2rUzzj33XGPOnDnGihUrjC5dujToP3BhYaG/bZ8+fYz169cbhmEYZWVlxiOPPGK4XC5DknHjjTcG7L99+3b/L/JDhw41tm7dahiGYRQXFxvz5s3z3wwWLVoUsP/atWuNiIgIQ5Ixbtw4Y+fOnYZhGEZBQYExffp0f/+XXnqpif86CJYkY8iQIcZjjz1m5OTk+M/n5uYa1157rf85mjhxYo2+wd6XrM5PmGvZsmXG4sWLjU8//dQ4ePCg/3xxcbGxbNkyo2PHjoYko1OnTkZhYWG1vlbnBvcue6moqDDOOeccQ5Jx9tln11m8cd9CfSqLt5NOOqnRfa2+97z00kv+NtOnTzcKCgoMwzCMnTt3GuPGjTMkGREREbX+0awuFG+N9NRTT/mfjED/4C+88IL/+rvvvmtBhDBLeXl5jXMnnXRSg4q3u+66y5BkxMbGGtu3b69x/b777vP/R668wVQ1ceJEQ5LRpUuXar98Vbrhhhv8f1UK9FfKYcOGGZKM/v37G8eOHatx/bzzzjMkGT169Aj4OGG+9957r87rVX9Y/Pjjj9WuBXtfsjo/Ya23337bnx/PP/98tWtW5wb3Lnt5+OGHDUnG1VdfXW3VIRDuW6hPMMWblfee8vJy/++H559/fo2+ZWVlRr9+/QxJxrBhwxr92CjeGmn48OGGJGPkyJEBr/t8PiMlJcWQZEyaNCnM0SHcGlq8nXjiif4l+kCKi4uNhIQEQ5Ixb968atdKSkqM2NhYQ5KxYMGCgP1zc3P9P+T+8Y9/VLuWk5Pjv7Zs2bKA/desWeNvU18RAWt8/vnn/ufolVdeqXYt2PuSlfkJ6x06dMj//Pztb3+rdo17FxqqcqXD7XYb+/btq7d4476F+jS1eLP63pOdne2/9sEHHwTsn56e7m8T6I8PdWHDkkYoLS3VJ598IkkaO3ZswDYOh0Pnn3++JOmdd94JW2xovrZu3aoff/xRUu15k5CQoOHDh0uqmTcff/yxjhw5Umf/Hj166NRTTw3YPysry39cmZu/NGzYMCUmJgbsj+YhJibGf1xRUeE/Dva+ZHV+wnofffSR/zg1NdV/bHVucO+yl+uvv16HDx/Wgw8+qI4dO9bZlvsWzGT1vaeyf2JiooYOHRqwf9W4GptfFG+N8N1338nn80mS+vXrV2u7ymt79uypdwcvtHzffPON/7ghefPtt98G1X/z5s0B+3fq1EmdOnUK2DciIkJ9+vQJ2B/Nw5o1a/zH/fv39x8He1+yOj9hjbKyMu3YsUOPPPKIrrnmGklSz549dfHFF/vbWJ0b3Lvs48knn1R2drZGjRqlSZMm1due+xYa4+eff9YZZ5yhhIQExcbG6uSTT9bEiROr/Vysyup7T2X/U089VREREQH7d+rUyf9HjsbmF8VbI+zevdt/3L1791rbVb1WtQ9ap8bmTVFRkUpKSmr0b9eunWJjY+vt/8ucq/y+rrnr6g/rFRYWauHChZKk4cOHq3fv3v5rwd6XrM5PhFdMTIwcDodiYmKUkpKim2++WQcPHtTQoUOVnZ2t6Ohof1urc4N7lz3k5+drzpw5io2N1dKlSxvUh/sWGqO0tFQbN25UVFSUfD6fcnNztXz5co0cOVLTpk1TeXl5tfZW33vMvndRvDVCcXGx/zguLq7WdlWvVe2D1inYvKk8rqtv1eu/zLlg+8NaPp9P11xzjX766SfFxMTokUceqXY9VPkVbH/yyx66dOmizp07Kz4+3n9u5MiRevjhh3XiiSdWa2t1bpBb9jB9+nQdOnRIHo9HJ598coP6cN9CQ3Tr1k3z58/Xv/71Lx09elQHDhzwv+R21KhRkqRnnnlGt956a7V+Vt97zM4vijcAaMZuueUWrVq1SpL06KOPasCAARZHBDvbsWOH9uzZo5KSEu3du1dLlizRl19+qSFDhmjevHlWhwebef755/XGG2/oV7/6lWbOnGl1OGhhxowZI4/HowEDBvhfFRAREaFzzjlHb7/9ti655BJJ0mOPPabvv//eylDDiuKtESrfmCgdX8KtTdVrVfugdQo2byqP6+pb9fovcy7Y/rDO7Nmz/SttDz30kKZNm1ajTajyK9j+5Jf9dOrUSbNmzdJbb70lh8Ohv/71r/4/FEjW5wa51bzt3btXM2bMUEREhJ588klFRkY2uC/3LQTL6XRqyZIlko6/QiUzM9N/zep7j9n5RfHWCN26dfMf5+fn19qu6rWqfdA6NTZvkpKSlJCQUKP/wYMH/bsn1dX/lzlX+X1dc9fVH9a47bbb9MADD0iSlixZohkzZgRsF+x9yer8hPWGDBmiYcOGSZKeeOIJ/3mrc4N7V/N2++23a//+/brhhhvUp08flZSUVPs6duyYv+0vz3HfQij07NlTHTp0kCRt377df97qe4/Z9y6Kt0Y49dRT5XQe/yerupPNL1Ve69Kli9q3bx+W2NB8Vd3pqCF5c9pppwXVv2/fvgH779u3Tz///HPAvhUVFdqyZUvA/gi/OXPmaPHixZKk+++/X7Nmzaq1bbD3JavzE81D5Rvnf/jhB/85q3ODe1fzlpubK0l6/PHHlZiYWOOrcpMlSf5zt912myTuWzCX1feeyv7fffddtY/2qarq2I3NL4q3RoiLi/N/XsNbb70VsI1hGHr77bclHX+tLtCrVy//RgC15c3hw4f9n7X0y7wZNmyYf7ek2vrn5eXpu+++C9h/9OjR/uPa+n/yySf+N8ySt9aaPXu2/6Ug999/v+bMmVNn+2DvS1bnJ5qHyr9aV335jtW5wb2r5eK+hVDIyclRQUGBJCklJcV/3up7T2X/4uJirV27NmD/quM2Or8a9ZHeMJ566ilDkuFwOIzPPvusxvWXXnrJ/4np7777rgURIpxOOukkQ5Ixf/78OtvdddddhiQjLi7OyM3NrXF90aJFhiQjIiLC2Lp1a43rEydONCQZXbt2NQoLC2tcv/HGGw1JRmJionHgwIEa14cNG2ZIMgYOHGgcO3asxvWxY8cakoyTTjrJKC8vr/OxwDyzZs3y3z+WLFnS4H7B3peszk+Yp7y83PD5fHW2effddw2Hw2FIMm677bZq16zODe5d9jV//nz/fScQ7luoS333LZ/PZ1x66aWGJMPpdBpbtmypdt3Ke095ebn/98MLLrigRt9jx44ZAwYMMCQZw4YNq/NxBkLx1kher9fo37+/Icno3r27/4ZSUVFh/POf/zSSkpIMScbYsWMtjhRmOHDggPHzzz/7v5KTkw1Jxpw5c6qdLy4urtavsLDQ6NKliyHJOO2004wNGzYYhmEYZWVlxmOPPWZERUUZkowbb7wx4Lzbt2834uPjDUnG8OHDjW3bthmGYRglJSXGggUL/L94LVq0KGD/T
z75xIiIiDAkGePHjzd27dplGIZh7N+/338Dk2S89NJLofqnQiPNmTPH/zw8+OCDjeob7H3J6vyEeXJzc42BAwca//u//2vk5ORU+4Xoxx9/NBYuXOh/7tq3b2/89NNP1fpbnRvcu+yrvuKN+xbqkpuba5x55pk17l0VFRXGp59+apx33nn+/Ar0HFt976n6x4cbb7zR2L9/v2EYhrFr1y5j/Pjx/j8srF27ttH/NhRvTZCbm2v06NHD/6TExcUZMTEx/u8HDRrEX2laqMq/pNT3NXny5Bp9N2zYYLjdbn+bxMREw+Vy+b8fM2aMcfTo0VrnfuONN4y4uDh/+zZt2vhvLJKMqVOn1vmXqieffNKIjIz0t2/btq3/5tWQ1UOYJy8vz/88OJ1Oo3PnznV+LV68uMYYwd6XrM5PmCM3N7favSkqKsro0KGD/5eayq+UlBRj48aNAcewOje4d9lTfcWbYXDfQu1+ee+Kjo42OnToYERHR1c7P3XqVMPr9QYcw+p7T9X/Aw6Hw2jbtq3/+8jISOPJJ59s0r8NxVsTFRUVGfPmzTP69etnxMfHG4mJicYZZ5xhLFmyxCgrK7M6PJgkmOLNMAxjz549xq233mqccsopRkxMjNG2bVtj2LBhxpNPPmlUVFTUO/8PP/xgXH/99UaPHj38N7LRo0cbGRkZDYr/iy++MK666iqje/fuRlRUlNG5c2dj3LhxRnZ2dmP+GRBiv/whVd9XbT8wgr0vWZ2fCL2ysjLj5ZdfNv785z8bgwcPNrp162ZERUUZsbGxxoknnmhcfPHFxlNPPWWUlpbWOY7VucG9y34aUrwZBvctBFZaWmqkpaUZV111lXHaaacZHTt2NCIjI42EhASjT58+xrRp04yPP/643nGsvvdkZ2cb48aNMzp37mxERUUZ3bt3N6666ir/SnFTOAzDMAQAAAAAaNbYbRIAAAAAbIDiDQAAAABsgOINAAAAAGyA4g0AAAAAbIDiDQAAAABsgOINAAAAAGyA4g0AAAAAbIDiDQAAAABsgOINAAAAAGyA4g0AAAAAbCDS6gAAAGiMiooKrVy5UqtWrdJnn32mffv2qbS0VG3btlWvXr00fPhwXX311erXr5/VoYbEl19+qf/7v/9T27ZtNWPGDKvDAQBYyGEYhmF1EAAANMRnn32myZMna9u2bf5zLpdLiYmJKiwslM/n858fP368VqxYoaioKCtCDZn09HRNnTpVJ510knbs2GF1OAAAC/GySQCALWRmZmrEiBHatm2b3G63Fi5cqG3btunYsWPav3+/jh07pvXr1+v2229XUlKSXnnlFZWWllodNgAAIcPLJgEAzd7333+viRMnqqysTKeddprefvttnXDCCdXaREREaPDgwRo8eLDmzJmjadOmWRQtAADmYOUNANDs3XXXXSoqKlJMTIxeffXVGoXbL7Vv317/93//pzZt2lQ7v2fPHs2ZM0d9+/ZVfHy84uPj1bdvX912223au3dvwLHWrFkjh8Mhh8NR55yVbdasWVNn/x9++EHTpk1TcnKyoqOjdcIJJ+j6669Xfn5+wDGnTp0qScrLy/OPU/nl8XiqtX/77bc1fvx4nXDCCYqKilJSUpJOPvlkjRkzRkuWLNGBAwfqfAwAgOaNlTcAQLO2d+9eZWRkSJKuvvpq9erVq8F9qxZcH3zwgcaNG6fCwkJJUnx8vCTp22+/1bfffqunnnpKr7/+uoYNGxa64H/h/fff1+9//3uVlJQoMTFRPp9P+fn5euqpp7R69Wp9/vnn6t69u799586ddeTIERUVFcnpdKpjx47VxktISPAf33333Zo/f77/+7i4OBmGodzcXOXm5iorK0uDBw/WiBEjTHt8AABzsfIGAGjW3n//ff9GJJdeemmTxti5c6e/cDvttNP08ccfq6SkRCUlJfrwww/Vu3dvHTx4UJdccknAFbBQueyyy/S73/1O3333nYqKinT48GG99NJLSkxM1O7duzV37txq7ffs2aP/+Z//kSQlJydrz5491b5mz54t6fiq3IIFCyRJM2fOVH5+vg4fPqzi4mIVFhbqo48+0p/+9CclJiaa9tgAAOajeAMANGubN2/2Hw8aNKhJY9x3330qLCxUu3btlJ2draFDh/qvDR8+XO+++66SkpJ04MABLVy4MOiYa/OrX/1Kr776qvr06SNJioqK0n/913/p3nvvlSRlZGSovLy80eOuW7dOPp9PvXr10gMPPKBu3br5r7Vp00bDhg3To48+qjPOOCM0DwQAYAmKNwBAs7Z//37/cfv27Rvd3zAM/fOf/5Qk/fGPf1SXLl1qtDnhhBP0xz/+UZL04osvNjHS+t1xxx1yOmv+6L3kkkskSUeOHNH333/f6HHbtm0rSSouLtbhw4eDihEA0HxRvAEAWrTc3Fz/Rh2jRo2qtd3o0aMlHS8Wc3NzTYnlrLPOCni+6kpZUzYVGTJkiDp06KCffvpJZ511lh555BFt2bJFfJQrALQsFG8AgGbN7Xb7j5tS2Ozbt89/XHUzkF+quoNl1T6hVNt7ziIj/7N/mNfrbfS4bdu21YoVK9SxY0dt3rxZN998s0499VS1a9dOv//97/X88883aVwAQPNC8QYAaNb69u3rP960aZOFkTRvo0aNUm5urp599llNnjxZp5xyig4dOqTMzExdc801GjRokKmbsQAAzEfxBgBo1kaOHOl/n9irr77a6P6dOnXyH+/atavWdlWvVe1TdVXs6NGjAfseOnSo0XGZIT4+Xtdcc43S09O1bds27dq1S4sWLVJMTIx/RQ4AYF8UbwCAZq1z58667LLLJEkvvPCCtm3b1uC+hmEoJSXFv9FJdnZ2rW3fffddScdfppmSkuI/365dO//xzp07A/Zdt25dg2NqrMrCtSnvX+vevbtuu+02zZo1S5KUlZUV0tgAAOFF8QYAaPbuueceJSQk6MiRIxo/fny9L/87ePCgLrvsMh06dEgOh0NXXHGFJGnp0qXas2dPjfa7d+/W0qVLJUl/+MMfql3r1auXYmNjJUkrV66s0dfn85n68QJJSUmS5P9w8UDKysrqHKMy/kA7XQIA7IO7OACg2evVq5eee+45RUVFafPmzfrVr36lRYsW6YcffvC3qaio0KZNmzRv3jydfPLJeuWVV/zX7rjjDrVt21YHDhzQqFGjtHbtWv+1Tz75RKNGjVJhYaHat2+v22+/vdrcLpfLv/J333336Z///KeOHTsmSdq6dasuvfRSffXVV6Y99n79+kmSioqK/B958EuLFi3S2LFj9dxzz1V7+WdZWZn++c9/avHixZKkCy+80LQ4AQDmcxjsIwwAsIlPPvlEU6ZMqVa0RUVFKSEhQYWFhfL5fJIkh8OhK6+8UsuWLZPL5ZIkffDBB7rkkkv870+Lj4+XJP/norVt21avv/66hg8fXmPeXbt26ayzztLu3bslHS/oYmNjVVRUpMTERGVmZmrEiBGSpPfff99/LElr1qzRyJEjJdX90keHwxGwv3R8M5LKl3wmJib6XwY6Y8YMzZgxQx6PRwsWLPC3
j42NVWxsrA4ePOif89RTT9V7770X8HPuAAD2wMobAMA2hg4dqi1btmjFihW6+uqr1bNnT8XExKi4uFjt27fXsGHDdOedd+q7777TCy+84C/cJOm3v/2tvvvuO82aNUunnnqqfD6fDMPQqaeeqtmzZ+u7774LWLhJxz9GYN26dbruuuv8HzeQkJCgSZMmaePGjfrtb39r6uPOyMjQrbfeql69esnr9SovL095eXn+l1LecMMNeuKJJ/SHP/xB/fr1U1xcnIqKitSuXTsNHz5cDz/8sDZu3EjhBgA2x8obAAAAANgAK28AAAAAYAMUbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgA/8f68Y+ceePqbAAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "# 전체 글꼴 크기 설정\n", + "plt.rcParams.update({'font.size': 18})\n", + "\n", + "# CSV 파일 읽기\n", + "df = pd.read_csv('raw_data.csv')\n", + "\n", + "# ships_idx 별 전체 갯수 계산\n", + "total_counts = df['ships_idx'].value_counts().sort_index()\n", + "\n", + "# ships_idx 별 MDM=True 인 갯수 계산\n", + "mdm_true_counts = df[df['MDM'] == True]['ships_idx'].value_counts().sort_index()\n", + "\n", + "# 데이터프레임으로 합치기\n", + "summary_df = pd.DataFrame({\n", + " 'SD': total_counts,\n", + " 'PD': mdm_true_counts\n", + "}).fillna(0) # NaN 값을 0으로 대체\n", + "\n", + "# 시각화\n", + "fig, ax = plt.subplots(figsize=(10, 8))\n", + "\n", + "# Total Counts 먼저 그리기\n", + "summary_df['SD'].plot(kind='barh', ax=ax, color='orange', alpha=0.5, label='SD')\n", + "\n", + "# MDM=True Counts를 그 위에 겹쳐서 그리기\n", + "summary_df['PD'].plot(kind='barh', ax=ax, color='blue', alpha=0.7, label='PD')\n", + "\n", + "# y축 라벨 설정 (5 단위로만 표시)\n", + "y_labels = ax.get_yticks()\n", + "ax.set_yticks(np.arange(min(y_labels), max(y_labels)+1, 5))\n", + "ax.set_yticklabels([int(label) for label in np.arange(min(y_labels), max(y_labels)+1, 5)])\n", + "\n", + "# 그리드 추가\n", + "ax.grid(True)\n", + "\n", + "# 범례와 제목 설정\n", + "plt.legend(prop={'size': 18}) # 레전드 글꼴 크기 설정\n", + "plt.xlabel('Counts')\n", + "plt.ylabel('Ships')\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_preprocess/no_preprocess/copy_raw_data.py b/data_preprocess/no_preprocess/copy_raw_data.py new file mode 100644 index 0000000..c4d60da --- /dev/null +++ b/data_preprocess/no_preprocess/copy_raw_data.py @@ -0,0 +1,9 @@ +import shutil + +source_file = 'data_import/raw_data.csv' + +destination_file = 'data_preprocess/preprocessed_data.csv' + +shutil.copy(source_file, destination_file) + +print(f"File copied from {source_file} to {destination_file}") diff --git a/data_preprocess/rule_base_replacement/1.add_tag_name.ipynb b/data_preprocess/rule_base_replacement/1.add_tag_name.ipynb new file mode 100644 index 0000000..9b021a1 --- /dev/null +++ b/data_preprocess/rule_base_replacement/1.add_tag_name.ipynb @@ -0,0 +1,133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Changes made in ships_idx 1000: 251\n", + "Changes made in ships_idx 1001: 54\n", + "Changes made in ships_idx 1002: 46\n", + "Changes made in ships_idx 1003: 162\n", + "Changes made in ships_idx 1004: 8\n", + "Changes made in ships_idx 1005: 18\n", + "Changes made in ships_idx 1008: 22\n", + "Changes made in ships_idx 1009: 5\n", + "Changes made in ships_idx 1010: 135\n", + "Changes made in ships_idx 1011: 46\n", + "Changes made in ships_idx 1012: 2\n", + "Changes made in ships_idx 1013: 130\n", + "Changes made in ships_idx 1014: 46\n", + "Changes made in ships_idx 1015: 147\n", + "Changes made in ships_idx 1016: 191\n", + "Changes made in ships_idx 1017: 111\n", + "Changes made in ships_idx 1018: 682\n", + "Changes made in ships_idx 
1019: 2\n", + "Changes made in ships_idx 1020: 10\n", + "Changes made in ships_idx 1021: 2\n", + "Changes made in ships_idx 1022: 7\n", + "Changes made in ships_idx 1023: 7\n", + "Changes made in ships_idx 1024: 136\n", + "Changes made in ships_idx 1025: 10\n", + "Changes made in ships_idx 1026: 6\n", + "Changes made in ships_idx 1027: 6\n", + "Changes made in ships_idx 1028: 6\n", + "Changes made in ships_idx 1029: 132\n", + "Changes made in ships_idx 1030: 86\n", + "Changes made in ships_idx 1031: 55\n", + "Changes made in ships_idx 1032: 225\n", + "Changes made in ships_idx 1033: 147\n", + "Changes made in ships_idx 1035: 132\n", + "Changes made in ships_idx 1036: 12\n", + "Changes made in ships_idx 1037: 3\n", + "Changes made in ships_idx 1038: 8\n", + "Changes made in ships_idx 1039: 232\n", + "Changes made in ships_idx 1042: 20\n", + "Changes made in ships_idx 1043: 154\n", + "Changes made in ships_idx 1044: 121\n", + "Changes made in ships_idx 1045: 255\n", + "Changes made in ships_idx 1046: 6\n", + "Changes made in ships_idx 1047: 12\n", + "Changes made in ships_idx 1048: 82\n", + "Changes made in ships_idx 1049: 912\n", + "Changes made in ships_idx 1050: 46\n", + "Changes made in ships_idx 1051: 63\n", + "Total number of changes made: 4951\n", + "Updated data saved to raw_data_add_tag.csv\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Load the preprocessed data CSV file\n", + "file_path = '../../data_import/raw_data.csv' # Adjust this path to your actual file location\n", + "data = pd.read_csv(file_path, dtype=str)\n", + "\n", + "# Initialize a counter for the total number of changes\n", + "total_changes = 0\n", + "\n", + "# Initialize a dictionary to count changes per ships_idx\n", + "ships_idx_changes = {}\n", + "\n", + "# Process each group by ships_idx\n", + "for ships_idx, group in data.groupby('ships_idx'):\n", + " # Find duplicated tag_descriptions within the group\n", + " duplicated_descriptions = group['tag_description'].duplicated(keep=False)\n", + " \n", + " # Count how many tag_descriptions are duplicated within this ships_idx\n", + " num_changes = duplicated_descriptions.sum()\n", + "\n", + " # If there are any duplicates\n", + " if num_changes > 0:\n", + " # Increment the total changes count\n", + " total_changes += num_changes\n", + " \n", + " # Record the number of changes for this ships_idx\n", + " ships_idx_changes[ships_idx] = num_changes\n", + "\n", + " # Apply the concatenation of tag_name to tag_description for duplicates\n", + " data.loc[duplicated_descriptions & (data['ships_idx'] == ships_idx), 'tag_description'] = \\\n", + " data['tag_name'] + ' ' + data['tag_description']\n", + "\n", + "# Output the changes per ships_idx\n", + "for ships_idx, count in ships_idx_changes.items():\n", + " print(f\"Changes made in ships_idx {ships_idx}: {count}\")\n", + "\n", + "# Output the total number of changes\n", + "print(f\"Total number of changes made: {total_changes}\")\n", + "\n", + "# Optionally, save the updated DataFrame back to a CSV\n", + "output_file_path = 'raw_data_add_tag.csv'\n", + "data.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n", + "\n", + "print(f\"Updated data saved to {output_file_path}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_preprocess/rule_base_replacement/2.seperate_number.ipynb b/data_preprocess/rule_base_replacement/2.seperate_number.ipynb new file mode 100644 index 0000000..1f8fce1 --- /dev/null +++ b/data_preprocess/rule_base_replacement/2.seperate_number.ipynb @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updated data saved to raw_data_s.csv\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import re\n", + "\n", + "# Load the data_mapping CSV file\n", + "data_mapping_file_path = '../../data_import/raw_data.csv' # Adjust this path to your actual file location\n", + "# data_mapping_file_path = 'raw_data_add_tag.csv' # Adjust this path to your actual file location\n", + "data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n", + "\n", + "# Backup the original tag_description\n", + "data_mapping['org_tag_description'] = data_mapping['tag_description']\n", + "\n", + "# Ensure all values in the 'tag_description' column are strings\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[()]', ' ', regex=True)\n", + "\n", + "# Function to find tokens containing numbers\n", + "def find_tokens_with_numbers(description):\n", + " tokens = description.split() # Tokenize by spaces\n", + " number_tokens = [token for token in tokens if re.search(r'\\d', token)]\n", + " return number_tokens\n", + "\n", + "# Function to process tokens\n", + "def process_token(token):\n", + " # Step 1: Replace '_' or '-' adjacent to numbers with spaces\n", + " token = re.sub(r'(_|-)(?=\\d)', ' ', token)\n", + " token = re.sub(r'(?<=\\d)(_|-)', ' ', token)\n", + "\n", + " # Step 2: Insert spaces between letters and numbers where no separator exists\n", + " token = re.sub(r'([A-Za-z])(\\d+)', r'\\1 \\2', token)\n", + " token = re.sub(r'(\\d+)([A-Za-z])', r'\\1 \\2', token)\n", + "\n", + " # Step 3: Handle cases like \"NO.1\" or \"No.1\" to become \"No. 1\"\n", + " token = re.sub(r'([A-Za-z]+)\\.(\\d+)', r'\\1. 
\\2', token)\n", + "\n", + " # Clean multiple spaces and strip\n", + " token = re.sub(r'\\s+', ' ', token).strip()\n", + " return token\n", + "\n", + "# Apply the process to each row in the 'tag_description' column\n", + "for index, row in data_mapping.iterrows():\n", + " original_description = row['tag_description']\n", + " number_tokens = find_tokens_with_numbers(original_description)\n", + "\n", + " # Process each token containing numbers\n", + " processed_tokens = [process_token(token) for token in number_tokens]\n", + "\n", + " # Replace the original tokens with processed tokens in the tag_description\n", + " new_description = original_description\n", + " for original_token, processed_token in zip(number_tokens, processed_tokens):\n", + " new_description = new_description.replace(original_token, processed_token)\n", + "\n", + " # Update the data_mapping with the modified description\n", + " data_mapping.at[index, 'tag_description'] = new_description\n", + "\n", + "# Save the updated data_mapping to a new CSV file\n", + "output_file_path = 'raw_data_s.csv'\n", + "data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n", + "\n", + "print(f\"Updated data saved to {output_file_path}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_preprocess/rule_base_replacement/3.replacement.ipynb b/data_preprocess/rule_base_replacement/3.replacement.ipynb new file mode 100644 index 0000000..8aa43bf --- /dev/null +++ b/data_preprocess/rule_base_replacement/3.replacement.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updated data saved to ../preprocessed_data.csv\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import re\n", + "\n", + "# Load the data_mapping CSV file\n", + "data_mapping_file_path = 'raw_data_s.csv' # Adjust this path to your actual file location\n", + "data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n", + " \n", + " # Ensure all values in the 'tag_description' column are strings\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[-]', ' ', regex=True)\n", + "\n", + "# Initial replacement mapping\n", + "initial_replacements = {\n", + " \"MGE\": \"G/E\",\n", + " \"GEN.\": \"G/E\",\n", + " \"GEN\": \"G/E\",\n", + " \"GE\": \"G/E\",\n", + " \"G_E\": \"G/E\",\n", + " \"ME\": \"M/E\",\n", + " \"M_E\": \"M/E\",\n", + " \"S_G\": \"S/G\",\n", + " \"T_C\": \"T/C\",\n", + " \"TC\": \"T/C\",\n", + " \"L_O\": \"L.O\",\n", + " \"LO\": \"L.O\",\n", + " \"F_O\": \"F.O\",\n", + " \"FO\": \"F.O\",\n", + " \"D_G\": \"D/G\",\n", + " \"DG\": \"D/G\",\n", + " \"PP\": \"P/P\"\n", + "}\n", + "\n", + "# Second replacement mapping\n", + "second_replacements = {\n", + " \"_G/E\": \" G/E\",\n", + " \"G/E_\": \"G/E \",\n", + " \"_M/E\": \" M/E\",\n", + " \"M/E_\": \"M/E \",\n", + " \"_S/G\": \" S/G\",\n", + " \"S/G_\": \"S/G \",\n", + " \"_T/C\": \" T/C\",\n", + " \"T/C_\": \"T/C \",\n", + " \"_L.O\": \" 
L.O\",\n", + " \"L.O_\": \"L.O \",\n", + " \"_F.O\": \" F.O\",\n", + " \"F.O_\": \"F.O \",\n", + " \"_D/G\": \" D/G\",\n", + " \"D/G_\": \"D/G \",\n", + " \"DG_\": \"DG \"\n", + "}\n", + "\n", + "# Function to separate numbers from text in a token\n", + "def separate_numbers_from_text(description):\n", + " # This regex pattern finds occurrences where text is followed by numbers or vice versa\n", + " return re.sub(r'(\\d+)(\\D)', r'\\1 \\2', re.sub(r'(\\D)(\\d+)', r'\\1 \\2', description))\n", + "\n", + "# Function to perform replacements using tokens\n", + "def replace_tokens(description, replacements):\n", + " tokens = description.split() # Tokenize by spaces\n", + " tokens = [replacements.get(token, token) for token in tokens] # Replace based on the dictionary\n", + " return ' '.join(tokens)\n", + "\n", + "# Function to perform replacements for substrings\n", + "def replace_substrings(description, replacements):\n", + " for old, new in replacements.items():\n", + " description = description.replace(old, new)\n", + " return description\n", + "\n", + "# Separate numbers from text before applying replacements\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].apply(separate_numbers_from_text)\n", + "\n", + "# Apply initial replacements\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_tokens, replacements=initial_replacements)\n", + "\n", + "# Apply second replacements as substrings\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_substrings, replacements=second_replacements)\n", + "\n", + "# Save the updated data_mapping to a new CSV file\n", + "output_file_path = '../preprocessed_data.csv'\n", + "data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n", + "\n", + "print(f\"Updated data saved to {output_file_path}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_preprocess/split_data.ipynb b/data_preprocess/split_data.ipynb new file mode 100644 index 0000000..11fd087 --- /dev/null +++ b/data_preprocess/split_data.ipynb @@ -0,0 +1,441 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final Group Allocation:\n", + "Group 1: Ships_idx = [1003, 1028, 1049, 1044, 1020, 1041, 1045, 1036, 1005, 1006], PD type = 537, PD = 2006, SD = 14719\n", + "Group 2: Ships_idx = [1025, 1035, 1021, 1026, 1002, 1030, 1024, 1037, 1038, 1029], PD type = 537, PD = 1958, SD = 8173\n", + "Group 3: Ships_idx = [1016, 1046, 1031, 1009, 1048, 1043, 1042, 1019, 1018, 1007, 1000], PD type = 534, PD = 2079, SD = 15310\n", + "Group 4: Ships_idx = [1004, 1032, 1039, 1014, 1040, 1017, 1022, 1051, 1008, 1050, 1013], PD type = 532, PD = 2066, SD = 12882\n", + "Group 5: Ships_idx = [1047, 1015, 1027, 1010, 1011, 1001, 1034, 1023, 1012, 1033], PD type = 531, PD = 2064, SD = 10988\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from collections import defaultdict\n", + "\n", + "# Function to calculate the number of unique combinations and total count for each ship\n", + "def 
calculate_ship_count(group):\n", + " ship_count = group.groupby('ships_idx')['thing_property'].agg(['nunique', 'size']).reset_index()\n", + " ship_count.columns = ['ships_idx', 'comb_count', 'total_count']\n", + " return ship_count\n", + "\n", + "# Function to calculate the combination count and total count for a group\n", + "def calculate_group_count(group):\n", + " comb_count = group['thing_property'].nunique()\n", + " total_count = group['thing_property'].size\n", + " return comb_count, total_count\n", + "\n", + "# Function to calculate the increase in combination count when a ship is added to a group\n", + "def calculate_comb_count_increase(groups, g, ship_idx, mdm):\n", + " temp_groups = defaultdict(list, {k: v.copy() for k, v in groups.items()})\n", + " temp_groups[g].append(ship_idx)\n", + " \n", + " group_ships = temp_groups[g]\n", + " group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n", + " \n", + " new_comb_count, _ = calculate_group_count(group_data)\n", + " \n", + " current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n", + " current_comb_count, _ = calculate_group_count(current_group_data)\n", + " \n", + " increase = new_comb_count - current_comb_count\n", + " \n", + " return increase\n", + "\n", + "# Function to calculate the increase in total count when a ship is added to a group\n", + "def calculate_total_count_increase(groups, g, ship_idx, mdm):\n", + " temp_groups = defaultdict(list, {k: v.copy() for k, v in groups.items()})\n", + " temp_groups[g].append(ship_idx)\n", + " \n", + " group_ships = temp_groups[g]\n", + " group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n", + " \n", + " _, new_total_count = calculate_group_count(group_data)\n", + " \n", + " current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n", + " _, current_total_count = calculate_group_count(current_group_data)\n", + " \n", + " increase = new_total_count - current_total_count\n", + " \n", + " return increase\n", + "\n", + "# Function to find the ship that will bring the total count closest to the target\n", + "def find_closest_total_count_ship(groups, g, remaining_ships, mdm, target_total_count):\n", + " total_count_differences = []\n", + "\n", + " current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n", + " _, current_total_count = calculate_group_count(current_group_data)\n", + "\n", + " for ship_idx in remaining_ships:\n", + " increase = calculate_total_count_increase(groups, g, ship_idx, mdm)\n", + " new_total_count = current_total_count + increase\n", + " difference = abs(target_total_count - new_total_count)\n", + " total_count_differences.append((ship_idx, difference, increase))\n", + "\n", + " if not total_count_differences:\n", + " return None, 0\n", + " \n", + " closest_ship = min(total_count_differences, key=lambda x: x[1])\n", + " selected_ship_idx, _, selected_increase = closest_ship\n", + "\n", + " return selected_ship_idx, selected_increase\n", + "\n", + "# Function to find the ship that gives the maximum increase in combination count\n", + "def find_max_increase_ship(groups, g, remaining_ships, mdm):\n", + " comb_count_increase = []\n", + "\n", + " for ship_idx in remaining_ships:\n", + " increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)\n", + " comb_count_increase.append((ship_idx, increase))\n", + "\n", + " max_increase_ship = max(comb_count_increase, key=lambda x: x[1])\n", + " selected_ship_idx, max_increase = max_increase_ship\n", + " \n", + " return selected_ship_idx, max_increase\n", + "\n", + "# Function to find the ship that will bring 
the combination count closest to the target\n",
+ "def find_closest_comb_count_ship(groups, g, remaining_ships, mdm, target_comb_count):\n",
+ "    comb_count_differences = []\n",
+ "\n",
+ "    current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n",
+ "    current_comb_count, _ = calculate_group_count(current_group_data)\n",
+ "\n",
+ "    for ship_idx in remaining_ships:\n",
+ "        increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)\n",
+ "        new_comb_count = current_comb_count + increase\n",
+ "        difference = abs(target_comb_count - new_comb_count)\n",
+ "        comb_count_differences.append((ship_idx, difference, increase))\n",
+ "\n",
+ "    if not comb_count_differences:\n",
+ "        return None, 0\n",
+ "\n",
+ "    closest_ship = min(comb_count_differences, key=lambda x: x[1])\n",
+ "    selected_ship_idx, _, selected_increase = closest_ship\n",
+ "\n",
+ "    return selected_ship_idx, selected_increase\n",
+ "\n",
+ "# Function to find the group with the maximum combination count\n",
+ "def find_group_with_max_comb_count(groups, mdm):\n",
+ "    max_comb_count = -1\n",
+ "    max_group_idx = -1\n",
+ "\n",
+ "    for g in range(len(groups)):\n",
+ "        group_ships = groups[g]\n",
+ "        group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n",
+ "        comb_count, _ = calculate_group_count(group_data)\n",
+ "\n",
+ "        if comb_count > max_comb_count:\n",
+ "            max_comb_count = comb_count\n",
+ "            max_group_idx = g\n",
+ "\n",
+ "    return max_group_idx, max_comb_count\n",
+ "\n",
+ "# Function to find the group with the maximum total count\n",
+ "def find_group_with_max_total_count(groups, mdm):\n",
+ "    max_total_count = -1\n",
+ "    max_group_idx = -1\n",
+ "\n",
+ "    for g in range(len(groups)):\n",
+ "        group_ships = groups[g]\n",
+ "        group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n",
+ "        _, total_count = calculate_group_count(group_data)\n",
+ "\n",
+ "        if total_count > max_total_count:\n",
+ "            max_total_count = total_count\n",
+ "            max_group_idx = g\n",
+ "\n",
+ "    return max_group_idx, max_total_count\n",
+ "\n",
+ "# Load the CSV file\n",
+ "data_file_path = 'preprocessed_data.csv'\n",
+ "data = pd.read_csv(data_file_path)\n",
+ "\n",
+ "# Filter the data where MDM is True\n",
+ "mdm_true = data[data['MDM'] == True].copy()  # use .copy() to create an explicit copy\n",
+ "mdm_all = data.copy()\n",
+ "\n",
+ "# Create a new column combining 'thing' and 'property'\n",
+ "mdm_true.loc[:, 'thing_property'] = mdm_true['thing'] + '_' + mdm_true['property']\n",
+ "mdm_all.loc[:, 'thing_property'] = mdm_all['thing'] + '_' + mdm_all['property']\n",
+ "\n",
+ "# Initial setup for groups\n",
+ "ship_count = calculate_ship_count(mdm_true)\n",
+ "num_groups = 5\n",
+ "groups = defaultdict(list)\n",
+ "\n",
+ "# Sort ships by combination count in descending order\n",
+ "sorted_ships = ship_count.sort_values(by='comb_count', ascending=False)\n",
+ "\n",
+ "# Assign the first 5 ships to the groups\n",
+ "for i in range(num_groups):\n",
+ "    groups[i].append(sorted_ships.iloc[i]['ships_idx'])\n",
+ "\n",
+ "remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values\n",
+ "\n",
+ "# Allocate remaining ships to the groups\n",
+ "while len(remaining_ships) > 0:\n",
+ "    group_comb_counts = []\n",
+ "    for g in range(num_groups):\n",
+ "        group_ships = groups[g]\n",
+ "        group_data = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n",
+ "        comb_count, _ = calculate_group_count(group_data)\n",
+ "        group_comb_counts.append((g, comb_count))\n",
+ "\n",
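+ "    # Greedy allocation: after the sort below, the group currently poorest in\n",
+ "    # unique thing_property combinations picks first and takes the ship with the\n",
+ "    # largest marginal combination gain; every other group instead takes the ship\n",
+ "    # that brings it closest to the current leader's combination count. Groups\n",
+ "    # that cannot gain any new combinations are deferred to the total-count\n",
+ "    # balancing pass further down.\n",
+ "    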
group_comb_counts.sort(key=lambda x: x[1])\n", + " \n", + " remaining_group = []\n", + " for g, _ in group_comb_counts:\n", + " if len(remaining_ships) == 0:\n", + " break\n", + " \n", + " if group_comb_counts.index((g, _)) == 0:\n", + " selected_ship_idx, comb_increase = find_max_increase_ship(groups, g, remaining_ships, mdm_true)\n", + " \n", + " else:\n", + " max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true)\n", + " selected_ship_idx, comb_increase = find_closest_comb_count_ship(groups, g, remaining_ships, mdm_true, max_comb_count)\n", + "\n", + " if comb_increase == 0:\n", + " remaining_group.append(g)\n", + " else:\n", + " groups[g].append(selected_ship_idx)\n", + " remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]\n", + "\n", + " for g in remaining_group:\n", + " if len(remaining_ships) == 0:\n", + " break\n", + " max_group_idx, max_total_count = find_group_with_max_total_count(groups, mdm_true)\n", + " selected_ship_idx, count_increase = find_closest_total_count_ship(groups, g, remaining_ships, mdm_true, max_total_count)\n", + " if selected_ship_idx is not None:\n", + " groups[g].append(selected_ship_idx)\n", + " remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]\n", + "\n", + "# Calculate comb_count for each group and store it in a list\n", + "group_comb_counts = []\n", + "for g in range(num_groups):\n", + " group_ships = groups[g]\n", + " group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n", + " comb_count, total_count = calculate_group_count(group_data_true)\n", + "\n", + " # Calculate total count including MDM=False\n", + " group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]\n", + " _, total_count_all = calculate_group_count(group_data_all)\n", + " \n", + " group_comb_counts.append((g, comb_count, total_count_all))\n", + "\n", + "# Sort the groups by comb_count in descending order\n", + "group_comb_counts.sort(key=lambda x: x[1], reverse=True)\n", + "\n", + "# Reorder the groups dictionary based on the sorted order\n", + "sorted_groups = defaultdict(list)\n", + "for i, (g, _, _) in enumerate(group_comb_counts):\n", + " sorted_groups[i] = groups[g]\n", + "\n", + "# Final output of group allocation\n", + "print(\"Final Group Allocation:\")\n", + "for g in range(num_groups):\n", + " group_ships = sorted_groups[g]\n", + " group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n", + " comb_count, total_count = calculate_group_count(group_data_true)\n", + "\n", + " # Calculate total count including MDM=False\n", + " group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]\n", + " _, total_count_all = calculate_group_count(group_data_all)\n", + "\n", + " print(f\"Group {g + 1}: Ships_idx = {group_ships}, PD type = {comb_count}, PD = {total_count}, SD = {total_count_all}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CSV file has been generated: 'combined_group_allocation.csv'\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import GroupKFold\n", + "\n", + "# Prepare data for custom group allocation (BGKF)\n", + "comb_counts = []\n", + "total_counts = []\n", + "ship_counts = []\n", + "custom_results = []\n", + "\n", + "for g in range(num_groups):\n", + " group_ships = groups[g]\n", + " group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n", + " comb_count, total_count = 
calculate_group_count(group_data_true)\n", + " \n", + " # Calculate total count including MDM=False\n", + " group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]\n", + " _, total_count_all = calculate_group_count(group_data_all)\n", + " \n", + " custom_results.append({\n", + " 'Group': g + 1,\n", + " 'Allocation': 'BGKF',\n", + " 'Comb_count': comb_count,\n", + " 'Total_count': total_count,\n", + " 'Total_count_all': total_count_all,\n", + " 'Ship_count': len(group_ships),\n", + " 'Ships_idx': list(group_ships)\n", + " })\n", + "\n", + "# Sort the custom group allocation by comb_count in descending order\n", + "custom_results.sort(key=lambda x: x['Comb_count'], reverse=True)\n", + "\n", + "# Adjust group numbers after sorting\n", + "for i, result in enumerate(custom_results):\n", + " result['Group'] = i + 1\n", + "\n", + "# Prepare data for GroupKFold allocation (GKF)\n", + "gkf = GroupKFold(n_splits=5)\n", + "gkf_results = []\n", + "\n", + "for i, (train_idx, test_idx) in enumerate(gkf.split(mdm_true, groups=mdm_true['ships_idx'])):\n", + " test_group = mdm_true.iloc[test_idx]\n", + " comb_count, total_count = calculate_group_count(test_group)\n", + " \n", + " # Calculate total count including MDM=False\n", + " test_group_ships = test_group['ships_idx'].unique()\n", + " test_group_all = mdm_all[mdm_all['ships_idx'].isin(test_group_ships)]\n", + " _, total_count_all = calculate_group_count(test_group_all)\n", + " \n", + " gkf_results.append({\n", + " 'Group': i + 1,\n", + " 'Allocation': 'GKF',\n", + " 'Comb_count': comb_count,\n", + " 'Total_count': total_count,\n", + " 'Total_count_all': total_count_all,\n", + " 'Ship_count': test_group['ships_idx'].nunique(),\n", + " 'Ships_idx': list(test_group['ships_idx'].unique())\n", + " })\n", + "\n", + "# Sort the GKF allocation by comb_count in descending order\n", + "gkf_results.sort(key=lambda x: x['Comb_count'], reverse=True)\n", + "\n", + "# Adjust group numbers after sorting\n", + "for i, result in enumerate(gkf_results):\n", + " result['Group'] = i + 1\n", + "\n", + "# Combine BGKF and GKF results into one DataFrame\n", + "combined_results = custom_results + gkf_results\n", + "combined_df = pd.DataFrame(combined_results)\n", + "\n", + "# Output the combined results to a single CSV file\n", + "combined_df.to_csv('combined_group_allocation.csv', index=False)\n", + "\n", + "print(\"CSV file has been generated: 'combined_group_allocation.csv'\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Group 1 datasets saved in dataset/1\n", + "Group 2 datasets saved in dataset/2\n", + "Group 3 datasets saved in dataset/3\n", + "Group 4 datasets saved in dataset/4\n", + "Group 5 datasets saved in dataset/5\n" + ] + } + ], + "source": [ + "import os\n", + "import pandas as pd\n", + "from sklearn.model_selection import KFold\n", + "\n", + "def save_datasets_for_group(groups, mdm, data, output_dir='dataset', n_splits=4):\n", + " for i in range(len(groups)):\n", + " group_folder = os.path.join(output_dir, str(i + 1))\n", + " os.makedirs(group_folder, exist_ok=True)\n", + " \n", + " # Create the test dataset by including only group i\n", + " test_group_ships = groups[i]\n", + " test_data = mdm[mdm['ships_idx'].isin(test_group_ships)]\n", + " \n", + " # Extract corresponding entries from the external test dataset\n", + " test_all_data = data[data['ships_idx'].isin(test_group_ships)]\n", + " \n", + " # Create the train dataset by 
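excluding group i\n",
+ "        # Leave-one-group-out: group i is held out as the test fold, while the\n",
+ "        # remaining four groups are pooled and then re-split 3:1 into train and\n",
+ "        # valid by the inner KFold below (fixed random_state for reproducibility).\n",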
+ "        train_group_ships = []\n",
+ "        for g in range(len(groups)):\n",
+ "            if g != i:\n",
+ "                train_group_ships.extend(groups[g])\n",
+ "        train_data = mdm[mdm['ships_idx'].isin(train_group_ships)]\n",
+ "\n",
+ "        # Use KFold to split train_data into train and valid datasets\n",
+ "        kf_inner = KFold(n_splits=n_splits, shuffle=True, random_state=42)\n",
+ "        train_idx_inner, valid_idx_inner = next(kf_inner.split(train_data))\n",
+ "\n",
+ "        final_train_data = train_data.iloc[train_idx_inner]\n",
+ "        valid_data = train_data.iloc[valid_idx_inner]\n",
+ "\n",
+ "        # Combine train and valid data to create train_all\n",
+ "        train_all_data = pd.concat([final_train_data, valid_data])\n",
+ "\n",
+ "        # Save datasets to CSV files\n",
+ "        train_file_path = os.path.join(group_folder, 'train.csv')\n",
+ "        valid_file_path = os.path.join(group_folder, 'valid.csv')\n",
+ "        test_file_path = os.path.join(group_folder, 'test.csv')\n",
+ "        train_all_file_path = os.path.join(group_folder, 'train_all.csv')\n",
+ "\n",
+ "        final_train_data.to_csv(train_file_path, index=False, encoding='utf-8-sig')\n",
+ "        valid_data.to_csv(valid_file_path, index=False, encoding='utf-8-sig')\n",
+ "        # test.csv deliberately holds all rows (MDM=True and False) for the held-out\n",
+ "        # ships; the MDM-only subset in test_data is recomputable and is not saved.\n",
+ "        test_all_data.to_csv(test_file_path, index=False, encoding='utf-8-sig')\n",
+ "        train_all_data.to_csv(train_all_file_path, index=False, encoding='utf-8-sig')\n",
+ "\n",
+ "        print(f\"Group {i + 1} datasets saved in {group_folder}\")\n",
+ "\n",
+ "# Example usage:\n",
+ "save_datasets_for_group(groups, mdm_true, data, n_splits=4)\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "torch",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
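Functionally, the custom allocation above is a balanced variant of grouped K-fold. For comparison, a minimal, self-contained sketch (toy data, illustrative values only) of scikit-learn's stock GroupKFold, which gives the same hard guarantee that no ship straddles train and test, just without the balancing:

```python
import pandas as pd
from sklearn.model_selection import GroupKFold

# Toy frame: two rows per ship, three ships
df = pd.DataFrame({
    'ships_idx': [1000, 1000, 1001, 1001, 1002, 1002],
    'tag_description': ['a', 'b', 'c', 'd', 'e', 'f'],
})

gkf = GroupKFold(n_splits=3)
for fold, (train_idx, test_idx) in enumerate(gkf.split(df, groups=df['ships_idx'])):
    train_ships = set(df.iloc[train_idx]['ships_idx'])
    test_ships = set(df.iloc[test_idx]['ships_idx'])
    assert train_ships.isdisjoint(test_ships)  # a ship never appears on both sides
    print(fold, sorted(test_ships))
```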
diff --git a/evaluation/check_accuracy.ipynb b/evaluation/check_accuracy.ipynb
new file mode 100644
index 0000000..0a5773e
--- /dev/null
+++ b/evaluation/check_accuracy.ipynb
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Performance for test_s.csv:\n",
+ "TP: 1724, TN: 11907, FP: 919, FN: 272\n",
+ "Precision: 0.6523, Recall: 0.8637, Accuracy: 0.9196\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Set the group number\n",
+ "group_number = 1  # Change this to the desired group number\n",
+ "\n",
+ "# File path for the selection result of the chosen group\n",
+ "test_s_path = f'../post_process/0.result/{group_number}/test_s.csv'\n",
+ "\n",
+ "# Load the CSV file\n",
+ "test_s_csv = pd.read_csv(test_s_path, low_memory=False)\n",
+ "test_s_csv.fillna('', inplace=True)\n",
+ "\n",
+ "def evaluate_performance(test_csv):\n",
+ "    # Initialize counters for TP, TN, FP, FN\n",
+ "    TP = 0\n",
+ "    TN = 0\n",
+ "    FP = 0\n",
+ "    FN = 0\n",
+ "\n",
+ "    # Iterate over the DataFrame rows\n",
+ "    for index, row in test_csv.iterrows():\n",
+ "        # True Positive (TP): s_correct is True and MDM is True\n",
+ "        if row['s_correct'] and row['MDM']:\n",
+ "            TP += 1\n",
+ "        # True Negative (TN): s_thing is null and MDM is False\n",
+ "        elif row['s_thing'] == '' and not row['MDM']:\n",
+ "            TN += 1\n",
+ "        # False Positive (FP):\n",
+ "        # 1) s_thing is not null and MDM is False\n",
+ "        # OR\n",
+ "        # 2) s_thing is not null and s_correct is False and MDM is True\n",
+ "        elif (row['s_thing'] != '' and not row['MDM']) or (row['s_thing'] != '' and not row['s_correct'] and row['MDM']):\n",
+ "            FP += 1\n",
+ "        # False Negative (FN): s_thing is null and MDM is True\n",
+ "        elif row['s_thing'] == '' and row['MDM']:\n",
+ "            FN += 1\n",
+ "\n",
+ "    # Calculate total\n",
+ "    total = TP + TN + FP + FN\n",
+ "\n",
+ "    # Calculate Precision, Recall, and Accuracy\n",
+ "    precision = TP / (TP + FP) if (TP + FP) > 0 else 0\n",
+ "    recall = TP / (TP + FN) if (TP + FN) > 0 else 0\n",
+ "    accuracy = (TP + TN) / total if total > 0 else 0\n",
+ "\n",
+ "    return TP, TN, FP, FN, precision, recall, accuracy\n",
+ "\n",
+ "# Evaluate the dataset\n",
+ "tp_s_results = evaluate_performance(test_s_csv)\n",
+ "\n",
+ "# Print the results\n",
+ "print(\"Performance for test_s.csv:\")\n",
+ "print(f\"TP: {tp_s_results[0]}, TN: {tp_s_results[1]}, FP: {tp_s_results[2]}, FN: {tp_s_results[3]}\")\n",
+ "print(f\"Precision: {tp_s_results[4]:.4f}, Recall: {tp_s_results[5]:.4f}, Accuracy: {tp_s_results[6]:.4f}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/post_process/tfidf_class/1.make_sdl_class_document.py b/post_process/tfidf_class/1.make_sdl_class_document.py
new file mode 100644
index 0000000..377a41c
--- /dev/null
+++ b/post_process/tfidf_class/1.make_sdl_class_document.py
+import pandas as pd
+import re
+import os
+
+# Loop through group numbers from 1 to 5
+for group_number in range(1, 6):
+
+    # Path to the train_all file
+    train_all_path = f'data_preprocess/dataset/{group_number}/train_all.csv'
+
+    # Read the train_all data
+    train_all_csv = pd.read_csv(train_all_path, low_memory=False)
+
+    # Concatenate tag_description based on the combination of thing and property
+    tag_description_concatenated = train_all_csv.groupby(['thing', 'property'])['tag_description'].apply(lambda x: ' '.join(x)).reset_index()
+
+    # Concatenate tag_name based on the combination of thing and property
+    tag_name_concatenated = train_all_csv.groupby(['thing', 'property'])['tag_name'].apply(lambda x: ' '.join(x)).reset_index()
+
+    # Calculate mapping_count
+    mapping_count = train_all_csv.groupby(['thing', 'property']).size().reset_index(name='mapping_count')
+
+    # Merge the three DataFrames: mapping_count, tag_description_concatenated, and tag_name_concatenated
+    thing_property_grouped = pd.merge(mapping_count, tag_description_concatenated, on=['thing', 'property'])
+    thing_property_grouped = pd.merge(thing_property_grouped, tag_name_concatenated, on=['thing', 'property'])
+
+    # Calculate td_token_count by splitting tag_description on whitespace (r'\S+')
+    thing_property_grouped['td_token_count'] = thing_property_grouped['tag_description'].apply(lambda x: len(re.findall(r'\S+', x)))
+
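Each (thing, property) class is thus flattened into one synthetic "document". Downstream, 2.classify_by_tfidf.ipynb retrieves the nearest such document for every test description; a minimal, self-contained sketch of that lookup on toy data (illustrative names only):

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy class documents: concatenated descriptions per (thing, property)
class_docs = pd.DataFrame({
    'thing': ['GE1', 'ME1'],
    'property': ['Load', 'RPM'],
    'tag_description': ['G/E LOAD NO. 1 G/E LOAD', 'M/E RPM M/E REV'],
})
queries = pd.Series(['NO. 2 G/E LOAD', 'M/E SPEED'])

vec = TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 2))
vec.fit(pd.concat([class_docs['tag_description'], queries]))

sims = cosine_similarity(vec.transform(queries), vec.transform(class_docs['tag_description']))
best = sims.argmax(axis=1)  # index of the most similar class document per query
print(class_docs.iloc[best][['thing', 'property']].values, sims.max(axis=1))
```

+    # Create pattern by replacing digits in 'thing' and 'property' with '#'
+    thing_property_grouped['pattern'] = thing_property_grouped['thing'].str.replace(r'\d', '#', regex=True) + " " + thing_property_grouped['property'].str.replace(r'\d', '#', 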
regex=True) + + # Calculate the total number of unique thing_property combinations + total_thing_property_count = thing_property_grouped.shape[0] + + # Specify the output path + output_path = f'post_process/tfidf_class/0.class_document/{group_number}/sdl_class_rdoc.csv' + + # Create the directory if it doesn't exist + output_dir = os.path.dirname(output_path) + os.makedirs(output_dir, exist_ok=True) + + # Save the result to the CSV file + thing_property_grouped.to_csv(output_path, index=False, encoding='utf-8-sig') + + print(f"Concatenated data saved to {output_path}") + print(f"Total number of unique thing_property combinations: {total_thing_property_count}") diff --git a/post_process/tfidf_class/2.classify_by_tfidf.ipynb b/post_process/tfidf_class/2.classify_by_tfidf.ipynb new file mode 100644 index 0000000..ef2344d --- /dev/null +++ b/post_process/tfidf_class/2.classify_by_tfidf.ipynb @@ -0,0 +1,134 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy (MDM=True) for Group 1: 79.41%\n", + "Accuracy (MDM=True) for Group 2: 79.32%\n", + "Accuracy (MDM=True) for Group 3: 82.49%\n", + "Accuracy (MDM=True) for Group 4: 85.61%\n", + "Accuracy (MDM=True) for Group 5: 79.72%\n", + "Average Accuracy (MDM=True) across all groups: 81.31%\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "from tqdm import tqdm\n", + "import os\n", + "\n", + "# Initialize a list to store the accuracies for each group\n", + "accuracies = []\n", + "\n", + "# Loop through group numbers from 1 to 5\n", + "for group_number in range(1, 6):\n", + " \n", + " # Load the CSV files from the specified group\n", + " sdl_class_rdoc_path = f'0.class_document/{group_number}/sdl_class_rdoc.csv'\n", + " test_path = f'../../data_preprocess/dataset/{group_number}/test.csv'\n", + " \n", + " # Check if test file exists, if not, skip this iteration\n", + " if not os.path.exists(test_path):\n", + " print(f\"test file for Group {group_number} does not exist. 
Skipping...\")\n", + " continue\n", + " \n", + " sdl_class_rdoc_csv = pd.read_csv(sdl_class_rdoc_path, low_memory=False)\n", + " test_csv = pd.read_csv(test_path, low_memory=False)\n", + " \n", + " # Replace NaN values with empty strings in relevant columns\n", + " sdl_class_rdoc_csv['tag_description'] = sdl_class_rdoc_csv['tag_description'].fillna('')\n", + " test_csv['tag_description'] = test_csv['tag_description'].fillna('')\n", + " \n", + " # Initialize new columns in test_csv\n", + " test_csv['c_thing'] = ''\n", + " test_csv['c_property'] = ''\n", + " test_csv['c_score'] = ''\n", + " test_csv['c_duplicate'] = 0 # Initialize c_duplicate to store duplicate counts\n", + " \n", + " # Combine both sdl_class_rdoc and test CSVs tag_descriptions for TF-IDF Vectorizer training\n", + " combined_tag_descriptions = sdl_class_rdoc_csv['tag_description'].tolist() + test_csv['tag_description'].tolist()\n", + " \n", + " # Create a TF-IDF Vectorizer\n", + " vectorizer = TfidfVectorizer(\n", + " token_pattern=r'\\S+',\n", + " ngram_range=(1, 6), # Use ngrams from 1 to 6\n", + " )\n", + " \n", + " # Fit the TF-IDF vectorizer on the combined tag_descriptions\n", + " vectorizer.fit(combined_tag_descriptions)\n", + " \n", + " # Transform both sdl_class_rdoc and test CSVs into TF-IDF matrices\n", + " sdl_class_rdoc_tfidf_matrix = vectorizer.transform(sdl_class_rdoc_csv['tag_description'])\n", + " test_tfidf_matrix = vectorizer.transform(test_csv['tag_description'])\n", + " \n", + " # Calculate cosine similarity between test and class-level sdl_class_rdoc vectors\n", + " similarity_matrix = cosine_similarity(test_tfidf_matrix, sdl_class_rdoc_tfidf_matrix)\n", + " \n", + " # Find the most similar class-level tag_description for each test description\n", + " most_similar_indices = similarity_matrix.argmax(axis=1)\n", + " most_similar_scores = similarity_matrix.max(axis=1)\n", + " \n", + " # Assign the corresponding thing, property, and similarity score to the test CSV\n", + " test_csv['c_thing'] = sdl_class_rdoc_csv.iloc[most_similar_indices]['thing'].values\n", + " test_csv['c_property'] = sdl_class_rdoc_csv.iloc[most_similar_indices]['property'].values\n", + " test_csv['c_score'] = most_similar_scores\n", + " \n", + " # Check if the predicted 'c_thing' and 'c_property' match the actual 'thing' and 'property'\n", + " test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']\n", + " test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']\n", + " test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']\n", + " \n", + " # Calculate accuracy based only on MDM = True\n", + " mdm_true_count = len(test_csv[test_csv['MDM'] == True])\n", + " accuracy = (test_csv['ctp_correct'].sum() / mdm_true_count) * 100\n", + " accuracies.append(accuracy)\n", + " \n", + " print(f\"Accuracy (MDM=True) for Group {group_number}: {accuracy:.2f}%\")\n", + " \n", + " # Specify output file paths\n", + " output_path = f'0.class_document/{group_number}/test_p_c.csv'\n", + " test_csv.to_csv(output_path, index=False, encoding='utf-8-sig')\n", + " \n", + " # Filter for rows where MDM is True and ctp_correct is False\n", + " false_positive_rows = test_csv[(test_csv['MDM'] == True) & (test_csv['ctp_correct'] == False)]\n", + " \n", + " # Save false positives to a separate file\n", + " fp_output_path = f'0.class_document/{group_number}/fp_class.csv'\n", + " false_positive_rows.to_csv(fp_output_path, index=False, encoding='utf-8-sig')\n", + "\n", + "# Calculate and print the average 
accuracy across all groups\n", + "average_accuracy = sum(accuracies) / len(accuracies)\n", + "print(f\"Average Accuracy (MDM=True) across all groups: {average_accuracy:.2f}%\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/post_process/tfidf_class/3.refine.ipynb b/post_process/tfidf_class/3.refine.ipynb new file mode 100644 index 0000000..52ee5d0 --- /dev/null +++ b/post_process/tfidf_class/3.refine.ipynb @@ -0,0 +1,144 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'p_correct'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'p_correct'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 22\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;66;03m# Assign c_thing, c_property to p_thing, p_property and set p_MDM to True if conditions are met\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index, row \u001b[38;5;129;01min\u001b[39;00m test_csv\u001b[38;5;241m.\u001b[39miterrows():\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mrow\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mp_correct\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;129;01mand\u001b[39;00m row[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mctp_correct\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m 23\u001b[0m update_count \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m 
\u001b[38;5;66;03m# Increment the counter\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# Check for duplicates within the same ships_idx\u001b[39;00m\n", + "File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/series.py:1121\u001b[0m, in \u001b[0;36mSeries.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[key]\n\u001b[1;32m 1120\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m key_is_scalar:\n\u001b[0;32m-> 1121\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_value\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1123\u001b[0m \u001b[38;5;66;03m# Convert generator to list before going through hashable part\u001b[39;00m\n\u001b[1;32m 1124\u001b[0m \u001b[38;5;66;03m# (We will iterate through the generator there to check for slices)\u001b[39;00m\n\u001b[1;32m 1125\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n", + "File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/series.py:1237\u001b[0m, in \u001b[0;36mSeries._get_value\u001b[0;34m(self, label, takeable)\u001b[0m\n\u001b[1;32m 1234\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[label]\n\u001b[1;32m 1236\u001b[0m \u001b[38;5;66;03m# Similar to Index.get_value, but we do not fall back to positional\u001b[39;00m\n\u001b[0;32m-> 1237\u001b[0m loc \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1239\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(loc):\n\u001b[1;32m 1240\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[loc]\n", + "File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'p_correct'" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "from tqdm import tqdm\n", + "\n", + "# Set the group number\n", + "group_number = 1 # Change this to the desired group number\n", + "\n", + "# Load the CSV files from the specified group\n", + "sdl_class_rdoc_path = f'0.class_document/{group_number}/sdl_class_rdoc.csv'\n", + "test_path = f'0.class_document/{group_number}/test_p_c.csv'\n", + "\n", + "sdl_class_rdoc_csv = pd.read_csv(sdl_class_rdoc_path, low_memory=False)\n", + "test_csv = pd.read_csv(test_path, low_memory=False)\n", + "\n", + "update_count = 0\n", + "duplicate_count = 0\n", + "non_duplicate_count = 0\n", + "\n", + "# Assign c_thing, c_property to p_thing, p_property and set p_MDM to True if conditions are met\n", + "for index, row in test_csv.iterrows():\n", + " if not row['p_correct'] and row['ctp_correct']:\n", + " update_count += 1 # Increment the counter\n", + "\n", + " # Check for duplicates within the same ships_idx\n", + " same_idx_rows = test_csv[(test_csv['ships_idx'] == row['ships_idx']) &\n", + " (test_csv['p_thing'] == row['c_thing']) &\n", + " (test_csv['p_property'] == row['c_property'])]\n", + "\n", + " if len(same_idx_rows) > 0:\n", + " duplicate_count += 1\n", + " else:\n", + " non_duplicate_count += 1\n", + "\n", + "# Print the results\n", + "print(f\"Total updates where p_correct is False and ctp_correct is True: {update_count}\")\n", + "print(f\"Number of rows with duplicates in the same ships_idx: {duplicate_count}\")\n", + "print(f\"Number of rows without duplicates in the same ships_idx: {non_duplicate_count}\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of updates made: 45\n", + "Updated test CSV saved to 0.class_document/1/test_p_c_r.csv\n", + "Refine CSV saved to refine.csv\n" + ] + } + ], + "source": [ + "update_count = 0\n", + "\n", + "# Initialize a list to hold rows that meet the conditions\n", + "refine_rows = []\n", + "\n", + "# Assign c_thing, c_property to p_thing, p_property and set p_MDM to True if conditions are met\n", + "for index, row in test_csv.iterrows():\n", + " if (not row['p_MDM'] and row['c_score'] >= 0.9 and \n", + " (row['p_thing'] != row['c_thing'] or row['p_property'] != row['c_property'])):\n", + " test_csv.at[index, 'p_thing'] = row['c_thing']\n", + " test_csv.at[index, 'p_property'] = row['c_property']\n", + " test_csv.at[index, 'p_MDM'] = True\n", + " update_count += 1 # Increment the counter\n", + " refine_rows.append(row) # Add the row to the refine list\n", + "\n", + "# Convert the list of refine rows into a DataFrame\n", + "refine_df = pd.DataFrame(refine_rows)\n", + "\n", + "# Save the refine DataFrame to a CSV file\n", + "refine_output_path = f'refine.csv'\n", + "refine_df.to_csv(refine_output_path, index=False, encoding='utf-8-sig')\n", + "\n", + "# Print the number of updates made\n", + "print(f\"Number of updates made: {update_count}\")\n", + "\n", + "# Save the updated test CSV\n", + "output_file_path = f'0.class_document/{group_number}/test_p_c_r.csv'\n", + 
"test_csv.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n", + " \n", + "print(f\"Updated test CSV saved to {output_file_path}\")\n", + "print(f\"Refine CSV saved to {refine_output_path}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/post_process/tfidf_class/4.selection_by_tfidf.py b/post_process/tfidf_class/4.selection_by_tfidf.py new file mode 100644 index 0000000..0af7bc6 --- /dev/null +++ b/post_process/tfidf_class/4.selection_by_tfidf.py @@ -0,0 +1,114 @@ +import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics.pairwise import cosine_similarity +from tqdm import tqdm +import os + +group_number = 1 +# Load the CSV files +test_path = f'post_process/tfidf_class/0.class_document/{group_number}/test_p_c.csv' +test_path = f'post_process/tfidf_class/0.class_document/{group_number}/test_p_c_r.csv' +ship_data_list_reference_doc_file_path = f'post_process/tfidf_class/0.class_document/{group_number}/sdl_class_rdoc.csv' + +test_csv = pd.read_csv(test_path, low_memory=False) +sdl_rdoc = pd.read_csv(ship_data_list_reference_doc_file_path) + +# Initialize new columns in test_csv +test_csv['s_score'] = -1 +test_csv['s_thing'] = '' +test_csv['s_property'] = '' +test_csv['s_correct'] = False + +duplicate_filtered = test_csv[(test_csv['p_MDM'] == True)].copy() + +# Create a mapping from thing/property to reference_doc +thing_property_to_reference_doc = sdl_rdoc.set_index(['thing', 'property'])['tag_description'].to_dict() + +# Calculate s_score for duplicate rows +for ships_idx, group in tqdm(duplicate_filtered.groupby('ships_idx'), desc="Processing duplicates"): + for (p_thing, p_property), sub_group in group.groupby(['p_thing', 'p_property']): + sub_group = sub_group.copy() + tag_descriptions = sub_group['tag_description'].tolist() + + # Get the reference document for the corresponding p_thing and p_property + reference_doc = thing_property_to_reference_doc.get((p_thing, p_property), '') + + if reference_doc: + # Combine the tag_descriptions and the reference_doc for fit_transform + combined_descriptions = tag_descriptions + [reference_doc] + + # Create a new TF-IDF Vectorizer for this specific group + vectorizer = TfidfVectorizer( + token_pattern=r'\S+', + norm='l2', # Use L2 normalization + ngram_range=(1, 7), # Use both unigrams and bigrams + ) + + # Fit and transform the combined descriptions + tfidf_matrix = vectorizer.fit_transform(combined_descriptions) + + # Separate the test_tfidf_matrix and reference_vector + test_tfidf_matrix = tfidf_matrix[:-1] # All but the last one + reference_vector = tfidf_matrix[-1] # The last one + + # Calculate the cosine similarity between the test descriptions and the reference_doc + sub_group['s_score'] = cosine_similarity(test_tfidf_matrix, reference_vector).flatten() + else: + sub_group['s_score'] = 0 + + # Update the s_score values back into the original test_csv + duplicate_filtered.loc[sub_group.index, 's_score'] = sub_group['s_score'] + +for ships_idx, group in tqdm(duplicate_filtered.groupby('ships_idx'), desc="Processing duplicates"): + for (p_thing, p_property), sub_group in group.groupby(['p_thing', 
diff --git a/post_process/tfidf_class/4.selection_by_tfidf.py b/post_process/tfidf_class/4.selection_by_tfidf.py
new file mode 100644
index 0000000..0af7bc6
--- /dev/null
+++ b/post_process/tfidf_class/4.selection_by_tfidf.py
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from tqdm import tqdm
+import os
+
+group_number = 1
+# Load the refined predictions produced by 3.refine.ipynb
+test_path = f'post_process/tfidf_class/0.class_document/{group_number}/test_p_c_r.csv'
+ship_data_list_reference_doc_file_path = f'post_process/tfidf_class/0.class_document/{group_number}/sdl_class_rdoc.csv'
+
+test_csv = pd.read_csv(test_path, low_memory=False)
+sdl_rdoc = pd.read_csv(ship_data_list_reference_doc_file_path)
+
+# Initialize new columns in test_csv
+test_csv['s_score'] = -1
+test_csv['s_thing'] = ''
+test_csv['s_property'] = ''
+test_csv['s_correct'] = False
+
+duplicate_filtered = test_csv[(test_csv['p_MDM'] == True)].copy()
+
+# Create a mapping from thing/property to reference_doc
+thing_property_to_reference_doc = sdl_rdoc.set_index(['thing', 'property'])['tag_description'].to_dict()
+
+# Calculate s_score for duplicate rows
+for ships_idx, group in tqdm(duplicate_filtered.groupby('ships_idx'), desc="Scoring duplicates"):
+    for (p_thing, p_property), sub_group in group.groupby(['p_thing', 'p_property']):
+        sub_group = sub_group.copy()
+        tag_descriptions = sub_group['tag_description'].tolist()
+
+        # Get the reference document for the corresponding p_thing and p_property
+        reference_doc = thing_property_to_reference_doc.get((p_thing, p_property), '')
+
+        if reference_doc:
+            # Combine the tag_descriptions and the reference_doc for fit_transform
+            combined_descriptions = tag_descriptions + [reference_doc]
+
+            # Create a new TF-IDF Vectorizer for this specific group
+            vectorizer = TfidfVectorizer(
+                token_pattern=r'\S+',
+                norm='l2',  # Use L2 normalization
+                ngram_range=(1, 7),  # Use n-grams from 1 up to 7 tokens
+            )
+
+            # Fit and transform the combined descriptions
+            tfidf_matrix = vectorizer.fit_transform(combined_descriptions)
+
+            # Separate the test_tfidf_matrix and reference_vector
+            test_tfidf_matrix = tfidf_matrix[:-1]  # All but the last one
+            reference_vector = tfidf_matrix[-1]  # The last one
+
+            # Calculate the cosine similarity between the test descriptions and the reference_doc
+            sub_group['s_score'] = cosine_similarity(test_tfidf_matrix, reference_vector).flatten()
+        else:
+            sub_group['s_score'] = 0
+
+        # Write the s_score values back into duplicate_filtered
+        duplicate_filtered.loc[sub_group.index, 's_score'] = sub_group['s_score']
+
+for ships_idx, group in tqdm(duplicate_filtered.groupby('ships_idx'), desc="Selecting best matches"):
+    for (p_thing, p_property), sub_group in group.groupby(['p_thing', 'p_property']):
+        if (sub_group['s_score'] == -1).any():
+            best_index = sub_group.index.min()
+        else:
+            # Find the index of the row with the highest s_score
+            best_index = sub_group['s_score'].idxmax()
+
+        # Assign s_thing and s_property only to the row with the highest s_score
+        duplicate_filtered.at[best_index, 's_thing'] = sub_group.at[best_index, 'p_thing']
+        duplicate_filtered.at[best_index, 's_property'] = sub_group.at[best_index, 'p_property']
+
+# Now, update the original test_csv with the changes made in duplicate_filtered
+test_csv.update(duplicate_filtered[['s_thing', 's_property', 's_score']])
+
+# Calculate s_correct
+test_csv['s_correct'] = ((test_csv['thing'] == test_csv['s_thing']) &
+                         (test_csv['property'] == test_csv['s_property']) &
+                         (test_csv['MDM']))
+
+# Calculate the percentage of correct s_thing and s_property
+mdm_true_count = test_csv['MDM'].sum()
+s_correct_count = test_csv['s_correct'].sum()
+s_correct_percentage = (s_correct_count / mdm_true_count) * 100
+
+print(f"s_correct count: {s_correct_count}")
+print(f"MDM true count: {mdm_true_count}")
+print(f"s_correct percentage: {s_correct_percentage:.2f}%")
+
+# Save the updated DataFrame to a new CSV file
+output_path = f'post_process/0.result/{group_number}/test_s.csv'
+os.makedirs(os.path.dirname(output_path), exist_ok=True)
+test_csv.to_csv(output_path, index=False, encoding='utf-8-sig')
+
+print(f"Updated data saved to {output_path}")
+
+# Check for duplicates in s_thing and s_property within each ships_idx
+print("\nShips_idx with duplicate s_thing and s_property:")
+duplicate_ships_idx = []
+
+for ships_idx, group in test_csv.groupby('ships_idx'):
+    # Exclude rows with empty s_thing or s_property
+    non_empty_group = group[(group['s_thing'] != '') & (group['s_property'] != '')]
+    duplicate_entries = non_empty_group[non_empty_group.duplicated(subset=['s_thing', 's_property'], keep=False)]
+    if not duplicate_entries.empty:
+        duplicate_ships_idx.append(ships_idx)
+        print(f"Ships_idx: {ships_idx}")
+        print(duplicate_entries[['s_thing', 's_property']])
+
+if not duplicate_ships_idx:
+    print("No duplicates found.")
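The selection pass boils down to: within each ship, when several descriptions claim the same predicted class, keep only the best-scoring claimant. A compact sketch of that dedup rule (toy values; `idxmax` picks the row index of the highest s_score per group):

```python
import pandas as pd

preds = pd.DataFrame({
    'ships_idx':  [1000, 1000, 1000],
    'p_thing':    ['GE1', 'GE1', 'ME1'],
    'p_property': ['Load', 'Load', 'RPM'],
    's_score':    [0.31, 0.87, 0.55],
})

# Keep only the best-scoring row per (ship, predicted class); other rows stay unassigned
best = preds.groupby(['ships_idx', 'p_thing', 'p_property'])['s_score'].idxmax()
preds['s_thing'] = ''
preds.loc[best, 's_thing'] = preds.loc[best, 'p_thing']
print(preds)
```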
exist\n", + " if not os.path.exists(train_file_path) or not os.path.exists(valid_file_path) or not os.path.exists(test_file_path):\n", + " raise FileNotFoundError(f\"One or more files for group {group_number} do not exist.\")\n", + " \n", + " # Load the CSV files into DataFrames\n", + " train_data = pd.read_csv(train_file_path)\n", + " valid_data = pd.read_csv(valid_file_path)\n", + " test_data = pd.read_csv(test_file_path)\n", + " \n", + " return train_data, valid_data, test_data\n", + "\n", + "\n", + "try:\n", + " train_data, valid_data, test_data = load_group_data(group_number)\n", + " print(f\"Loaded data for group {group_number}:\")\n", + " print(f\"Train data shape: {train_data.shape}\")\n", + " print(f\"Valid data shape: {valid_data.shape}\")\n", + " print(f\"Test data shape: {test_data.shape}\")\n", + "except FileNotFoundError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "313f98ef12eb442bac319282e5ffe5d6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Saving the dataset (0/1 shards): 0%| | 0/6125 [00:00{str(row['tag_description'])}\"\n", + " elif mode == 'tn_td':\n", + " input_str = f\"{str(row['tag_name'])}{str(row['tag_description'])}\"\n", + " elif mode == 'tn_td_min_max':\n", + " input_str = f\"{str(row['tag_name'])}{str(row['tag_description'])}{row['min']}{row['max']}\"\n", + " elif mode == 'td_min_max':\n", + " input_str = f\"{str(row['tag_description'])}{row['min']}{row['max']}\" \n", + " elif mode == 'td_unit':\n", + " input_str = f\"{str(row['tag_description'])}{str(row['unit'])}\" \n", + " elif mode == 'tn_td_unit':\n", + " input_str = f\"{str(row['tag_name'])}{str(row['tag_description'])}{str(row['unit'])}\" \n", + " else:\n", + " raise ValueError(\"Invalid mode specified\")\n", + " \n", + " output_list.append({\n", + " 'translation': {\n", + " 'ships_idx': row['ships_idx'],\n", + " 'input': input_str,\n", + " 'thing_property': f\"{str(row['thing'])}{str(row['property'])}\",\n", + " 'answer': f\"{str(row['thing'])} {str(row['property'])}\",\n", + " }\n", + " })\n", + " except Exception as e:\n", + " print(f\"Error processing row at index {idx}: {row}\")\n", + " print(f\"Exception: {e}\")\n", + " return output_list\n", + "\n", + "\n", + "# Combine the mode and group information into a single dictionary\n", + "combined_dict = {\n", + " \"mode\": mode,\n", + " \"fold_group\": group_number\n", + "}\n", + "\n", + "# Save the combined dictionary to a JSON file\n", + "with open(\"mode.json\", \"w\") as json_file:\n", + " json.dump(combined_dict, json_file)\n", + " \n", + "try:\n", + " # Process the data and create a DatasetDict\n", + " combined_data = DatasetDict({\n", + " 'train': Dataset.from_list(process_df(train_data, mode=mode)),\n", + " 'test': Dataset.from_list(process_df(test_data, mode=mode)),\n", + " 'validation': Dataset.from_list(process_df(valid_data, mode=mode)),\n", + " })\n", + " # Save the DatasetDict to disk\n", + " combined_data.save_to_disk(f\"combined_data/{mode}/{group_number}\")\n", + " print(\"Dataset saved to 'combined_data'\")\n", + "except Exception as e:\n", + " print(f\"Error creating DatasetDict: {e}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/translation/t5/2.t5_train.ipynb b/translation/t5/2.t5_train.ipynb new file mode 100644 index 0000000..ce98df6 --- /dev/null +++ b/translation/t5/2.t5_train.ipynb @@ -0,0 +1,477 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# t5 training for combined concatenated outputs (thing + property) \n", + "\n", + "refer to `t5_train_tp.py` and `guide_for_tp.md` for faster training workflow" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The mode has been set to: tn_td_unit\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d8d70681f4594917b7af4583a4237168", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/6125 [00:00\", \"\", \"\", \"\"]\n", + "additional_special_tokens = [\"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\"]\n", + "# Add the additional special tokens to the tokenizer\n", + "tokenizer.add_special_tokens({\"additional_special_tokens\": additional_special_tokens})\n", + "\n", + "max_length = 64\n", + "\n", + "def preprocess_function(examples):\n", + " inputs = [ex[\"input\"] for ex in examples['translation']]\n", + " targets = [ex[\"thing_property\"] for ex in examples['translation']]\n", + " # text_target sets the corresponding label to inputs\n", + " # there is no need to create a separate 'labels'\n", + " model_inputs = tokenizer(\n", + " inputs, text_target=targets, max_length=max_length, truncation=True\n", + " )\n", + " return model_inputs\n", + "\n", + "# map method maps preprocess_function to [train, valid, test] datasets of the datasetDict\n", + "tokenized_datasets = split_datasets.map(\n", + " preprocess_function,\n", + " batched=True,\n", + " remove_columns=split_datasets[\"train\"].column_names,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [3840/3840 42:37, Epoch 80/80]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
5002.812300
10000.699300
15000.440900
20000.332100
25000.276500
30000.245900
35000.229300

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all 
'\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 
0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all 
'\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 
0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all 
'\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 
0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n" + ] + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=3840, training_loss=0.6754856963952383, metrics={'train_runtime': 2559.4201, 'train_samples_per_second': 191.45, 'train_steps_per_second': 1.5, 'total_flos': 3.156037495934976e+16, 'train_loss': 0.6754856963952383, 'epoch': 80.0})" + ] 
+ }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "import os\n", + "import json\n", + "\n", + "# we use the pre-trained t5-base model\n", + "from transformers import AutoModelForSeq2SeqLM\n", + "model_checkpoint = model_name\n", + "model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)\n", + "\n", + "# data collator\n", + "from transformers import DataCollatorForSeq2Seq\n", + "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n", + "\n", + "# evaluation \n", + "import evaluate\n", + "metric = evaluate.load(\"sacrebleu\")\n", + "import numpy as np\n", + "\n", + "\n", + "def compute_metrics(eval_preds):\n", + " preds, labels = eval_preds\n", + " # In case the model returns more than the prediction logits\n", + " if isinstance(preds, tuple):\n", + " preds = preds[0]\n", + "\n", + " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n", + "\n", + " # Replace -100s in the labels as we can't decode them\n", + " labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n", + " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n", + "\n", + " # Some simple post-processing\n", + " decoded_preds = [pred.strip() for pred in decoded_preds]\n", + " decoded_labels = [[label.strip()] for label in decoded_labels]\n", + "\n", + " result = metric.compute(predictions=decoded_preds, references=decoded_labels)\n", + " return {\"bleu\": result[\"score\"]}\n", + "\n", + "from transformers import Seq2SeqTrainingArguments\n", + "\n", + "# load environment variables to disable GPU p2p mode for multi-gpu training without p2p mode\n", + "# not required for single-gpu training\n", + "import os\n", + "os.environ['NCCL_P2P_DISABLE'] = '1'\n", + "os.environ['NCCL_IB_DISABLE'] = '1'\n", + "\n", + "args = Seq2SeqTrainingArguments(\n", + " f\"train_{fold_group}_{model_name}_{mode}_{train_epochs}\",\n", + " evaluation_strategy=\"no\",\n", + " # logging_dir=\"tensorboard-log\",\n", + " # logging_strategy=\"epoch\",\n", + " save_strategy=\"epoch\",\n", + " learning_rate=2e-5,\n", + " per_device_train_batch_size=32,\n", + " per_device_eval_batch_size=64,\n", + " auto_find_batch_size=True,\n", + " ddp_find_unused_parameters=False,\n", + " weight_decay=0.01,\n", + " save_total_limit=1,\n", + " num_train_epochs=train_epochs,\n", + " predict_with_generate=True,\n", + " bf16=True,\n", + " push_to_hub=False,\n", + ")\n", + "\n", + "from transformers import Seq2SeqTrainer\n", + "\n", + "trainer = Seq2SeqTrainer(\n", + " model,\n", + " args,\n", + " train_dataset=tokenized_datasets[\"train\"],\n", + " eval_dataset=tokenized_datasets[\"validation\"],\n", + " data_collator=data_collator,\n", + " tokenizer=tokenizer,\n", + " compute_metrics=compute_metrics,\n", + ")\n", + "\n", + "# Train the model\n", + "trainer.train()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/translation/t5/3.produce_test_predictions.ipynb b/translation/t5/3.produce_test_predictions.ipynb new file mode 100644 index 0000000..6c773c4 --- /dev/null +++ b/translation/t5/3.produce_test_predictions.ipynb @@ -0,0 +1,447 @@ +{ + "cells": [ 
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Goal: end to end inference and evaluation\n", + "\n", + "given a csv, make predictions and evaluate predictions, then return results in a csv" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The mode has been set to: tn_td_unit t5-base\n", + "Using model checkpoint: train_1_t5-base_tn_td_unit_80/checkpoint-3840\n", + "Columns in df_org:\n", + "['thing', 'property', 'ships_idx', 'tag_name', 'tag_description', 'signal_type', 'min', 'max', 'unit', 'data_type', 'thing_pattern', 'property_pattern', 'pattern', 'MDM', 'org_tag_description']\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import os\n", + "import json\n", + "\n", + "# Read the mode from the JSON file\n", + "with open(\"mode.json\", \"r\") as json_file:\n", + " mode_dict = json.load(json_file)\n", + "\n", + "\n", + "# Set the mode variable from the JSON content\n", + "mode = mode_dict.get(\"mode\", \"none\") # 'default_value' is a fallback if 'mode' is not found\n", + "model_name = mode_dict.get(\"model\", \"none\") # 'default_value' is a fallback if 'mode' is not found\n", + "train_epochs = mode_dict.get(\"train_epochs\", \"none\") # 'default_value' is a fallback if 'mode' is not found\n", + "fold_group = mode_dict.get(\"fold_group\", \"none\") # 'default_value' is a fallback if 'mode' is not found\n", + "\n", + "print(f\"The mode has been set to: {mode} {model_name}\")\n", + "\n", + "# Define the base directory where checkpoints are stored\n", + "base_dir = f\"train_{fold_group}_{model_name}_{mode}_{train_epochs}\"\n", + "\n", + "# List all subdirectories in the base directory\n", + "subdirectories = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]\n", + "\n", + "# Filter for checkpoint directories that match the pattern \"checkpoint-\"\n", + "checkpoints = [d for d in subdirectories if d.startswith(\"checkpoint-\")]\n", + "\n", + "# Select the latest checkpoint (the one with the highest number)\n", + "if checkpoints:\n", + " latest_checkpoint = checkpoints[0]\n", + " model_checkpoint = os.path.join(base_dir, latest_checkpoint)\n", + " print(f\"Using model checkpoint: {model_checkpoint}\")\n", + "else:\n", + " print(\"No checkpoints were found.\")\n", + " model_checkpoint = None # Handle this case as needed\n", + "\n", + "# Load the data\n", + "data_path = f\"../../data_preprocess/dataset/{fold_group}/test.csv\" # Adjust the CSV file path as necessary\n", + "\n", + "try:\n", + " df = pd.read_csv(data_path)\n", + "except UnicodeDecodeError:\n", + " df = pd.read_csv(data_path, encoding='ISO-8859-1')\n", + "\n", + "\n", + "# Drop rows where 'tag_description' is NaN and reset the index\n", + "df = df.dropna(subset=['tag_description']).reset_index(drop=True)\n", + "\n", + "# Preserve df_org\n", + "df_org = df.copy()\n", + "\n", + "# Print the column names of df_org\n", + "print(\"Columns in df_org:\")\n", + "print(df_org.columns.tolist())\n", + "\n", + "selected_columns = ['thing', 'property', 'tag_description', 'min', 'max', 'MDM', 'pattern']\n", + "df[selected_columns] = df[selected_columns].astype(\"string\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The test_dataset contains 14718 items.\n" + ] + } + ], + "source": [ + "from datasets import Dataset\n", + "\n", + "def process_df(df, 
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The test_dataset contains 14718 items.\n"
+ ]
+ }
+ ],
+ "source": [
+ "from datasets import Dataset\n",
+ "\n",
+ "def process_df(df, mode='only_td'):\n",
+ "    output_list = []\n",
+ "    for _, row in df.iterrows():\n",
+ "        try:\n",
+ "            if mode == 'only_td':\n",
+ "                input_str = f\"{str(row['tag_description'])}\"\n",
+ "            elif mode == 'tn_td':\n",
+ "                input_str = f\"{str(row['tag_name'])}{str(row['tag_description'])}\"\n",
+ "            elif mode == 'tn_td_min_max':\n",
+ "                input_str = f\"{str(row['tag_name'])}{str(row['tag_description'])}{row['min']}{row['max']}\"\n",
+ "            elif mode == 'td_min_max':\n",
+ "                input_str = f\"{str(row['tag_description'])}{row['min']}{row['max']}\"\n",
+ "            elif mode == 'td_unit':\n",
+ "                input_str = f\"{str(row['tag_description'])}{str(row['unit'])}\"\n",
+ "            elif mode == 'tn_td_unit':\n",
+ "                input_str = f\"{str(row['tag_name'])}{str(row['tag_description'])}{str(row['unit'])}\"\n",
+ "            else:\n",
+ "                raise ValueError(\"Invalid mode specified\")\n",
+ "\n",
+ "            output_list.append({\n",
+ "                'translation': {\n",
+ "                    'ships_idx': row['ships_idx'],\n",
+ "                    'input': input_str,\n",
+ "                    'thing_property': f\"{row['thing']}{row['property']}\",\n",
+ "                    'answer_thing': f\"{row['thing']}\",\n",
+ "                    'answer_property': f\"{row['property']}\",\n",
+ "                    'MDM': f\"{row['MDM']}\",\n",
+ "                }\n",
+ "            })\n",
+ "        except Exception as e:\n",
+ "            print(f\"Error processing row: {row}\")\n",
+ "            print(f\"Exception: {e}\")\n",
+ "    return output_list\n",
+ "\n",
+ "# Process the DataFrame\n",
+ "processed_data = process_df(df, mode=mode)\n",
+ "\n",
+ "# Create a Dataset object\n",
+ "test_dataset = Dataset.from_list(processed_data)\n",
+ "\n",
+ "# Print the number of items in the dataset\n",
+ "print(f\"The test_dataset contains {len(test_dataset)} items.\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers.pipelines.pt_utils import KeyDataset\n",
+ "from transformers import pipeline\n",
+ "from tqdm import tqdm\n",
+ "from transformers import AutoTokenizer\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name, return_tensors=\"pt\")\n",
+ "# Define additional special tokens\n",
+ "# additional_special_tokens = [\"<THING_START>\", \"<THING_END>\", \"<PROPERTY_START>\", \"<PROPERTY_END>\"]\n",
+ "additional_special_tokens = [\"<THING_START>\", \"<THING_END>\", \"<PROPERTY_START>\", \"<PROPERTY_END>\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\"]\n",
+ "\n",
+ "# Add the additional special tokens to the tokenizer\n",
+ "tokenizer.add_special_tokens({\"additional_special_tokens\": additional_special_tokens})\n",
+ "# tokenizer.add_special_tokens({'sep_token': \"\"})\n",
+ "\n",
+ "pipe = pipeline(\"translation_XX_to_YY\", model=model_checkpoint, tokenizer=tokenizer, return_tensors=True, max_length=128, device=0)\n",
+ "\n",
+ "# check what token ids the special tokens received\n",
+ "# tokenizer.encode(\"<THING_START>\")\n",
+ "\n",
+ "def extract_seq(tokens, start_value, end_value):\n",
+ "    if start_value not in tokens or end_value not in tokens:\n",
+ "        return None  # start or end marker missing from the generated sequence\n",
+ "    start_id = tokens.index(start_value)\n",
+ "    end_id = tokens.index(end_value)\n",
+ "    return tokens[start_id + 1:end_id]\n",
+ "\n",
+ "# If either marker is absent, extract_seq returns None and the prediction stays None\n",
+ "def process_tensor_output(output):\n",
+ "    tokens = output[0]['translation_token_ids'].tolist()\n",
+ "    thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>\n",
+ "    property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>\n",
+ "    p_thing = None\n",
+ "    p_property = None\n",
+ "    if thing_seq is not None:\n",
+ "        p_thing = tokenizer.decode(thing_seq)\n",
+ "    if property_seq is not None:\n",
+ "        p_property = tokenizer.decode(property_seq)\n",
+ "    return p_thing, p_property"
+ ]
+ },
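+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`process_tensor_output` recovers the thing and property spans from the generated token ids by slicing between the start/end marker ids (32100-32103). A small sketch of how `extract_seq` behaves, with made-up token ids:\n",
+ "\n",
+ "```python\n",
+ "tokens = [32100, 101, 102, 32101, 32102, 201, 32103]  # toy sequence\n",
+ "extract_seq(tokens, 32100, 32101)  # -> [101, 102] (the thing span)\n",
+ "extract_seq(tokens, 32102, 32103)  # -> [201] (the property span)\n",
+ "extract_seq(tokens, 32100, 99999)  # -> None (end marker missing)\n",
+ "```"
+ ]
+ },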
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "making inference on test set\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "14718it [00:44, 330.24it/s] "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "inference done\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "p_thing_list = []\n",
+ "p_property_list = []\n",
+ "print(\"making inference on test set\")\n",
+ "for out in tqdm(pipe(KeyDataset(test_dataset[\"translation\"], \"input\"), batch_size=256)):\n",
+ "    p_thing, p_property = process_tensor_output(out)\n",
+ "    p_thing_list.append(p_thing)\n",
+ "    p_property_list.append(p_property)\n",
+ "print(\"inference done\")"
+ ]
+ },
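+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Accuracy below is computed only over the rows with `MDM == \"True\"` (2,006 of the 14,718 test rows): predictions on non-MDM rows are marked incorrect, and the denominator is the MDM row count. For example, the thing accuracy works out to 1985 / 2006 ≈ 0.9895."
+ ]
+ },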
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Thing prediction accuracy: 0.9895314057826521\n",
+ "Correct thing predictions: 1985, Incorrect thing predictions: 21\n",
+ "Property prediction accuracy: 0.9661016949152542\n",
+ "Correct property predictions: 1938, Incorrect property predictions: 68\n",
+ "Total accuracy: 0.9596211365902293\n",
+ "Correct total predictions: 1925, Incorrect total predictions: 81\n"
+ ]
+ }
+ ],
+ "source": [
+ "answer_thing = [item['answer_thing'] for item in test_dataset[\"translation\"]]\n",
+ "answer_property = [item['answer_property'] for item in test_dataset[\"translation\"]]\n",
+ "mdm_list = [item['MDM'] for item in test_dataset[\"translation\"]]\n",
+ "\n",
+ "mdm_count = sum(1 for m in mdm_list if m == \"True\")\n",
+ "\n",
+ "def correctness_test(predictions, reference, mdm_list):\n",
+ "    assert len(predictions) == len(reference)\n",
+ "    correctness_list = []\n",
+ "    for i in range(len(predictions)):\n",
+ "        if mdm_list[i] == \"True\":\n",
+ "            correctness_list.append(predictions[i] == reference[i])\n",
+ "        else:\n",
+ "            correctness_list.append(False)\n",
+ "    return correctness_list\n",
+ "\n",
+ "# Compare with the answers to evaluate correctness\n",
+ "thing_correctness = correctness_test(p_thing_list, answer_thing, mdm_list)\n",
+ "property_correctness = correctness_test(p_property_list, answer_property, mdm_list)\n",
+ "correctness_mdm = [t and p for t, p in zip(thing_correctness, property_correctness)]\n",
+ "\n",
+ "# Calculate accuracy over the MDM rows\n",
+ "thing_accuracy = sum(thing_correctness) / mdm_count\n",
+ "property_accuracy = sum(property_correctness) / mdm_count\n",
+ "total_accuracy = sum(correctness_mdm) / mdm_count\n",
+ "\n",
+ "# Count correct/incorrect predictions among the MDM rows\n",
+ "thing_true_count = thing_correctness.count(True)\n",
+ "thing_false_count = mdm_count - thing_true_count\n",
+ "property_true_count = property_correctness.count(True)\n",
+ "property_false_count = mdm_count - property_true_count\n",
+ "total_true_count = correctness_mdm.count(True)\n",
+ "total_false_count = mdm_count - total_true_count\n",
+ "\n",
+ "# Print results\n",
+ "print(\"Thing prediction accuracy:\", thing_accuracy)\n",
+ "print(f\"Correct thing predictions: {thing_true_count}, Incorrect thing predictions: {thing_false_count}\")\n",
+ "print(\"Property prediction accuracy:\", property_accuracy)\n",
+ "print(f\"Correct property predictions: {property_true_count}, Incorrect property predictions: {property_false_count}\")\n",
+ "print(\"Total accuracy:\", total_accuracy)\n",
+ "print(f\"Correct total predictions: {total_true_count}, Incorrect total predictions: {total_false_count}\")\n",
+ "\n",
+ "# Create a DataFrame with the results\n",
+ "pred_dict = {\n",
+ "    'p_thing': p_thing_list,\n",
+ "    'p_property': p_property_list,\n",
+ "    'p_thing_correct': thing_correctness,\n",
+ "    'p_property_correct': property_correctness\n",
+ "}\n",
+ "\n",
+ "df_pred = pd.DataFrame(pred_dict)\n",
+ "\n",
+ "# Read the run configuration from the JSON file\n",
+ "with open(\"mode.json\", \"r\") as json_file:\n",
+ "    mode_dict = json.load(json_file)\n",
+ "\n",
+ "# Record the model and epoch count used for this run\n",
+ "mode_dict[\"model\"] = model_name\n",
+ "mode_dict[\"train_epochs\"] = train_epochs\n",
+ "\n",
+ "# Save the updated dictionary back to the JSON file\n",
+ "with open(\"mode.json\", \"w\") as json_file:\n",
+ "    json.dump(mode_dict, json_file)\n",
+ "\n",
+ "# Check if the file exists and is not empty\n",
+ "if os.path.exists(\"results.json\") and os.path.getsize(\"results.json\") > 0:\n",
+ "    # Read the existing results.json file\n",
+ "    with open(\"results.json\", \"r\") as json_file:\n",
+ "        try:\n",
+ "            results_dict = json.load(json_file)\n",
+ "        except json.JSONDecodeError:\n",
+ "            results_dict = {}\n",
+ "else:\n",
+ "    results_dict = {}\n",
+ "\n",
+ "# Add the new model_checkpoint key with the accuracy values as an object\n",
+ "model_key = model_checkpoint\n",
+ "\n",
+ "results_dict[model_key] = {\n",
+ "    \"thing_accuracy\": thing_accuracy,\n",
+ "    \"thing_true\": thing_true_count,\n",
+ "    \"thing_false\": thing_false_count,\n",
+ "    \"property_accuracy\": property_accuracy,\n",
+ "    \"property_true\": property_true_count,\n",
+ "    \"property_false\": property_false_count,\n",
+ "    \"total_accuracy\": total_accuracy,\n",
+ "    \"total_true\": total_true_count,\n",
+ "    \"total_false\": total_false_count\n",
+ "}\n",
+ "\n",
+ "# Save the updated dictionary back to the results.json file\n",
+ "with open(\"results.json\", \"w\") as json_file:\n",
+ "    json.dump(results_dict, json_file, indent=4)"
+ ]
+ },
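+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The final cell merges the predictions back into `df_org`, masks digits with `#` to form `thing property` patterns, and flags a prediction as in-MDM when its pattern appears in the master list. A minimal sketch of the masking, with a made-up prediction:\n",
+ "\n",
+ "```python\n",
+ "import re\n",
+ "thing, prop = 'ME1TurboCharger1', 'ExhGasInletTemp'  # toy values\n",
+ "pattern = re.sub(r'\\d', '#', thing) + ' ' + re.sub(r'\\d', '#', prop)\n",
+ "# pattern == 'ME#TurboCharger# ExhGasInletTemp'\n",
+ "```"
+ ]
+ },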
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Updated data saved to ../0.result/1/test_p.csv\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "\n",
+ "# Create a DataFrame with the results\n",
+ "df_pred = pd.DataFrame({\n",
+ "    'p_thing': p_thing_list,\n",
+ "    'p_property': p_property_list,\n",
+ "    'p_thing_correct': thing_correctness,\n",
+ "    'p_property_correct': property_correctness,\n",
+ "})\n",
+ "\n",
+ "# Merge predictions with the original DataFrame (df_org)\n",
+ "df_org['p_thing'] = df_pred['p_thing']\n",
+ "df_org['p_property'] = df_pred['p_property']\n",
+ "df_org['p_thing_correct'] = df_pred['p_thing_correct']\n",
+ "df_org['p_property_correct'] = df_pred['p_property_correct']\n",
+ "df_org['p_correct'] = df_org['p_thing_correct'] & df_org['p_property_correct']\n",
+ "\n",
+ "df_master = pd.read_csv('../../data_import/data_model_master_export.csv')\n",
+ "\n",
+ "# Mask digits with '#' to form patterns, for both the ground truth and the predictions\n",
+ "df_org['pattern'] = df_org['thing'].str.replace(r'\\d', '#', regex=True) + \" \" + df_org['property'].str.replace(r'\\d', '#', regex=True)\n",
+ "df_org['p_pattern'] = df_org['p_thing'].str.replace(r'\\d', '#', regex=True) + \" \" + df_org['p_property'].str.replace(r'\\d', '#', regex=True)\n",
+ "df_master['master_pattern'] = df_master['thing'] + \" \" + df_master['property']\n",
+ "\n",
+ "# Create a set of unique patterns from the master list for fast lookup\n",
+ "master_patterns = set(df_master['master_pattern'])\n",
+ "df_org['p_MDM'] = df_org['p_pattern'].apply(lambda x: x in master_patterns)\n",
+ "\n",
+ "output_path = f\"../0.result/{fold_group}/test_p.csv\"\n",
+ "debug_output_path = f\"0.dresult/{fold_group}/test_p.csv\"\n",
+ "\n",
+ "# Create the output folders if they do not exist\n",
+ "os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
+ "df_org.to_csv(output_path, index=False, encoding='utf-8-sig')\n",
+ "\n",
+ "os.makedirs(os.path.dirname(debug_output_path), exist_ok=True)\n",
+ "df_org.to_csv(debug_output_path, index=False, encoding='utf-8-sig')\n",
+ "\n",
+ "print(f\"Updated data saved to {output_path}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}