From 3d2266cf65d26221ec6e007bd6ab5b7325d852cb Mon Sep 17 00:00:00 2001
From: hhs0625
Date: Mon, 26 Aug 2024 19:51:11 +0900
Subject: [PATCH] [TASK] init

---
 data_import/1.select_db.py | 59 +++
 data_import/2.make_csv.py | 38 ++
 data_import/plot_count.ipynb | 88 ++++
 .../no_preprocess/copy_raw_data.py | 9 +
 .../1.add_tag_name.ipynb | 133 +++++
 .../2.seperate_number.ipynb | 100 ++++
 .../rule_base_replacement/3.replacement.ipynb | 123 +++++
 data_preprocess/split_data.ipynb | 441 ++++++++++++++++
 evaluation/check_accuracy.ipynb | 97 ++++
 .../tfidf_class/1.make_sdl_class_document.py | 47 ++
 .../tfidf_class/2.classify_by_tfidf.ipynb | 134 +++++
 post_process/tfidf_class/3.refine.ipynb | 144 ++++++
 .../tfidf_class/4.selection_by_tfidf.py | 114 +++++
 translation/t5/1.data_process_concat.ipynb | 198 ++++++++
 translation/t5/2.t5_train.ipynb | 477 ++++++++++++++++++
 .../t5/3.produce_test_predictions.ipynb | 447 ++++++++++++++++
 16 files changed, 2649 insertions(+)
 create mode 100644 data_import/1.select_db.py
 create mode 100644 data_import/2.make_csv.py
 create mode 100644 data_import/plot_count.ipynb
 create mode 100644 data_preprocess/no_preprocess/copy_raw_data.py
 create mode 100644 data_preprocess/rule_base_replacement/1.add_tag_name.ipynb
 create mode 100644 data_preprocess/rule_base_replacement/2.seperate_number.ipynb
 create mode 100644 data_preprocess/rule_base_replacement/3.replacement.ipynb
 create mode 100644 data_preprocess/split_data.ipynb
 create mode 100644 evaluation/check_accuracy.ipynb
 create mode 100644 post_process/tfidf_class/1.make_sdl_class_document.py
 create mode 100644 post_process/tfidf_class/2.classify_by_tfidf.ipynb
 create mode 100644 post_process/tfidf_class/3.refine.ipynb
 create mode 100644 post_process/tfidf_class/4.selection_by_tfidf.py
 create mode 100644 translation/t5/1.data_process_concat.ipynb
 create mode 100644 translation/t5/2.t5_train.ipynb
 create mode 100644 translation/t5/3.produce_test_predictions.ipynb

diff --git a/data_import/1.select_db.py b/data_import/1.select_db.py
new file mode 100644
index 0000000..eefac28
--- /dev/null
+++ b/data_import/1.select_db.py
@@ -0,0 +1,59 @@
+import psycopg2
+import pandas as pd
+
+# Function to read the db connection info
+def read_db_connection_info(filename="db_connection_info.txt"):
+    connection_info = {}
+    try:
+        with open(filename, 'r') as file:
+            for line in file:
+                key, value = line.strip().split('=')
+                connection_info[key] = value
+    except Exception as e:
+        print(f"Failed to read database connection info: {e}")
+        raise
+    return connection_info
+
+# Load the connection info
+connection_info = read_db_connection_info()
+conn = None  # ensure 'conn' is defined for the finally block even if connect fails
+try:
+    # Connect to the database
+    conn = psycopg2.connect(
+        host=connection_info["host"],
+        user=connection_info["user"],
+        password=connection_info["password"],
+        dbname=connection_info["database"],
+        port=connection_info["port"]
+    )
+    # This ensures that resources are cleaned up properly
+    with conn:
+        with conn.cursor() as cursor:
+            # Export data_mapping table
+            query_mapping = """
+            SELECT * FROM data_mapping
+            WHERE ships_idx BETWEEN 1000 AND 1999
+            """
+            cursor.execute(query_mapping)
+            results_mapping = cursor.fetchall()
+            columns_mapping = [desc[0] for desc in cursor.description]
+            df_mapping = pd.DataFrame(results_mapping, columns=columns_mapping)
+            df_mapping.to_csv('data_import/data_mapping.csv', index=False, encoding='utf-8-sig')
+
+            # Export data_model_master table
+            query_master = """
+            SELECT * FROM data_model_master
+            """
+            cursor.execute(query_master)
+            results_master = cursor.fetchall()
+            
columns_master = [desc[0] for desc in cursor.description]
+            df_master = pd.DataFrame(results_master, columns=columns_master)
+            df_master.to_csv('data_import/data_model_master_export.csv', index=False, encoding='utf-8-sig')
+
+    print("Data exported successfully to 'data_import/data_mapping.csv' and 'data_import/data_model_master_export.csv'")
+
+except (Exception, psycopg2.DatabaseError) as error:
+    print(f"An error occurred: {error}")
+finally:
+    if conn is not None:
+        conn.close()
diff --git a/data_import/2.make_csv.py b/data_import/2.make_csv.py
new file mode 100644
index 0000000..1ce8787
--- /dev/null
+++ b/data_import/2.make_csv.py
@@ -0,0 +1,38 @@
+import pandas as pd
+import re
+
+# Load the data_mapping CSV file
+data_mapping_file_path = 'data_import/data_mapping.csv' # Adjust this path to your actual file location
+data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)
+df_master = pd.read_csv('data_import/data_model_master_export.csv')
+
+# Generate patterns
+data_mapping['thing_pattern'] = data_mapping['thing'].str.replace(r'\d', '#', regex=True)
+data_mapping['property_pattern'] = data_mapping['property'].str.replace(r'\d', '#', regex=True)
+data_mapping['pattern'] = data_mapping['thing_pattern'] + " " + data_mapping['property_pattern']
+df_master['master_pattern'] = df_master['thing'] + " " + df_master['property']
+
+# Create a set of unique patterns from master for fast lookup
+master_patterns = set(df_master['master_pattern'])
+
+# Check whether each pattern in data_mapping exists in df_master, and record the result in the "MDM" field
+data_mapping['MDM'] = data_mapping['pattern'].apply(lambda x: x in master_patterns)
+
+# Remove specified fields
+fields_to_remove = ['equip_type_code', 'tx_period', 'tx_type', 'on_change_yn', 'scaling_const', 'description', 'updated_time', 'status_code', 'is_timeout']
+merged_data = data_mapping.drop(columns=fields_to_remove)
+
+# Save the updated DataFrame to a new CSV file
+output_file_path = 'data_import/raw_data.csv'
+merged_data.to_csv(output_file_path, index=False, encoding='utf-8-sig')
+
+print(f"Updated data saved to {output_file_path}")
+
+# Filter the DataFrame where MDM is TRUE
+data_mapping_mdm_true = merged_data[merged_data['MDM']]
+
+# Save the filtered DataFrame to a new CSV file
+mdm_true_output_file_path = 'data_import/data_mapping_mdm.csv'
+data_mapping_mdm_true.to_csv(mdm_true_output_file_path, index=False, encoding='utf-8-sig')
+
+print(f"MDM TRUE data saved to {mdm_true_output_file_path}")
diff --git a/data_import/plot_count.ipynb b/data_import/plot_count.ipynb
new file mode 100644
index 0000000..029be47
--- /dev/null
+++ b/data_import/plot_count.ipynb
@@ -0,0 +1,88 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA28AAAK7CAYAAABlF7dxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABzZ0lEQVR4nO3deXhU9dn/8c9MMtkTlmHHKDEIKFtRxCrQQgUUl4qIj1aRzYXa6k9ksaI+MFiVIrg8jcuDSw0qojXoo0FcYhQ3FEGwKgpoCBGCgAFCEgJhkjm/P2imiZmsM2dOTvJ+XVeu6+Sc73IPc3OSO9+Z7zgMwzAEAAAAAGjWnFYHAAAAAACoH8UbAAAAANgAxRsAAAAA2ADFGwAAAADYAMUbAAAAANgAxRsAAAAA2ADFGwAAAADYAMUbAAAAANhApNUB2J3P59Pu3buVmJgoh8NhdTgAAAAALGIYhoqLi9WtWzc5naFfJ6N4C9Lu3buVnJxsdRgAAAAAmomdO3fqhBNOCPm4FG9BSkxMlCTl5uaqffv2FkeDlsTr9eqdd97RmDFj5HK5rA4HLQz5BbOQWzAT+QWzhCq3ioqKlJyc7K8RQo3iLUiVL5VMTExUUlKSxdGgJfF6vYqLi1NSUhI/oBBy5BfMQm7BTOQXzBLq3DLr7VQUbyEyZUKO5CsI65yZaSvCOl+LMMBjdQQAAABAk7DbJAAAAADYAMUbAAAAANgAxRsAAAAA2ADFGwAAAADYAMUbAAAAANiArXabTE9P19SpU+ttl5WVpVGjRgW8lpOTo/vvv1/vvPOOfvrpJyUmJur000/XDTfcoMsuu6zpsWWkyu12N7l/03jCPB8AAAAAq9iqeKvkdDrVsWPHWq9HR0cHPL969WpdfvnlKi0tlSQlJSXpwIEDeuedd/TOO+9o6tSpevrpp037XAYAAAAAaCpbvmwyOTlZe/bsqfVr+PDhNfrk5ubqv/7rv1RaWqqhQ4dq69atOnTokA4dOqR58+ZJkp555hktXrw43A8HAAAAAOply+KtKebNm6fDhw+rS5cuWrVqlXr16iVJSkhI0IIFC3TDDTdIku69914dPHjQylABAAAAoIZWUbwdPnxYK1eulCTdeOONatu2bY02c+fOlSQVFRXp//7v/8IYHQAAAADUr1UUbx9//LGOHDkiSRo7dmzANj169NCpp54qSXrnnXfCFhsAAAAANIQtNyz5+eefdcYZZ2jr1q2qqKhQ165ddc455+i6667TiBEjarT/5ptv/Mf9+vWrddx+/frpu+++0+bNmxsd05QJOZKvoNH9GiIzbYUp49raAI/VEQAAAABhZcvirbS0VBs3blS7du10+PBh5ebmKjc3V8uXL9fUqVP1xBNPKDLyPw9t9+7dkqR27dopNja21nG7d+9erX0gZWVlKisr839fVFQkSXJFVkiqCOZh1crraxULpI3j9Vodgem8/36M3lbwWBF+5BfMQm7BTOQXzBKq3DI7N21VvHXr1k3z58/X+PHj1bt3b0VHR6uiokLr1q3T/Pnz9e677+qZZ55RfHy80tLS/P2Ki4slSXFxcXWOX3m9sn0gCxcu1IIFC2qcv3zajnrHb6rVeQNNGdfW8lZbHUHYZGVlWR0CWjDyC2Yht2Am8gtmCTa3Kj+SzCwOwzAMU2cIE5/Pp/Hjx+u1116T0+nUli1bdMopp0iSbrjhBj355JPq3r27du3aVesYd955p+677z5FRUVVW12rKtDKW3Jysi4dtVZSm5A+pkovLc4wZVxb6zvX6ghM5/V6lZWVpdGjR8vlclkdDloY8gtmIbdgJvILZglVbhUVFalDhw46dOiQkpKSQhjhcbZaeauL0+nUkiVL9Nprr8nn8ykzM1MzZ86UJCUmJkqqvxKuvF7ZPpDo6OiAHwLuLY+QfBFNDb9OLqfPlHFtrRXdsF0uFz+gYBryC2Yht2Am8gtmCTa3zM7LFlO8SVLPnj3VoUMHFRQUaPv27f7z3bp1kyQdPHhQR44cqfV9b/n5+dXaN0Z6RqrcbncTom4Ij0njAgAAALCLVrETRtUdJqvuPPlLldf69u1rekwAAAAA0BgtqnjLyclRQcHx7fpTUlL854cNG+ZfbXvrrbcC9s3Ly9N3330nSRozZozJkQIAAABA49imeKtvXxXDMDRnzhxJx9//dtFFF/mvxcfH67LLLpMkPf744zp06FCN/osWLZJ0/P1u48aNC1HUAAAAABAatine8vLyNGTIEC1dulTbt2/3F3M+n0+fffaZxo4dq1dffVWSNH36dPXu3bta/7vvvlvx8fH66aefdPHFF+v777+XJB0+fFh33323/vd//1eSdNddd6ldu3ZhfGQAAAAAUD9bbViyfv16rV+/XtLxXR8TExNVXFxcbev+qVOn6u9//3uNvikpKfrnP/+pyy+/XB999JF69eqlNm3aqKSkRBUVFf6+lat3AAAAANCc2KZ469y5s9LS0vTpp5/qyy+/1M8//6yDBw8qJiZGKSkpOuecczRt2jQNHTq01jEuuOACffXVV1q0aJGysrL0008/qV27dho0aJCmT5/uf2llU0yZkCP5Cprcv7Ey01aEbS5YxOeUNFDavFDi4yIQauQXzEJuwUzkF35pgMfqCMLKNsVbbGysbrrpJt10001BjZOamqonnngiRFEBAAAAQHjY5j1vAAAAANCaUbwBAAAAgA1QvAEAAACADVC8AQAAAIAN2GbDkuYuPSNVbrc7jDN6wjgXLOH1Snmrpb5zJZfL6mjQ0pBfMAu5BTORX2jlWHkDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG2C3yRCZMiFH8hVYMndm2gpL5oXJfE5JA6XNCyWnz+poUJcBHqsjAAAArQArbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgAxRvAAAAAGAD7DYZIukZqXK73RbN7rFoXpjK65XyVkt950oul9XRAAAAwGKsvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADVC8AQAAAIANsNtkiEyZkCP5CqwOo5rMtBVWhyAN8FgdAQAAANAisPIGAAAAADZA8QYAAAAANkDxBgAAAAA2QPEGAAAAADZA8QYAAAAANsBukyGSnpEqt9ttdRi/4LE6AAAAAAAhwsobAAAAANgAxRsAAAAA2ADFGwAAAADYAMUbAAAAANgAxRsAAAAA2AC7TYbIlAk5kq8gbPNlpq0I21ymG+CxOgIAAACg2WPlDQAAAABsgOINAAAAAGyA4g0AAAAAbIDiDQAAAABsgOINAAAAAGzA9rtN/u1vf9PcuXP93xuGUaNNenq6pk6dWu9YWVlZGjVqVJPiSM9IldvtblLfpvGEcS4AAAAAVrN18bZ161YtWLCgwe2dTqc6duxY6/Xo6OhQhAUAAAAAIWfb4s3n82natGk6evSozj77bH366af19klOTtaOHTvMDw4AAAAAQsy273lLS0vT2rVrdfXVV2vMmDFWhwMAAAAAprJl8Zabm6s777xTbrdbDz30kN
XhAAAAAIDpbPmyyeuvv16HDx/WY489Vud72AAAAACgpbBd8fbkk08qOztbo0aN0qRJkxrV9+eff9YZZ5yhrVu3qqKiQl27dtU555yj6667TiNGjAgqrikTciRfQVBjNEVm2oqwz4kw8TklDZQ2L5ScPqujabkGeKyOAAAAoEFs9bLJ/Px8zZkzR7GxsVq6dGmj+5eWlmrjxo2KioqSz+dTbm6uli9frpEjR2ratGkqLy83IWoAAAAACJ6tVt6mT5+uQ4cOadGiRTr55JMb3K9bt26aP3++xo8fr969eys6OloVFRVat26d5s+fr3fffVfPPPOM4uPjlZaWVudYZWVlKisr839fVFQkSXJFVkiqaNLjCobXZ6v6G41Q+dzyHJvM67U6Akt4//24va308cM85BbMRH7BLKHKLbNz02EE+lTrZuj555/XNddco1/96ldav369IiP/U3d6PB7/57019uH4fD6NHz9er732mpxOp7Zs2aJTTjml1vZV56rqhRdeUFxcXKPmBgAAANBylJaW6qqrrtKhQ4eUlJQU8vFtUbzt3btXffv2VWFhoT777DMNHjy42vVgijdJ+uGHH/wF2wMPPKCZM2fW2jbQyltycrIuHbVWUptGzx2slxZnhH1OhIfX51TWzv4anfy1XLznzTx951odgSW8Xq+ysrI0evRouVwuq8NBC0JuwUzkF8wSqtwqKipShw4dTCvebPGyydtvv1379+/XjTfeqD59+qikpKTa9WPHjvmPK69FRUUpKiqqQeP37NlTHTp0UEFBgbZv315n2+joaEVHR9c47y2PkHwRDZovlPilvuVzOX08z2Zq5T/8XS4XvwDBFOQWzER+wSzB5pbZeWmL4i03N1eS9Pjjj+vxxx+vs21iYqIk6ZZbbtHDDz9sdmh+6RmpcrvdYZvvPzwWzImw8HqlvNXHV4b4AQUAANDqsROCpJycHBUUHN/mPyUlxeJoAAAAAKAmWxRva9askWEYtX7Nnz/f37byXOWqW33vgTMMQ3PmzJEkOZ1OXXTRRaY9DgAAAABoKlsUb8HIy8vTkCFDtHTpUm3fvt1fzPl8Pn322WcaO3asXn31VUnHP4qgd+/eVoYLAAAAAAHZ4j1vwVq/fr3Wr18v6fiGI4mJiSouLq62a+TUqVP197//3aoQAQAAAKBOLb5469y5s9LS0vTpp5/qyy+/1M8//6yDBw8qJiZGKSkpOuecczRt2jQNHTrU6lABAAAAoFYtonjzeDzyeDwBr8XGxuqmm27STTfdZGoMUybkSL4CU+eoS2baCsvmhkl8TkkDpc0LJT4qAKHWmPwa4AlHRAAAoB4t/j1vAAAAANASULwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA20iN0mm4P0jFS53W4LI/BYODdM4fVKeaulvnMll8vqaNDSkF8AANgOK28AAAAAYAMUbwAAAABgAxRvAAAAAGADFG8AAAAAYANsWBIiUybkSL4CU+fITFth6vhoZnxOSQOlzQslp8/qaNDSkF9oqgEeqyMAgFaLlTcAAAAAsAGKNwAAAACwAYo3AAAAALABijcAAAAAsAGKNwAAAACwAXabDJH0jFS53W6TZ/GYPD6aFa9Xylst9Z0ruVxWR4OWhvwCAMB2WHkDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG2C3yRCZMiFH8hWEbb7MtBVhmwsW8TklDZQ2L5ScPquj+Y8BHqsjAAAAaJVYeQMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAbYLfJEEnPSJXb7Q7jjJ4wzgVLeL1S3mqp71zJ5bI6GgAAAFiMlTcAAAAAsAGKNwAAAACwAYo3AAAAALABijcAAAAAsAGKNwAAAACwAXabDJEpE3IkX4EpY2emrWhYwwEeU+YHAAAAYD1W3gAAAADABijeAAAAAMAGKN4AAAAAwAYo3gAAAADABijeAAAAAMAG2G0yRNIzUuV2u00a3WPSuAAAAADswvYrb3/729/kcDj8X3UpLi6Wx+NR//79lZCQoDZt2ujMM8/UAw88oGPHjoUpYgAAAABoPFuvvG3dulULFixoUNu8vDyNGDFCO3bskCTFxcWprKxMGzZs0IYNG7R8+XJlZ2erXbt2JkYMAAAAAE1j25U3n8+nadOm6ejRozr77LPrbFteXq6LL75YO3bsUNeuXZWVlaXDhw+rtLRUL774ohITE7Vp0yZNnDgxTNEDAAAAQOPYtnhLS0vT2rVrdfXVV2vMmDF1tl22bJm+/vprSdLKlSs1atQoSZLT6dQVV1yhpUuXSpJWr16t7OxscwMHAAAAgCawZfGWm5urO++8U263Ww899FC97ZctWyZJGjlyZMBVuiuvvFIpKSmSpGeffTa0wQIAAABACNjyPW/XX3+9Dh8+rMcee0wdO3ass21paak++eQTSdLYsWMDtnE4HDr//PP1+OOP65133mlSTFMm5Ei+gib1DSQzbUXIxmqQAZ7wzgcAAACgUWy38vbkk08qOztbo0aN0qRJk+pt/91338nn80mS+vXrV2u7ymt79uzRgQMHQhMsAAAAAISIrVbe8vPzNWfOHMXGxvrfp1af3bt3+4+7d+9ea7uq13bv3q327dsHbFdWVqaysjL/90VFRZIkV2SFpIoGxdQQXl+Y62qvN7zzoV7efz8nXp4bmID8glnILZiJ/IJZQpVbZuemrYq36dOn69ChQ1q0aJFOPvnkBvUpLi72H8fFxdXaruq1qn1+aeHChQE/nuDyaTvqHL+xVucNDNlYDZK3OrzzocGysrKsDgEtGPkFs5BbMBP5BbMEm1ulpaUhiiQw2xRvzz//vN544w396le/0syZMy2LY+7cudXmLyoqUnJysl7+Rw9JbUI2z0uLM0I2VoP0nRve+VAvr9errKwsjR49Wi6Xy+pw0MKQXzALuQUzkV8wS6hyq/JVeWaxRfG2d+9ezZgxQxEREXryyScVGdnwsBMTE/3HdVXCVa9V7fNL0dHRio6OrnHeWx4h+SIaHFd9XE5fyMZq2ITcAJsrl8vFDyiYhvyCWcgtmIn8glmCzS2z89IWxdvtt9+u/fv368Ybb1SfPn1UUlJS7fqxY8f8x5XXoqKiFBUVpW7duvmv5efna8CAAQHnyM/P9x9X7dNQ6Rmpcrvdje5XO08IxwIAAABgd7bYbTI3N1eS9PjjjysxMbHG18KFC/1tK8/ddtttkqRTTz1VTufxh/nNN9/UOkfltS5dutS6WQkAAAAAWMUWxVsw4uLiNHToUEnSW2+9FbCNYRh6++23JUljxowJW2wAAAAA0FC2KN7WrFkjwzBq/Zo/f76/beW5hx9+2H9u8uTJkqT3339f69atqzH+yy+/rO3bt0tSgz47DgAAAADCzRbFW7AmT56s/v37yzAMXXbZZcrOzpYk+Xw+vfzyy7r++uslSWPHjtW5555rZagAAAAAEJAtNiwJVmRkpF5//XWNHDlSO3bs0KhRoxQXFyefz6ejR49KkgYNGqTly
5dbHCkAAAAABNYqijdJ6tGjh7766istWbJEr7zyinJzc+VyudS3b1/94Q9/0M0336yoqKgmjz9lQo7kKwhhxPXLTFsR1vkQZj6npIHS5oVSuD86Ai0f+VXTAI/VEQAAUKcW8bJJj8fjf69bXRITE7VgwQJ9/fXXKikpUVFRkTZs2KBZs2YFVbgBAAAAgNlaRPEGAAAAAC0dxRsAAAAA2ADFGwAAAADYAMUbAAAAANhAq9lt0mzpGalyu91hntUT5vkQVl6vlLda6jtXcrmsjgYtDfkFAIDtsPIGAAAAADZA8QYAAAAANkDxBgAAAAA2QPEGAAAAADZA8QYAAAAANsBukyEyZUKO5CsI+biZaStCPmZIDPBYHQEAAADQqrDyBgAAAAA2QPEGAAAAADZA8QYAAAAANkDxBgAAAAA2QPEGAAAAADbAbpMhkp6RKrfbbcLIHhPGBAAAAGA3rLwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADbDbZIhMmZAj+QpMnSMzbUX9jQZ4TI0BAAAAgDVYeQMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG2DDkhBJz0iV2+02eRaPyeMDAAAAaK5YeQMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAbYLfJEJkyIUfyFYRlrsy0FcENMMATkjgAAAAAhA8rbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgAxRvAAAAAGAD7DYZIukZqXK73WGazROmeQAAAAA0F7Zaedu4caMWLFig3//+9+rTp4/cbrdcLpfcbreGDh2qe++9VwcOHAjY1+PxyOFw1Pv1ww8/hPlRAQAAAED9bLXy9o9//EOPPvqo//uYmBjFxsbqwIEDWrt2rdauXauHH35Yr7/+us4+++yAY7hcLrVv377WOSIjbfVPAgAAAKCVsFWlMmTIEPXo0UPDhg1Tnz591LZtW0lSSUmJXnnlFc2ePVs///yzxo0bp23btqlNmzY1xjjnnHO0Zs2a8AYOAAAAAEGyVfE2adKkgOcTEhI0adIkdenSReedd5727dunVatW6eqrrw5zhAAAAABgDlu9560+v/71r/3Hu3btsjASAAAAAAgtW6281eejjz7yH6empoZ17ikTciRfgalzZKatCH6QAZ7gxwAAAAAQdrZfeSsrK9OOHTv0yCOP6JprrpEk9ezZUxdffHHA9ps3b1a/fv0UFxenhIQE9e7dW9dff702bdoUzrABAAAAoFFsu/IWExOjsrKyGueHDh2qF154QdHR0QH7FRQU6MCBA2rbtq2Kioq0bds2bdu2TU8//bTuuOMO3XPPPXXOW1ZWVm3eoqIiSZIrskJSRdMfUAN4fSGotb3e4MdAWHj//Vx5ec5gAvILZiG3YCbyC2YJVW6ZnZsOwzAMU2cwSY8ePXT06FGVlJTo8OHDkqSRI0fq/vvv1+DBg2u0X758uXbv3q1LLrlEKSkpcrlcOnbsmNasWaM77rhDX3zxhSRpyZIlmjVrVq3zejweLViwoMb5F154QXFxcSF6dAAAAADsprS0VFdddZUOHTqkpKSkkI9v2+Ktqn379um5557Tvffeq8LCQt111126++67G9z/6NGj+s1vfqP169crISFBu3btCvgxA1Lglbfk5GRdOmqtpMB9QuWlxRnBD9J3bvBjICy8Xq+ysrI0evRouVwuq8NBC0N+wSzkFsxEfsEsocqtoqIidejQwbTizbYvm6yqU6dOmjVrloYPH66zzz5bf/3rXzVkyBBddNFFDeofExOj++67T6NHj1ZJSYmys7M1fvz4gG2jo6MDviTTWx4h+SKCehz1cTl9IRiEG53duFwufkDBNOQXzEJuwUzkF8wSbG6ZnZctonirNGTIEA0bNkwffvihnnjiiQYXb5J09tln+4+3b9/e6LnTM1Lldrsb3a9xPCaPDwAAAKC5sv1uk7/UvXt3SdIPP/xgcSQAAAAAEDotrnirXDVLTExsVL/PPvvMf5ySkhLSmAAAAAAgWLYp3ioqKlTf3irZ2dn6/PPPJUkjRozwn6+vX1lZme68805JUnx8vM4999zgggUAAACAELNN8bZz504NGjRIS5cu1fbt26sVZDt37tTf/vY3XXLJJTIMQ+3bt9ett97qv/7hhx9q1KhReu6557Rr1y7/ea/Xq+zsbA0fPlzr1q2TJM2bN09t27YN2+MCAAAAgIaw1YYl//rXv/THP/5RkhQVFaWkpCQdOXLE/zlv0vGXPK5cuVJdunTxnzMMQ9nZ2crOzpYkxcbGKj4+XocOHfJ/kJ7T6dTtt9+u2267LYyPCAAAAAAaxjbFW7du3fTyyy9rzZo1WrdunXbv3q2CggJFREToxBNP1MCBA3XJJZfoqquuUmxsbLW+/fv315IlS/Tpp5/q66+/VkFBgQoLCxUXF6fTTjtNw4cP1w033KD+/fs3Ob4pE3IkX0GwD7NOmWkrGtZwgMfUOAAAAACEn22Kt6ioKE2YMEETJkxodF+3261Zs2aZEBUAAAAAhIdt3vMGAAAAAK0ZxRsAAAAA2ADFGwAAAADYAMUbAAAAANiAbTYsae7SM1LldrtNnsVj8vgAAAAAmitW3gAAAADABijeAAAAAMAGKN4AAAAAwAYo3gAAAADABijeAAAAAMAG2G0yRKZMyJF8BWGZKzNtRVjmgcV8TkkDpc0LJafP6mhCZ4DH6ggAAABsiZU3AAAAALABijcAAAAAsAGKNwAAAACwAYo3AAAAALABijcAAAAAsAF2mwyR9IxUud3uMM3mCdM8sJTXK+WtlvrOlVwuq6MBAACAxVh5AwAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABtgt8kQmTIhR/IVmDpHZtoKU8dHM+NzShoobV4oOX1WR4OWxsr8GuAJ73wAALQQrLwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADbDbZIikZ6TK7XabPIvH5PHRrHi9Ut5qqe9cyeWyOhq0NOQXAAC2w8obAAAAANgAxRsAAAAA2ADFGwAAAADYAMUbAAAAANgAxRsAAAAA2AC7TYbIlAk5kq8g7PNmpq0Iz0QDPOGZBwAAAEBArLwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADdhqt8mNGzcqMzNTX3zxhbZt26aff/5ZRUVFSkpKUp8+fXTBBRfoxhtvVPv27WsdY+/evbr//vu1atUq/fjjj4qNjVXfvn01efJkXXvttXI4HE2KLT0jVW63u6kPLQgeC+YEAAAAEG62Kt7+8Y9/6NFHH/V/HxMTo9jYWB04cEBr167V2rVr9fDDD+v111/X2WefXaP/F198ofPOO0/79++XJCUkJKi4uFgff/yxPv74Y2VkZOj1119XVFRU2B4TAAAAADSErV42OWTIEC1evFiffvqpDh48qCNHjqioqEjFxcVatmyZOnbsqIKCAo0bN06HDh2q1vfQoUO66KKL
tH//fvXp00fr169XcXGxDh8+rEceeUQul0tvv/22ZsyYYc2DAwAAAIA62Kp4mzRpkmbPnq1f//rXatu2rf98QkKCJk2apOeff16StG/fPq1atapa3yVLlmjPnj2KjY3V6tWrNXjwYElSVFSU/vznP2vBggWSpCeeeELbtm0LzwMCAAAAgAayVfFWn1//+tf+4127dlW79uyzz0qSrrzySqWkpNToe/PNNyshIUEVFRVavny5uYECAAAAQCPZ6j1v9fnoo4/8x6mpqf7jrVu36scff5QkjR07NmDfhIQEDR8+XG+++abeeecd/0pcQ02ZkCP5CpoQdXAy01aEfU6Eic8paaC0eaHk9FkdDZpqgMfqCAAAQAth+5W3srIy7dixQ4888oiuueYaSVLPnj118cUX+9t88803/uN+/frVOlbltW+//dakaAEAAACgaWy78hYTE6OysrIa54cOHaoXXnhB0dHR/nO7d+/2H3fv3r3WMSuvFRUVqaSkRAkJCTXalJWVVZu3qKhIkuSKrJBU0ejHESyvz/b1N2pR+dzyHNuc12t1BAF5/x2Xt5nGB/sit2Am8gtmCVVumZ2bti3eunTpoqNHj6qkpESHDx+WJI0cOVL333+/TjzxxGpti4uL/cdxcXG1jln1WnFxccDibeHChQFfUnn5tB11jm2W1XkDwz4nwitrZ3+rQ0Aw8lZbHUGdsrKyrA4BLRS5BTORXzBLsLlVWloaokgCs23xtmPHDv/xvn379Nxzz+nee+/VkCFDdNddd+nuu+82Zd65c+dq5syZ/u+LioqUnJysl//RQ1IbU+asy0uLM8I+J8LD63Mqa2d/jU7+Wi7e82ZffedaHUFAXq9XWVlZGj16tFwul9XhoAUht2Am8gtmCVVuVb4qzyy2Ld6q6tSpk2bNmqXhw4fr7LPP1l//+lcNGTJEF110kSQpMTHR37a0tFRJSUkBx6laKVftU1V0dHS1l2RW8pZHSL6IYB5Gk/BLfcvncvp4nu2smf9y4XK5+AUIpiC3YCbyC2YJNrfMzssWUbxVGjJkiIYNG6YPP/xQTzzxhL9469atm79Nfn5+rcVbfn6+JCkpKSngSybrkp6RKrfb3cTIg+GxYE6Ehdd7/CV3fec2+wIAAAAA5mtxOyFUbjryww8/+M9V3WGy6s6Tv1R57bTTTjMpOgAAAABomhZXvG3fvl1S9Zc99urVy7+JyVtvvRWw3+HDh/2fEzdmzBiTowQAAACAxrFN8VZRUSHDMOpsk52drc8//1ySNGLECP95h8OhSZMmSZJefPHFapudVHr00UdVUlKiiIgIXX311SGLGwAAAABCwTbF286dOzVo0CAtXbpU27dvr1bI7dy5U3/72990ySWXyDAMtW/fXrfeemu1/rNnz1aXLl1UWlqqCy+8UF988YUk6dixY3r88cf13//935KkG264Qb169QrfAwMAAACABrDVhiX/+te/9Mc//lGSFBUVpaSkJB05csT/OW+SlJKSopUrV6pLly7V+rZp00arVq3Seeedp2+//VaDBw9WYmKijh496v8wvTFjxuihhx4K3wMCAAAAgAayTfHWrVs3vfzyy1qzZo3WrVun3bt3q6CgQBERETrxxBM1cOBAXXLJJbrqqqsUGxsbcIwzzjhDmzdv1qJFi7Rq1Srt3LlT8fHx6tevnyZPnqxp06bJ6WzaYuSUCTmSryCYhxhQZtqKkI8Jm/A5JQ2UNi+U6vuogAGecEQEAAAAC9mmeIuKitKECRM0YcKEoMbp3LmzHnzwQT344IMhigwAAAAAzGeb97wBAAAAQGtG8QYAAAAANkDxBgAAAAA2QPEGAAAAADZgmw1Lmrv0jFS53W4TRvaYMCZsweuV8lZLfedKLpfV0QAAAMBirLwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADbDbZIhMmZAj+QpCPm5m2oqQj1mnAZ7wzgcAAACgQVh5AwAAAAAboHgDAAAAABsw/WWTX3/9td599105nU6dd9556tOnj9lTAgAAAECLE/TK23vvvaff/e53uuOOO2pce/DBBzVo0CDNnj1bM2fOVP/+/ZWWlhbslAAAAADQ6gRdvL388sv64IMP1KNHj2rnt23bpr/85S/y+XyKiopSbGysKioqdOutt2rTpk3BTgsAAAAArUrQL5tcu3atJGns2LHVzj/11FOqqKjQb3/7W61atUpRUVG6+uqrlZGRoccee0xPPvlksFM3K+kZqXK73SaM7DFhTAAAAAB2E/TK2759+xQREaETTjih2vm33npLDodD8+bNU3x8vFwulxYuXChJ+vDDD4OdFgAAAABalaCLtwMHDigpKUkOh8N/rri4WJs3b1Z8fLx++9vf+s+npqYqJiZGu3btCnZaAAAAAGhVgi7eYmJidOjQIRmG4T+3du1aGYahs846S05n9SliY2ODnRIAAAAAWp2gi7eePXvK5/Ppgw8+8J975ZVX5HA4NGzYsGptjx07pkOHDqlz587BTgsAAAAArUrQG5ZceOGF2rRpk6699lrdd999+umnn5Seni5JGj9+fLW2mzZtks/n04knnhjstAAAAADQqgRdvM2cOVPLli1Tbm6urrrqKkmSYRi64oor1L9//2ptX3vttYArci3BlAk5kq8grHNmpq0I63ymG+CxOgIAAACg2Qq6eGvbtq3Wrl2r+fPn69NPP1Xbtm110UUXac6cOdXaHTt2TP/4xz9kGIZGjhwZ7LQAAAAA0KoEXbxJUvfu3fXUU0/V2SYqKkp79uwJxXQAAAAA0OoEvWEJAAAAAMB8IVl5+6W8vDzt27dPktSpUyeddNJJZkwDAAAAAK1GyFbedu/erZtvvlmdOnXSySefrF//+tf69a9/rZNPPlkdO3bUzTffzIdzAwAAAEAThWTl7Z133tEVV1yhoqKiah/WXWn//v167LHH9Nxzz+nFF1/U+eefH4ppm5X0jFS53e4wz+oJ83wAAAAArBJ08bZ161aNGzdOR48eVfv27fXHP/5Rv/vd79S9e3dJUn5+vt5//30tXbpUBQUFGj9+vDZt2qTevXsHHTwAAAAAtBZBF29//etfdfToUQ0YMEBZWVnq2LFjteu9e/fW7373O91yyy0aNWqUvv76a91zzz167rnngp0aAAAAAFqNoN/zlp2dLYfDoaeeeqpG4VZVhw4d9OSTT8owDL377rvBTgsAAAAArUrQxVthYaESEhI0ePDgetueeeaZSkhIUGFhYbDTAgAAAECrEnTx1rVrV1VUVDS4vc/nU9euXYOdFgAAAABalaDf83bBBRfo8ccf13vvvaff/e53dbbNzs5WaWmpLrroomCnbXamTMiRfAVhnTMzbUVY5zPdAI/VEQAAAADNVtArb//93/+tTp066dprr9W2bdtqbff999/r+uuvV9euXXXXXXcFOy0AAAAAtCoh+aiAhQsX6tZbb9XAgQN1+eWXB/yogJdfflkxMTF66KGHtGXLFm3ZsqXGWL/5zW+CDQcAAAAAWqSgi7cRI0bI4XD4v1+
+fLmWL18esG1ZWZmmTZsW8JrD4VB5eXmw4QAAAABAixR08SZJhmE0izEAAAAAoKUKunjz+XyhiAMAAAAAUIeQrLyFy/79+/X6668rOztbGzduVF5ensrLy9WxY0cNHjxYkydP1qWXXhqwb3p6uqZOnVrvHFlZWRo1alSjY0vPSJXb7W50v+B4wjwfAAAAAKvYqnjr0qVLtffFxcTEyOVyKT8/X/n5+Xrttdc0duxYZWRkKC4uLuAYTqdTHTt2rHWO6OjokMcNAAAAAMEK+qMCwqm8vFxDhgzRY489ppycHB05ckQlJSXKzc3VtddeK0l68803NX369FrHSE5O1p49e2r9Gj58eLgeDgAAAAA0WKNW3p599llJUps2bXTJJZdUO9dYkyZNanSf9957TyNHjqxxvkePHnrqqacUGRmppUuX6vnnn9d9992n5OTkJsUGAAAAAM1No4q3KVOmyOFwqHfv3v7irfJcYzgcjiYVb4EKt6quvfZaLV26VJK0YcMGijcAAAAALUajircTTzxRDodD3bp1q3GuOYiJifEfV1RUWBgJAAAAAIRWo4q3HTt2NOicVdasWeM/7t+/f8A2P//8s8444wxt3bpVFRUV6tq1q8455xxdd911GjFiRJPnnjIhR/IVNLm/2TLTVlgdQngN8FgdAQAAABBSttqwpC6FhYVauHChJGn48OHq3bt3wHalpaXauHGjoqKi5PP5lJubq+XLl2vkyJGaNm1atd0sAQAAAKC5sNVHBdTG5/Ppmmuu0U8//aSYmBg98sgjNdp069ZN8+fP1/jx49W7d29FR0eroqJC69at0/z58/Xuu+/qmWeeUXx8vNLS0mqdq6ysTGVlZf7vi4qKJEmuyApJzfelml5fi6nTG8brtTqCoHn//Ri8LeCxoPkhv2AWcgtmIr9gllDlltm56TAMwzB1hjC4+eab/QXb008/rWnTpjWqv8/n0/jx4/Xaa6/J6XRqy5YtOuWUUwK29Xg8WrBgQY3zL7zwQq2fLQcAAACg5SstLdVVV12lQ4cOKSkpKeTjh6x4Ky4u1qpVq/TVV1/pwIEDdVadDodDTz/9dCim1ezZs/XAAw9Ikh566CHNmDGjSeP88MMP/oLtgQce0MyZMwO2C7TylpycrEtHrZXUpklzh8NLizOsDiG8+s61OoKgeb1eZWVlafTo0XK5XFaHgxaG/IJZyC2YifyCWUKVW0VFRerQoYNpxVtIXjaZnp6uW265RSUlJf5zgWpCh8MhwzBCVrzddttt/sJtyZIlTS7cJKlnz57q0KGDCgoKtH379lrbRUdHKzo6usZ5b3mE5Ito8vxmczl9VocQXi3ohu5yufgBBdOQXzALuQUzkV8wS7C5ZXZeBl28vf3227r22mtlGIZiYmJ09tlnq1u3boqMNPftdHPmzNGSJUskSffff79mzZpl6nz1Sc9IldvttjSGunmsDgAAAABAEIKusO6//34ZhqGzzz5br732mjp06BCKuOpU9aWS999/v+bMmRP0mDk5OSooOL7Vf0pKStDjAQAAAEAoBV28ffHFF3I4HEpPTw974bZkyZIGrbhVvlSzruuVBaDT6dRFF10UmmABAAAAIESC3j++vLxcCQkJte7OGEpV3+P24IMPNvilknl5eRoyZIiWLl2q7du3+9+P5/P59Nlnn2ns2LF69dVXJUnTp0+v9TPiAAAAAMAqQa+8paamauvWraqoqFBEhHkbdvz4449avHixpOOrY4sWLdKiRYtqbT979mzNnj3b//369eu1fv16Scc3HUlMTFRxcXG1nSOnTp2qv//97yY9AgAAAABouqCLt4kTJ+r222/Xm2++aerLDX0+X7XjvXv31tm+6s6XnTt3Vlpamj799FN9+eWX+vnnn3Xw4EHFxMQoJSVF55xzjqZNm6ahQ4eaFj8AAAAABCPo4m3GjBlauXKl/vSnP6l3796mvXyyR48eAT9+oCFiY2N100036aabbgpxVP8xZUKO5CswbfxAMtNWhHU+hJnPKWmgtHmh1No+6gGhN8BjdQQAACBIjSrenn322YDnr7nmGs2bN08DBw7UhAkTdNZZZykxMbHOsSZNmtSYqQEAAACgVWtU8TZlypR6d21cvny5li9fXuc4DoeD4g0AAAAAGqFRxduJJ55YZ/EGAAAAADBHo4q3HTt2mBQGAAAAAKAuQW9YguPSM1LldrvDPKsnzPMhrLxeKW+11Heu5HJZHQ0AAAAsZlrxduzYMb311lvaunWroqOjdfrpp2vYsGFmTQcAAAAALVqji7fi4mK9+uqrkqQrrrhC0dHRNdps2LBBl112mXbt2lXt/FlnnaVXXnlFXbp0aWK4AAAAANA6ORvbITs7W1OmTNHDDz8csHDbt2+fLrjgAu3atUuGYVT7WrdunX7/+9+HJHAAAAAAaE0aXbx99NFHkqSrrroq4PVFixapoOD4h1VPnjxZn3zyif71r3/p1ltvlWEY+uKLL5SRkRFEyAAAAADQ+jT6ZZOff/65HA6Hzj///IDXly9fLofDoYsvvljPPPOM//wDDzygAwcOaNmyZVq5cqUmTJjQ9KgBAAAAoJVpdPH2008/KTIyUqeddlqNa5s3b9a+ffvkcDj0//7f/6tx/ZZbbtGyZcu0adOmpkXbjE2ZkCP5CsI6Z2bairDOhzDzOSUNlDYvlJy+xvcf4Al1RAAAALBQo182uXfvXiUlJcnprNn1888/lyRFRUUF3FmyX79+cjgc2r17dxNCBQAAAIDWq9HFW0VFhYqKigJe++KLLyRJp556qqKiompcj4yMVLt27XTkyJHGTgsAAAAArVqji7dOnTqpvLxcOTk5Na59+umncjgcOvPMM2vtX1JSovj4+MZOCwAAAACtWqOLt9NPP12S9MQTT1Q7//333+vLL7+UJP32t78N2DcvL0/Hjh3TCSec0NhpAQAAAKBVa3Tx9oc//EGGYeihhx7S4sWLtXXrVmVnZ+vyyy+XYRiKj4/XxRdfHLDvhx9+KOn4e98AAAAAAA3X6N0mL7/8cj366KP68MMPdfvtt+v222/3X3M4HJo5c6YSExMD9n3ppZfkcDgCbmZid+kZqXK73WGe1RPm+RBWXq+Ut1rqO1dyuayOBgAAABZr9MqbJL322mu66KKLZBiG/0uSrrvuOs2bNy9gn++//15vvfWWJOmCCy5oYrgAAAAA0Do1euVNktq0aaPXX39dP/zwg/99bmeeeaZOOumkWvu4XC699tprcrlcOvnkk5sULAAAAAC0Vk0q3ir17NlTPXv2bFDbHj16qEePHsFMBwAAAACtVpNeNgkAAAAACC+KNwAAAACwgaBeNon/mDIhR/IVhH3ezLQVYZ8TYeJzShoobV4oOX2N7z/AE+qIAAAAYCFW3gAAAADABijeAAAAAMAGKN4AAAAAwAYo3gAAAADABijeAAAAAMAG2G0yRNIzUuV2uy2Y2WPBnAgLr1fKWy31nSu5XFZHAwAAAIux8gYAAAAANkDxBgAAAAA2QPEGAAAAADZA8QYAAAAANkDxBgAAAAA2wG6TIT
JlQo7kK7A0hsy0FTVPDvCEPQ4AAAAAocfKGwAAAADYAMUbAAAAANgAxRsAAAAA2ADFGwAAAADYAMUbAAAAANgAu02GSHpGqtxut8VReCyeHwAAAIBZbLPytn//fj3zzDOaOHGiTjvtNMXHxys6OlonnHCCxo0bp1dffbXeMYqLi+XxeNS/f38lJCSoTZs2OvPMM/XAAw/o2LFjYXgUAAAAANA0tll569Kli8rLy/3fx8TEyOVyKT8/X/n5+Xrttdc0duxYZWRkKC4urkb/vLw8jRgxQjt27JAkxcXFqaysTBs2bNCGDRu0fPlyZWdnq127duF6SAAAAADQYLZZeSsvL9eQIUP02GOPKScnR0eOHFFJSYlyc3N17bXXSpLefPNNTZ8+PWDfiy++WDt27FDXrl2VlZWlw4cPq7S0VC+++KISExO1adMmTZw4MdwPCwAAAAAaxDbF23vvvad169bpxhtv1Mknn+w/36NHDz311FP+ou3555/Xzp07q/VdtmyZvv76a0nSypUrNWrUKEmS0+nUFVdcoaVLl0qSVq9erezs7HA8HAAAAABoFNsUbyNHjqzzeuXqmyRt2LCh2rVly5b5xzj77LNr9L3yyiuVkpIiSXr22WeDDRUAAAAAQs4273mrT0xMjP+4oqLCf1xaWqpPPvlEkjR27NiAfR0Oh84//3w9/vjjeuedd5o0/5QJOZKvoEl9g5GZtiLsc4bVAI/VEQAAAADNgm1W3uqzZs0a/3H//v39x9999518Pp8kqV+/frX2r7y2Z88eHThwwJwgAQAAAKCJWsTKW2FhoRYuXChJGj58uHr37u2/tnv3bv9x9+7dax2j6rXdu3erffv2AduVlZWprKzM/31RUZEkyRVZIakiYB8zeX0tpv4OzOu1OgLLeP/92L2t+N8A5iG/YBZyC2Yiv2CWUOWW2blp++LN5/Ppmmuu0U8//aSYmBg98sgj1a4XFxf7jwN9hECga1X7/NLChQu1YMGCGucvn7ajzvHNsjpvYNjnDKu81VZHYLmsrCyrQ0ALRn7BLOQWzER+wSzB5lZpaWmIIgnM9sXbLbfcolWrVkmSHn30UQ0YMMDU+ebOnauZM2f6vy8qKlJycrJe/kcPSW1MnTuQlxZnhH3OsOo71+oILOP1epWVlaXRo0fL5XJZHQ5aGPILZiG3YCbyC2YJVW5VvirPLLYu3mbPnu1faXvooYc0bdq0Gm0SExP9x3VVwlWvVe3zS9HR0YqOjq5x3lseIfkiGhR3KLmcvrDPGVbcmOVyufgBBdOQXzALuQUzkV8wS7C5ZXZe2rZ4u+222/TAAw9IkpYsWaIZM2YEbNetWzf/cX5+fq0rc/n5+QH7NFR6Rqrcbnej+wXPY8GcAAAAAMLNlrtdzJkzR4sXL5Yk3X///Zo1a1atbU899VQ5nccf5jfffFNru8prXbp0qXWzEgAAAACwiu2Kt9mzZ2vJkiWSjhduc+bMqbN9XFychg4dKkl66623ArYxDENvv/22JGnMmDEhjBYAAAAAQsNWxdvs2bOrvVSyvsKt0uTJkyVJ77//vtatW1fj+ssvv6zt27dLkiZNmhSiaAEAAAAgdGxTvFV9j9uDDz5Y50slf2ny5Mnq37+/DMPQZZddpuzsbEnHP2bg5Zdf1vXXXy9JGjt2rM4999zQBw8AAAAAQbJF8fbjjz/63+PmdDq1aNEidenSpdavypdVVoqMjNTrr7+uHj16KD8/X6NGjVJ8fLzi4+P1X//1XyoqKtKgQYO0fPlyKx4eAAAAANTLFrtN+ny+asd79+6ts31JSUmNcz169NBXX32lJUuW6JVXXlFubq5cLpf69u2rP/zhD7r55psVFRXV5BinTMiRfAVN7v9LmWkrQjaWZQZ4rI4AAAAAaDFsUbz16NFDhmEEPU5iYqIWLFigBQsWhCAqAAAAAAgfW7xsEgAAAABaO4o3AAAAALABijcAAAAAsAGKNwAAAACwAVtsWGIH6RmpcrvdIRzRE8KxAAAAANgdK28AAAAAYAMUbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgA+w2GSJTJuRIvgLT58lMW2H6HGgmfE5JA6XNCyWnz+po0NKQXzBL1dz61TyrowGAFoWVNwAAAACwAYo3AAAAALABijcAAAAAsAGKNwAAAACwAYo3AAAAALABdpsMkfSMVLnd7jDM5AnDHGgWvF4pb7XUd67kclkdDVoa8gtmqZpbAICQYuUNAAAAAGyA4g0AAAAAbIDiDQAAAABsgOINAAAAAGyADUtCZMqEHMlXEJa5MtNWhGUeWMznlDRQ2rxQcvqsjgYtTWvMrwEeqyMAACAorLwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADbDbZIikZ6TK7XaHaTZPmOaBpbxeKW+11Heu5HJZHQ1aGvILAADbYeUNAAAAAGyA4g0AAAAAbIDiDQAAAABsgOINAAAAAGyA4g0AAAAAbIDdJkNkyoQcyVdgdRjVZKatsDoEBMPnlDRQ2rxQcvqsjgYtDfmFXxrgsToCAEA9WHkDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG2C3yRBJz0iV2+22Ooxf8FgdAILh9Up5q6W+cyWXy+po0NKQXwAA2I6tVt5KS0v15ptv6p577tH48eN10kknyeFwyOFwyOPx1NnX4/H429b19cMPP4TnwQAAAABAI9hq5e3zzz/XBRdcENQYLpdL7du3r/V6ZKSt/kkAAAAAtBK2q1TatWun008/3f916623as+ePQ3uf84552jNmjXmBQgAAAAAJrBV8TZ8+HAdOHCg2rnbb7/domgAAAAAIHxs9Z63iIgIq0MAAAAAAEvYauWtOZsyIUfyFVg2f2baCsvmhkl8TkkDpc0LJafP6mjQ0pBfMEtzz60BHqsjAIAms9XKWyhs3rxZ/fr1U1xcnBISEtS7d29df/312rRpk9WhAQAAAECtWt3KW0FBgQ4cOKC2bduqqKhI27Zt07Zt2/T000/rjjvu0D333FNn/7KyMpWVlfm/LyoqkiS5IiskVZgZep28vlZXh7d4lc8pzy3MQH7BLM0+t7xeqyNAELz/fv68PI8IsVDlltm52WqKt1NOOUX333+/LrnkEqWkpMjlcunYsWNas2aN7rjjDn3xxRe699571a5dO82aNavWcRYuXKgFCxbUOH/5tB2Ki4sz8yHUaXXeQMvmhrmydva3OgS0YOQXzNJscytvtdURIASysrKsDgEtVLC5VVpaGqJIAnMYhmGYOoPJevTooby8PM2fP7/eD+quzdGjR/Wb3/xG69evV0JCgnbt2qU2bdoEbBto5S05OVmXjlorKXCfcHhpcYZlc8McXp9TWTv7a3Ty13I1x/eNwNbIL5il2edW37lWR4AgeL1eZWVlafTo0XK5XFaHgxYkVLlVVFSkDh066NChQ0pKSgphhMe1mpW3usTExOi+++7T6NGjVVJSouzsbI0fPz5g2+joaEVHR9c47y2PkHzW7YbZL
H9AIiRcTh/PL0xDfsEszTa3+IW/RXC5XBRvMEWwuWV2XlK8/dvZZ5/tP96+fXuj+6dnpMrtdocypEbyWDg3TOH1Hn95T9+5/LKB0CO/YBZyCwBM00zfTQwAAAAAqIri7d8+++wz/3FKSoqFkQAAAABATa2ieKtvT5aysjLdeeedkqT4+Hide+654QgLAAAAABrMdsXbwYMHVVBQ4P/y+Y6/Gbq0tLTa+ZKSEn+fDz/8UKNGjdJzzz2nXbt2+c97vV5lZ2dr+PDhWrdunSRp3rx5atu2bVgfEwAAAADUx3YblgwaNEh5eXk1zi9evFiLFy/2fz958mSlp6dLOr7ylp2drezsbElSbGys4uPjdejQIf8H6TmdTt1+++267bbbzH8QAAAAANBItivemqJ///5asmSJPv30U3399dcqKChQYWGh4uLidNppp2n48OG64YYb1L9/0z9QdMqEHMlXEMKo65eZtiKs8yHMfE5JA6XNC6XK7bYHeKyMCAAAABayXfG2Y8eORvdxu92aNWtW6IMBAAAAgDCx3XveAAAAAKA1ongDAAAAABugeAMAAAAAG6B4AwAAAAAbsN2GJc1Vekaq3G53mGf1hHk+hJXXK+WtlvrOlVwuq6MBAACAxVh5AwAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABtgt8kQmTIhR/IVmD5PZtoK0+dAM+FzShoobV4oOX3WxjLAY+38AAAAYOUNAAAAAOyA4g0AAAAAbIDiDQAAAABsgOINAAAAAGyA4g0AAAAAbIDdJkMkPSNVbrc7DDN5wjAHmgWvV8pbLfWdK7lcVkcDAAAAi7HyBgAAAAA2QPEGAAAAADZA8QYAAAAANkDxBgAAAAA2QPEGAAAAADbAbpMhMmVCjuQrCNt8mWkrwjZXnQZ4rI4AAAAAaBVYeQMAAAAAG6B4AwAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAbYLfJEEnPSJXb7Q7jjJ4wzgUAAADAaqy8AQAAAIANULwBAAAAgA1QvAEAAACADVC8AQAAAIANULwBAAAAgA2w22SITJmQI/kKwjpnZtqKsM6HMPM5JQ2UNi+UnL7j5wZ4rIwIAAAAFmLlDQAAAABsgOINAAAAAGyA4g0AAAAAbIDiDQAAAABsgA1LQiQ9I1VutzvMs3rCPB/CyuuV8lZLfedKLpfV0QAAAMBitll5Ky0t1Ztvvql77rlH48eP10knnSSHwyGHwyGPx9OgMfbu3atZs2apd+/eio2NVfv27TV8+HA99dRTMgzD3AcAAAAAAEGwzcrb559/rgsuuKDJ/b/44gudd9552r9/vyQpISFBxcXF+vjjj/Xxxx8rIyNDr7/+uqKiokIVMgAAAACEjG1W3iSpXbt2OvfcczVnzhytWLFCXbp0aVC/Q4cO6aKLLtL+/fvVp08frV+/XsXFxTp8+LAeeeQRuVwuvf3225oxY4a5DwAAAAAAmsg2K2/Dhw/XgQMHqp27/fbbG9R3yZIl2rNnj2JjY7V69WqlpKRIkqKiovTnP/9ZRUVFuuOOO/TEE09oxowZ6tWrV8jjBwAAAIBg2GblLSIiosl9n332WUnSlVde6S/cqrr55puVkJCgiooKLV++vMnzAAAAAIBZbLPy1lRbt27Vjz/+KEkaO3ZswDYJCQkaPny43nzzTb3zzjtasGBBo+eZMiFH8hUEFWswMtNWWDY3TOJzShoobV4oOX1WR1O3AR6rIwAAAGjxbLPy1lTffPON/7hfv361tqu89u2335oeEwAAAAA0Votfedu9e7f/uHv37rW2q7xWVFSkkpISJSQkBGxXVlamsrIy//dFRUWSJFdkhaSKEETcNF5fi6/DW53K59QWz63Xa3UEaCTvv58zL88dQozcgpnIL5glVLlldm62+OKtuLjYfxwXF1dru6rXiouLay3eFi5cGPBllZdP21Hn+GZbnTfQsrlhrqyd/a0OoX55q62OAE2UlZVldQhoocgtmIn8glmCza3S0tIQRRJYiy/eQm3u3LmaOXOm//uioiIlJyfr5X/0kNTGsrheWpxh2dwwh9fnVNbO/hqd/LVczf09b33nWh0BGsnr9SorK0ujR4+Wy+WyOhy0IOQWzER+wSyhyq3KV+WZpcUXb4mJif7j0tJSJSUlBWxXtUqu2ueXoqOjFR0dXeO8tzxC8jV9R8xgNftf7tFkLqev+T+//AC1LZfLxS9AMAW5BTORXzBLsLlldl62+OKtW7du/uP8/Pxai7f8/HxJUlJSUq0vmaxLekaq3G5304IMCY+Fc8MUXu/xlyP2nUtxBAAAgJa/22TVHSar7jz5S5XXTjvtNNNjAgAAAIDGavHFW69evXTiiSdKkt56662AbQ4fPqyPPvpIkjRmzJiwxQYAAAAADdXiizeHw6FJkyZJkl588UXt2LGjRptHH31UJSUlioiI0NVXXx3mCAEAAACgfrYq3g4ePKiCggL/l893fBOH0tLSaudLSkqq9Zs9e7a6dOmi0tJSXXjhhfriiy8kSceOHdPjjz+u//7v/5Yk3XDDDerVq1d4HxQAAAAANICtirdBgwapY8eO/q+dO3dKkhYvXlzt/E033VStX5s2bbRq1Sq53W59++23Gjx4sH9jkj/96U86duyYxowZo4ceesiKhwUAAAAA9Wrxu01WOuOMM7R582YtWrRIq1at0s6dOxUfH69+/fpp8uTJmjZtmpzOpteyUybkSL6CEEbcMJlpK8I+p+0N8FgdAQAAANBotireAr1frTE6d+6sBx98UA8++GBoAgIAAACAMLHVyyYBAAAAoLWieAMAAAAAG6B4AwAAAAAboHgDAAAAABuw1YYlzVl6RqrcbrcFM3ssmBMAAABAuLHyBgAAAAA2QPEGAAAAADZA8QYAAAAANkDxBgAAAAA2QPEGAAAAADbAbpMhMmVCjuQrMG38zLQVpo0dcgM8VkcAAAAAtDisvAEAAACADVC8AQAAAIANULwBAAAAgA1QvAEAAACADVC8AQAAAIANsNtkiKRnpMrtdps4g8fEsQEAAAA0d6y8AQAAAIANULwBAAAAgA1QvAEAAACADfCeNwAAAKCRDMOQ1+uVz+ezOhSEgNfrVWRkpI4ePSrDMORyueRwOKwOqwaKNwAAAKCBKioqVFBQoOLiYnm9XqvDQYgYhqEuXbpo586dcjgccrlcSkxMVIcOHRQREWF1eH4UbyEyZUKO5CswbfzMtBWhHXCAJ7TjAQAAtHAVFRXauXOnysrK1KZNGyUkJCgiIqJZrtCgcXw+n0pKShQfHy/DMFRSUqLCwkIdOXJEycnJzaaAo3gDAAAAGqCgoEBlZWU68cQTFRsba3U4CCGfz6djx44pNjZWTqdTCQkJatOmjX788UcVFBSoc+fOVocoiQ1LAAAAgHoZhqHi4mK1adOGwq2ViI2NVVJSkoqLi2UYhtXhSKJ4AwAAAOrl9Xrl9XqVkJBgdSgIo8TERP9z3xxQvAEAAAD1qNxVsrm89wnhUfl8N5ddRSneAAAAgAZic5LWpbk9
32xYEiLpGalyu90mzuAxcWwAAAAAzR0rbwAAAABgAxRvAAAAAGADFG8AAAAAYAO85w0AAAAIpa88VkcQGgM8pg5vGIYyMjL0wgsvaOPGjdq3b58iIiLUuXNnde3aVUOGDNHw4cN17rnnKikpyd9vypQpWrZsWbWxqn6w9sknn6xBgwbpvPPO05gxY+R0tpz1Koo3AAAAAGFVWFiocePG6YMPPvCfi4yMVFxcnH788Udt375dn3zyiR566CE988wzmjJlSo0xnE6nOnbs6P/+8OHD2rlzp3bu3KkPPvhADz/8sJKTk/XQQw/psssuC8fDMh3FW4hMmZAj+QpMGz8zbUVoBzT5LykAAABAbSZNmqQPPvhAERERmjFjhqZPn67U1FQ5nU6Vl5fr22+/1VtvvaUXXnih1jGSk5O1Y8eOaueOHTumr776Sm+88YYef/xx7dy5UxMmTNDcuXN13333mfyozNdy1hABAAAANHvff/+9MjMzJUn33HOPlixZolNOOcX/8sbIyEgNGDBAt912m7788ktdccUVDR47KipKgwcP1vz587V582aNHDlSkrRw4cI6C0G7oHgDAAAAEDZffvml//iSSy6pt31sbGyT5nG73XrllVfUvXt3SdJdd90lr9fbpLGaC4o3AAAAAJbYtWuXqeO3bdtWM2bMkCTl5ubqo48+MnU+s1G8AQAAAAibM888Uw6HQ5I0a9Ysbdu2zdT5LrzwQv9x1Q1S7IjiDQAAAEDY9OjRQ9ddd50k6euvv1afPn10+umn689//rP+8Y9/6JtvvpFhGCGbr0+fPoqKipIk5eTkhGxcK7Sq3SbT09M1derUettlZWVp1KhRjRs7I1Vut7upoTWAx8SxAQAAgPB57LHH1KVLFz344IM6fPiwNm3apE2bNvmvd+rUSVdffbX+8pe/qHPnzkHN5XA41K5dO+3du1cHDhwINnRLtcqVN6fTqc6dO9f6FR0dbXWIAAAAQIsVGRmpu+++W/n5+Xruued03XXXaeDAgf4Vsn379umhhx5Sv3799Pnnn1scbfPRqlbeKgX6TAgAAAAA4dWmTRtNnDhREydOlCQdPXpUH3/8sf7+978rMzNTBQUFuuyyy/T9998rJiamSXMYhqHCwkJJMvmVcuZrlStvAAAAAJqfmJgYjRo1Sq+//romT54s6fiOlG+99VaTx9yyZYvKysokSampqSGJ0yoUbwAAAACanRtuuMF/vHXr1iaP88Ybb/iPR4wYEUxIlqN4AwAAANDsJCQk+I+buidFYWGh/ud//kfS8VW3YcOGhSQ2q7TK97z9/PPPOuOMM7R161ZVVFSoa9euOuecc3Tdddc1uRqfMiFH8hWENtBfyExbYer4TTbAY3UEAAAAsInc3Fx5vV716tWrznbLli3zH59++umNnufAgQO6/PLL/R8Efu+99yoy0t7lT6tceSstLdXGjRsVFRUln8+n3NxcLV++XCNHjtS0adNUXl5udYgAAABAi7R582adeuqpuvDCC/Xss89W20jQ6/Vq06ZNmjp1qh588EFJ0pAhQxq8Yub1evXFF1/o7rvv1mmnnab33ntPknTXXXfpiiuuCPljCTd7l56N1K1bN82fP1/jx49X7969FR0drYqKCq1bt07z58/Xu+++q2eeeUbx8fFKS0sLOEZZWZn/DY+SVFRUJElyRVZIqjA1fq+vmdbaXq/VEbRI3n//u3r594UJyC+YhdyCmazML6/XK8Mw5PP55PP56mzrCOEHTFvJqOdxNlVERIR8Pp9Wr16t1atXS5KioqKUkJCggwcPVvuA7tNPP10rV66UJP+/e+X1nTt3qkuXLv62R44cUXFxcbX+J554oh566CGNGzeuzuetsk/lc1zJ5/PJMAx5vV5FRETU+9jMzk2HEcqPL7cxn8+n8ePH67XXXpPT6dSWLVt0yimn1Gjn8Xi0YMGCGudfeOEFxcXFhSNUAAAAhFlkZKS6dOmi5ORk/2eR1SZm29/CFJW5jva63bSxt2/frqysLH322Wf67rvvtHv3bh0+fFixsbHq0qWLBgwYoIsuukjjxo2T01l9AeNPf/qTVqyo/nYih8OhhIQEJSYmqkePHho4cKDOPfdcjRw5skb/xjh27Jh27typPXv2NOjVeaWlpbrqqqt06NAhJSUlNXne2lC8VfHDDz/4C7YHHnhAM2fOrNEm0MpbcnKyLh21VlIbU+N7aXGGqeM3Wd+5VkfQInm9XmVlZWn06NFyuVxWh4MWhvyCWcgtmMnK/Dp69Kh27typHj16NPnzxtB8GYah4uJiJSYmyuFw+M8fPXpUO3bsUHJycoOe96KiInXo0MG04q1VvWyyPj179lSHDh1UUFCg7du3B2wTHR0dcLcbb3mE5Kt/KTUYLqc5S9dB44ezqVwuF78AwTTkF8xCbsFMVuRXRUWFHA6HnE5nUCs5aJ4qXypZ+RxXcjqdcjgcDc45s/OS4i1E0jNSw/CJ7R6TxwcAAADQXPFngypycnJUUHB8u/+UlBSLowEAAACA/2g1xVt9b+0zDENz5syRdHx59KKLLgpHWAAAAADQIK2meMvLy9OQIUO0dOlSbd++3V/M+Xw+ffbZZxo7dqxeffVVSdL06dPVu3dvK8MFAAAAgGpa1Xve1q9fr/Xr10s6vvFIYmKiiouLq+0eOXXqVP3973+3KkQAAAAACKjVFG+dO3dWWlqaPv30U3355Zf6+eefdfDgQcXExCglJUXnnHOOpk2bpqFDhzZp/CkTciRfQYijbrrMtBX1NwqFAZ7wzAMAAAC0cq2meIuNjdVNN92km266yepQAAAAAKDRWs173gAAAADAzijeAAAAAMAGKN4AAAAAwAYo3gAAAADABlrNhiVmS89IldvttjqMKjxWBwAAAAAghFh5AwAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAEFYej0cOh6PGV0xMjE444QT9/ve/1z//+U8ZhlGtX3p6esB+CQkJ6tatmwYPHqzrrrtOy5Yt0+HDhy16dOZht8kQmTIhR/IVWB1GSGSmrbA6BEiSzylpoLR5oeT0WR0NWhryC2Yht2CmuvJrgMeKiAK6+GKrIwiNzMzwzNO5c2f/8aFDh5Sfn6/8/HxlZmYqPT1dr776qqKjo2v069ChgyIiIiRJx44d0969e/XTTz/piy++0NNPP62bbrpJc+bM0R133KHIyJZR9rDyBgAAAMAye/bs8X8dPnxY33zzjUaPHi1JevPNN3XXXXcF7Ld+/Xp/vwMHDqi8vFxbtmzR//7v/6p///4qKSnR/PnzNXr0aJWVlYXzIZmG4g0AAABAs+B0OtW3b1+9/vrr6tmzpyRp6dKlKi8vr7evw+FQ7969NX36dG3atEkzZ86UJK1Zs0b/7//9P1PjDheKNwAAAADNSkxMjC6//HJJUnFxsbZs2dKo/hEREXrggQd04YUXSpKefvppbdu2LeRxhhvFGwAAAIBm54QTTvAfFxUVNWkMj8cjSaqoqNDzzz8firAsRfEGAAAAoNnZsWOH/7h9+/ZNGmP
w4MHq1KmTJOmDDz4IRViWahnbrjQD6RmpcrvdVocRIh6rA4Akeb1S3mqp71zJ5bI6GrQ05BfMQm7BTORXq1FUVKTly5dLOl649erVq8ljDRw4UFlZWcrJyQlVeJZh5Q0AAABAs1BYWKjs7Gz97ne/0+7duyVJt9xyi5zOppctlat2Bw4cCEmMVmLlDQAAAIBlHA5HrdcmTpyoO++8M4zRNG8UbwAAAAAsU/VDuqOjo9WhQwcNGjRIV199tUaOHBn0+JUrbi3hLU4UbwAAAAAss2fPHlPH/+qrryRJqampps4TDrznDQAAAECLtGHDBu3du1eSNGLECGuDCQFW3kJkyoQcyVcQ1jkz01aEdT6Emc8paaC0eaHk9B0/N8BjZUQAAAC2smDBAknHP7R74sSJFkcTPFbeAAAAALQoFRUVmjVrllatWiVJuv7669WzZ0+LowoeK28AAAAAbM8wDP3www96//339eijj/rf63buuefqf/7nfyyOLjQo3gAAAADYzplnnqmIiAhJktfr1aFDh1RRUeG/npiYqL/85S/6y1/+osjIllH2tIxHAQAAAKBVKSj4z34TcXFx6tixo7p3765f/epX+s1vfqPLLrtM8fHxFkYYehRvAAAAQAhlZlodQfPn8Xjk8Xga3W/KlCmaMmVKyOOxC4q3EEnPSLXgg/88YZ4PYeX1Snmrpb5zJZfL6mgAAABgMXabBAAAAAAboHgDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABtgt8kQmTIhR/IV1N/QBJlpKyyZFybzOSUNlDYvlJw+q6MJ3gCP1REAAADYGitvAAAAAGADFG8AAAAAYAMUbwAAAEADGYZhdQgIo+b2fFO8AQAAAPVwOo//2lxRUWFxJAinyue78vm3WvOIAgAAAGjGXC6XXC6XSkpKrA4FYVRcXOx/7psDdpsMkfSMVLndbotm91g0L0zl9Up5q6W+c6VmcsMAAKC1cjgcSkxMVGFhodq0aaPY2FirQ4LJjhw5oqKiIrVt21YOh8PqcCS1wuKtuLhYDzzwgFauXKnc3FxFRESoV69euvLKK3XzzTcrKirK6hABAADQDHXo0EFHjhzRjz/+qKSkJCUmJioiIqLZ/GKPpvP5fDp27JiOHDkiwzBUXFysoqIiRUdHq0OHDlaH59eqire8vDyNGDFCO3bskCTFxcWprKxMGzZs0IYNG7R8+XJlZ2erXbt21gYKAACAZiciIkLJyckqKChQcXGxCgsLrQ4JIWIYho4cOaLY2Fg5HA65XC61bdtWHTp0UEREhNXh+bWa4q28vFwXX3yxduzYoa5du+rZZ5/VqFGj5PP59PLLL+v666/Xpk2bNHHiRL3xxhtWhwsAAIBmKCIiQp07d1anTp3k9Xrl8/msDgkh4PV69eGHH+q3v/2toqKi5HK5muWKaqsp3pYtW6avv/5akrRy5UqdffbZko7vHHPFFVfI5/Ppqquu0urVq5Wdna1zzz3XynABAADQjDkcDt5u04JERESovLxc0dHRzWZzkkBazW6Ty5YtkySNHDnSX7hVdeWVVyolJUWS9Oyzz4Y1NgAAAACoT6tYeSstLdUnn3wiSRo7dmzANg6HQ+eff74ef/xxvfPOO42eY8qEHMlXEFSclTLTVoRkHNiczylpoLR5oeQ0+SUZAzzmjg8AAICgtYqVt++++87/euR+/frV2q7y2p49e3TgwIGwxAYAAAAADdEqVt52797tP+7evXut7ape2717t9q3b1+jTVlZmcrKyvzfFxUVSZJckRWSKkIQreT1tYqaGvWozIOw5IPXa/4caFa8/37OvTz3CDFyC2Yiv2CWUOWW2bnZKoq34uJi/3FcXFyt7apeq9qnqoULF2rBggU1zl8+bUedYzfG6ryBIRkHLUPWzv7mT5K32vw50CxlZWVZHQJaKHILZiK/YJZgc6u0tDREkQTWKoq3UJo7d65mzpzp/76oqEjJycl6+R89JLUJyRwvLc4IyTiwN6/Pqayd/TU6+Wu5zH7PW9+55o6PZsfr9SorK0ujR49u1rtqwX7ILZiJ/IJZQpVbla/KM0urKN4SExP9x3VVw1WvVe1TVXR0tKKjo2uc95ZHSL7QfICf6b+ow1ZcTp/5OcEPwFbL5XLxCxBMQW7BTOQXzBJsbpmdl62ieOvWrZv/OD8/XwMGDAjYLj8/P2CfhkjPSJXb7W5agDV4QjQObM3rPf5yxr5zKa4AAADQOnabPPXUU+V0Hn+o33zzTa3tKq916dIl4GYlAAAAAGCVVrHyFhcXp6FDh+qjjz7SW2+9pTlz5tRoYxiG3n77bUnSmDFjGjy2YRiSjm9wwvI9Qsnr9aq0tFRFRUXkFkKO/IJZyC2YifyCWUKVW5XveausEUKtVRRvkjR58mR99NFHev/997Vu3TqdddZZ1a6//PLL2r59uyRp0qRJDR53//79kqSUlJTQBQsAAADAtoqLi9WmTWg2M6zKYZhVFjYz5eXlOv300/X111+re/fuWrZsmc4991z5fD6tXLlS1113nYqKijR27FitXt3wbdMLCwvVrl07/fjjj6Y8QWi9Kncy3blzp5KSkqwOBy0M+QWzkFswE/kFs4QqtwzDUHFxsbp16+Z/21YotZqVt8jISL3++usaOXKkduzYoVGjRikuLk4+n09Hjx6VJA0aNEjLly9v1LiVT0qbNm24icAUSUlJ5BZMQ37BLOQWzER+wSyhyC0zF3RaxYYllXr06KGvvvpK8+bNU79+/eRwOORyuXTGGWdoyZIl+uyzz9SuXTurwwQAAACAGlrNylulxMRELViwQAsWLLA6FAAAAABosFa18maG6OhozZ8/P+AHdwPBILdgJvILZiG3YCbyC2axS261mg1LAAAAAMDOWHkDAAAAABugeAMAAAAAG6B4AwAAAAAboHgDAAAAABugeGui4uJieTwe9e/fXwkJCWrTpo3OPPNMPfDAAzp27JjV4cEEpaWlevPNN3XPPfdo/PjxOumkk+RwOORwOOTxeBo0xt69ezVr1iz17t1bsbGxat++vYYPH66nnnpKDdk7KCcnR9OnT1dKSopiYmLUsWNHnXfeeVq5cmWD5t+4caMmTpyoE044QdHR0eratasuvfRSvffeew3qD/Ps379fzzzzjCZOnKjTTjtN8fHxio6O1gknnKBx48bp1VdfrXeMYO9LVucnzLFx40YtWLBAv//979WnTx+53W65XC653W4NHTpU9957rw4cOFDnGFbnBvcue/nb3/7m//nocDjqbMt9C7VJT0+vlke1fb377ru1jmH1vef999/XpZdeqq5du/p/pk+cOFEbN25sUP+ADDTajh07jB49ehiSDElGXFycER0d7f9+0KBBxoEDB6wOEyH2/vvv+5/jX37Nnz+/3v4bNmww3G63v09CQoIRGRnp//68884zysrKau3/xhtvGHFxcf72SUlJhtPp9H8/depUw+fz1dr/ySefrDZfmzZtDIfD0ajHAPNUfW4kGTExMUZ8fHy1c2PHjjUOHz4csH+w9yWr8xPm+fOf/1
wjtxITE6ud69Chg7F27dqA/a3ODe5d9rJlyxYjJiamWn7VhvsW6vLMM88Ykgyn02l07ty51q8PP/wwYH+r7z3z58/3t3U4HEabNm3830dGRhpPPvlkk/5dKN4ayev1Gv379zckGV27djWysrIMwzCMiooK48UXX/T/QLzgggssjhSh9v777xvt2rUzzj33XGPOnDnGihUrjC5dujToP3BhYaG/bZ8+fYz169cbhmEYZWVlxiOPPGK4XC5DknHjjTcG7L99+3b/L/JDhw41tm7dahiGYRQXFxvz5s3z3wwWLVoUsP/atWuNiIgIQ5Ixbtw4Y+fOnYZhGEZBQYExffp0f/+XXnqpif86CJYkY8iQIcZjjz1m5OTk+M/n5uYa1157rf85mjhxYo2+wd6XrM5PmGvZsmXG4sWLjU8//dQ4ePCg/3xxcbGxbNkyo2PHjoYko1OnTkZhYWG1vlbnBvcue6moqDDOOeccQ5Jx9tln11m8cd9CfSqLt5NOOqnRfa2+97z00kv+NtOnTzcKCgoMwzCMnTt3GuPGjTMkGREREbX+0awuFG+N9NRTT/mfjED/4C+88IL/+rvvvmtBhDBLeXl5jXMnnXRSg4q3u+66y5BkxMbGGtu3b69x/b777vP/R668wVQ1ceJEQ5LRpUuXar98Vbrhhhv8f1UK9FfKYcOGGZKM/v37G8eOHatx/bzzzjMkGT169Aj4OGG+9957r87rVX9Y/Pjjj9WuBXtfsjo/Ya23337bnx/PP/98tWtW5wb3Lnt5+OGHDUnG1VdfXW3VIRDuW6hPMMWblfee8vJy/++H559/fo2+ZWVlRr9+/QxJxrBhwxr92CjeGmn48OGGJGPkyJEBr/t8PiMlJcWQZEyaNCnM0SHcGlq8nXjiif4l+kCKi4uNhIQEQ5Ixb968atdKSkqM2NhYQ5KxYMGCgP1zc3P9P+T+8Y9/VLuWk5Pjv7Zs2bKA/desWeNvU18RAWt8/vnn/ufolVdeqXYt2PuSlfkJ6x06dMj//Pztb3+rdo17FxqqcqXD7XYb+/btq7d4476F+jS1eLP63pOdne2/9sEHHwTsn56e7m8T6I8PdWHDkkYoLS3VJ598IkkaO3ZswDYOh0Pnn3++JOmdd94JW2xovrZu3aoff/xRUu15k5CQoOHDh0uqmTcff/yxjhw5Umf/Hj166NRTTw3YPysry39cmZu/NGzYMCUmJgbsj+YhJibGf1xRUeE/Dva+ZHV+wnofffSR/zg1NdV/bHVucO+yl+uvv16HDx/Wgw8+qI4dO9bZlvsWzGT1vaeyf2JiooYOHRqwf9W4GptfFG+N8N1338nn80mS+vXrV2u7ymt79uypdwcvtHzffPON/7ghefPtt98G1X/z5s0B+3fq1EmdOnUK2DciIkJ9+vQJ2B/Nw5o1a/zH/fv39x8He1+yOj9hjbKyMu3YsUOPPPKIrrnmGklSz549dfHFF/vbWJ0b3Lvs48knn1R2drZGjRqlSZMm1due+xYa4+eff9YZZ5yhhIQExcbG6uSTT9bEiROr/Vysyup7T2X/U089VREREQH7d+rUyf9HjsbmF8VbI+zevdt/3L1791rbVb1WtQ9ap8bmTVFRkUpKSmr0b9eunWJjY+vt/8ucq/y+rrnr6g/rFRYWauHChZKk4cOHq3fv3v5rwd6XrM5PhFdMTIwcDodiYmKUkpKim2++WQcPHtTQoUOVnZ2t6Ohof1urc4N7lz3k5+drzpw5io2N1dKlSxvUh/sWGqO0tFQbN25UVFSUfD6fcnNztXz5co0cOVLTpk1TeXl5tfZW33vMvndRvDVCcXGx/zguLq7WdlWvVe2D1inYvKk8rqtv1eu/zLlg+8NaPp9P11xzjX766SfFxMTokUceqXY9VPkVbH/yyx66dOmizp07Kz4+3n9u5MiRevjhh3XiiSdWa2t1bpBb9jB9+nQdOnRIHo9HJ598coP6cN9CQ3Tr1k3z58/Xv/71Lx09elQHDhzwv+R21KhRkqRnnnlGt956a7V+Vt97zM4vijcAaMZuueUWrVq1SpL06KOPasCAARZHBDvbsWOH9uzZo5KSEu3du1dLlizRl19+qSFDhmjevHlWhwebef755/XGG2/oV7/6lWbOnGl1OGhhxowZI4/HowEDBvhfFRAREaFzzjlHb7/9ti655BJJ0mOPPabvv//eylDDiuKtESrfmCgdX8KtTdVrVfugdQo2byqP6+pb9fovcy7Y/rDO7Nmz/SttDz30kKZNm1ajTajyK9j+5Jf9dOrUSbNmzdJbb70lh8Ohv/71r/4/FEjW5wa51bzt3btXM2bMUEREhJ588klFRkY2uC/3LQTL6XRqyZIlko6/QiUzM9N/zep7j9n5RfHWCN26dfMf5+fn19qu6rWqfdA6NTZvkpKSlJCQUKP/wYMH/bsn1dX/lzlX+X1dc9fVH9a47bbb9MADD0iSlixZohkzZgRsF+x9yer8hPWGDBmiYcOGSZKeeOIJ/3mrc4N7V/N2++23a//+/brhhhvUp08flZSUVPs6duyYv+0vz3HfQij07NlTHTp0kCRt377df97qe4/Z9y6Kt0Y49dRT5XQe/yerupPNL1Ve69Kli9q3bx+W2NB8Vd3pqCF5c9pppwXVv2/fvgH779u3Tz///HPAvhUVFdqyZUvA/gi/OXPmaPHixZKk+++/X7Nmzaq1bbD3JavzE81D5Rvnf/jhB/85q3ODe1fzlpubK0l6/PHHlZiYWOOrcpMlSf5zt912myTuWzCX1feeyv7fffddtY/2qarq2I3NL4q3RoiLi/N/XsNbb70VsI1hGHr77bclHX+tLtCrVy//RgC15c3hw4f9n7X0y7wZNmyYf7ek2vrn5eXpu+++C9h/9OjR/uPa+n/yySf+N8ySt9aaPXu2/6Ug999/v+bMmVNn+2DvS1bnJ5qHyr9aV335jtW5wb2r5eK+hVDIyclRQUGBJCklJcV/3up7T2X/4uJirV27NmD/quM2Or8a9ZHeMJ566ilDkuFwOIzPPvusxvWXXnrJ/4np7777rgURIpxOOukkQ5Ixf/78OtvdddddhiQjLi7OyM3NrXF90aJFhiQjIiLC2Lp1a43rEydONCQZXbt2NQoLC2tcv/HGGw1JRmJionHgwIEa14cNG2ZIMgYOHGgcO3asxvWxY8cakoyTTjrJKC8vr/OxwDyzZs3y3z+WLFnS4H7B3peszk+Yp7y83PD5fHW2effddw2Hw2FIMm677bZq16zODe5d9jV//nz/fScQ7luoS333LZ/PZ1x66aWGJMPpdBpbtmypdt3Ke095ebn/98MLLrigRt9jx44ZAwYMMCQZw4YNq/NxBkLx1kher9fo37+/Icno3r27/4ZSUVFh/POf/zSSkpIMScbYsWMtjhRmOHDggPHzzz/7v5KTkw1Jxpw5c6qdLy4urtavsLDQ6NKliyHJOO2004wNGzYYhmEYZWVlxmOPPWZERUUZkowbb7wx4Lzbt2834uPjDUnG8OHDjW3bthmGYRglJSXGggUL/L94LVq0KGD/T
z75xIiIiDAkGePHjzd27dplGIZh7N+/338Dk2S89NJLofqnQiPNmTPH/zw8+OCDjeob7H3J6vyEeXJzc42BAwca//u//2vk5ORU+4Xoxx9/NBYuXOh/7tq3b2/89NNP1fpbnRvcu+yrvuKN+xbqkpuba5x55pk17l0VFRXGp59+apx33nn+/Ar0HFt976n6x4cbb7zR2L9/v2EYhrFr1y5j/Pjx/j8srF27ttH/NhRvTZCbm2v06NHD/6TExcUZMTEx/u8HDRrEX2laqMq/pNT3NXny5Bp9N2zYYLjdbn+bxMREw+Vy+b8fM2aMcfTo0VrnfuONN4y4uDh/+zZt2vhvLJKMqVOn1vmXqieffNKIjIz0t2/btq3/5tWQ1UOYJy8vz/88OJ1Oo3PnznV+LV68uMYYwd6XrM5PmCM3N7favSkqKsro0KGD/5eayq+UlBRj48aNAcewOje4d9lTfcWbYXDfQu1+ee+Kjo42OnToYERHR1c7P3XqVMPr9QYcw+p7T9X/Aw6Hw2jbtq3/+8jISOPJJ59s0r8NxVsTFRUVGfPmzTP69etnxMfHG4mJicYZZ5xhLFmyxCgrK7M6PJgkmOLNMAxjz549xq233mqccsopRkxMjNG2bVtj2LBhxpNPPmlUVFTUO/8PP/xgXH/99UaPHj38N7LRo0cbGRkZDYr/iy++MK666iqje/fuRlRUlNG5c2dj3LhxRnZ2dmP+GRBiv/whVd9XbT8wgr0vWZ2fCL2ysjLj5ZdfNv785z8bgwcPNrp162ZERUUZsbGxxoknnmhcfPHFxlNPPWWUlpbWOY7VucG9y34aUrwZBvctBFZaWmqkpaUZV111lXHaaacZHTt2NCIjI42EhASjT58+xrRp04yPP/643nGsvvdkZ2cb48aNMzp37mxERUUZ3bt3N6666ir/SnFTOAzDMAQAAAAAaNbYbRIAAAAAbIDiDQAAAABsgOINAAAAAGyA4g0AAAAAbIDiDQAAAABsgOINAAAAAGyA4g0AAAAAbIDiDQAAAABsgOINAAAAAGyA4g0AAAAAbCDS6gAAAGiMiooKrVy5UqtWrdJnn32mffv2qbS0VG3btlWvXr00fPhwXX311erXr5/VoYbEl19+qf/7v/9T27ZtNWPGDKvDAQBYyGEYhmF1EAAANMRnn32myZMna9u2bf5zLpdLiYmJKiwslM/n858fP368VqxYoaioKCtCDZn09HRNnTpVJ510knbs2GF1OAAAC/GySQCALWRmZmrEiBHatm2b3G63Fi5cqG3btunYsWPav3+/jh07pvXr1+v2229XUlKSXnnlFZWWllodNgAAIcPLJgEAzd7333+viRMnqqysTKeddprefvttnXDCCdXaREREaPDgwRo8eLDmzJmjadOmWRQtAADmYOUNANDs3XXXXSoqKlJMTIxeffXVGoXbL7Vv317/93//pzZt2lQ7v2fPHs2ZM0d9+/ZVfHy84uPj1bdvX912223au3dvwLHWrFkjh8Mhh8NR55yVbdasWVNn/x9++EHTpk1TcnKyoqOjdcIJJ+j6669Xfn5+wDGnTp0qScrLy/OPU/nl8XiqtX/77bc1fvx4nXDCCYqKilJSUpJOPvlkjRkzRkuWLNGBAwfqfAwAgOaNlTcAQLO2d+9eZWRkSJKuvvpq9erVq8F9qxZcH3zwgcaNG6fCwkJJUnx8vCTp22+/1bfffqunnnpKr7/+uoYNGxa64H/h/fff1+9//3uVlJQoMTFRPp9P+fn5euqpp7R69Wp9/vnn6t69u799586ddeTIERUVFcnpdKpjx47VxktISPAf33333Zo/f77/+7i4OBmGodzcXOXm5iorK0uDBw/WiBEjTHt8AABzsfIGAGjW3n//ff9GJJdeemmTxti5c6e/cDvttNP08ccfq6SkRCUlJfrwww/Vu3dvHTx4UJdccknAFbBQueyyy/S73/1O3333nYqKinT48GG99NJLSkxM1O7duzV37txq7ffs2aP/+Z//kSQlJydrz5491b5mz54t6fiq3IIFCyRJM2fOVH5+vg4fPqzi4mIVFhbqo48+0p/+9CclJiaa9tgAAOajeAMANGubN2/2Hw8aNKhJY9x3330qLCxUu3btlJ2draFDh/qvDR8+XO+++66SkpJ04MABLVy4MOiYa/OrX/1Kr776qvr06SNJioqK0n/913/p3nvvlSRlZGSovLy80eOuW7dOPp9PvXr10gMPPKBu3br5r7Vp00bDhg3To48+qjPOOCM0DwQAYAmKNwBAs7Z//37/cfv27Rvd3zAM/fOf/5Qk/fGPf1SXLl1qtDnhhBP0xz/+UZL04osvNjHS+t1xxx1yOmv+6L3kkkskSUeOHNH333/f6HHbtm0rSSouLtbhw4eDihEA0HxRvAEAWrTc3Fz/Rh2jRo2qtd3o0aMlHS8Wc3NzTYnlrLPOCni+6kpZUzYVGTJkiDp06KCffvpJZ511lh555BFt2bJFfJQrALQsFG8AgGbN7Xb7j5tS2Ozbt89/XHUzkF+quoNl1T6hVNt7ziIj/7N/mNfrbfS4bdu21YoVK9SxY0dt3rxZN998s0499VS1a9dOv//97/X88883aVwAQPNC8QYAaNb69u3rP960aZOFkTRvo0aNUm5urp599llNnjxZp5xyig4dOqTMzExdc801GjRokKmbsQAAzEfxBgBo1kaOHOl/n9irr77a6P6dOnXyH+/atavWdlWvVe1TdVXs6NGjAfseOnSo0XGZIT4+Xtdcc43S09O1bds27dq1S4sWLVJMTIx/RQ4AYF8UbwCAZq1z58667LLLJEkvvPCCtm3b1uC+hmEoJSXFv9FJdnZ2rW3fffddScdfppmSkuI/365dO//xzp07A/Zdt25dg2NqrMrCtSnvX+vevbtuu+02zZo1S5KUlZUV0tgAAOFF8QYAaPbuueceJSQk6MiRIxo/fny9L/87ePCgLrvsMh06dEgOh0NXXHGFJGnp0qXas2dPjfa7d+/W0qVLJUl/+MMfql3r1auXYmNjJUkrV66s0dfn85n68QJJSUmS5P9w8UDKysrqHKMy/kA7XQIA7IO7OACg2evVq5eee+45RUVFafPmzfrVr36lRYsW6YcffvC3qaio0KZNmzRv3jydfPLJeuWVV/zX7rjjDrVt21YHDhzQqFGjtHbtWv+1Tz75RKNGjVJhYaHat2+v22+/vdrcLpfLv/J333336Z///KeOHTsmSdq6dasuvfRSffXVV6Y99n79+kmSioqK/B958EuLFi3S2LFj9dxzz1V7+WdZWZn++c9/avHixZKkCy+80LQ4AQDmcxjsIwwAsIlPPvlEU6ZMqVa0RUVFKSEhQYWFhfL5fJIkh8OhK6+8UsuWLZPL5ZIkffDBB7rkkkv870+Lj4+XJP/norVt21avv/66hg8fXmPeXbt26ayzztLu3bslHS/oYmNjVVRUpMTERGVmZmrEiBGSpPfff99/LElr1qzRyJEjJdX90keHwxGwv3R8M5LKl3wmJib6XwY6Y8YMzZgxQx6PRwsWLPC3
j42NVWxsrA4ePOif89RTT9V7770X8HPuAAD2wMobAMA2hg4dqi1btmjFihW6+uqr1bNnT8XExKi4uFjt27fXsGHDdOedd+q7777TCy+84C/cJOm3v/2tvvvuO82aNUunnnqqfD6fDMPQqaeeqtmzZ+u7774LWLhJxz9GYN26dbruuuv8HzeQkJCgSZMmaePGjfrtb39r6uPOyMjQrbfeql69esnr9SovL095eXn+l1LecMMNeuKJJ/SHP/xB/fr1U1xcnIqKitSuXTsNHz5cDz/8sDZu3EjhBgA2x8obAAAAANgAK28AAAAAYAMUbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgAxRvAAAAAGADFG8AAAAAYAMUbwAAAABgA/8f68Y+ceePqbAAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "# 전체 글꼴 크기 설정\n", + "plt.rcParams.update({'font.size': 18})\n", + "\n", + "# CSV 파일 읽기\n", + "df = pd.read_csv('raw_data.csv')\n", + "\n", + "# ships_idx 별 전체 갯수 계산\n", + "total_counts = df['ships_idx'].value_counts().sort_index()\n", + "\n", + "# ships_idx 별 MDM=True 인 갯수 계산\n", + "mdm_true_counts = df[df['MDM'] == True]['ships_idx'].value_counts().sort_index()\n", + "\n", + "# 데이터프레임으로 합치기\n", + "summary_df = pd.DataFrame({\n", + " 'SD': total_counts,\n", + " 'PD': mdm_true_counts\n", + "}).fillna(0) # NaN 값을 0으로 대체\n", + "\n", + "# 시각화\n", + "fig, ax = plt.subplots(figsize=(10, 8))\n", + "\n", + "# Total Counts 먼저 그리기\n", + "summary_df['SD'].plot(kind='barh', ax=ax, color='orange', alpha=0.5, label='SD')\n", + "\n", + "# MDM=True Counts를 그 위에 겹쳐서 그리기\n", + "summary_df['PD'].plot(kind='barh', ax=ax, color='blue', alpha=0.7, label='PD')\n", + "\n", + "# y축 라벨 설정 (5 단위로만 표시)\n", + "y_labels = ax.get_yticks()\n", + "ax.set_yticks(np.arange(min(y_labels), max(y_labels)+1, 5))\n", + "ax.set_yticklabels([int(label) for label in np.arange(min(y_labels), max(y_labels)+1, 5)])\n", + "\n", + "# 그리드 추가\n", + "ax.grid(True)\n", + "\n", + "# 범례와 제목 설정\n", + "plt.legend(prop={'size': 18}) # 레전드 글꼴 크기 설정\n", + "plt.xlabel('Counts')\n", + "plt.ylabel('Ships')\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_preprocess/no_preprocess/copy_raw_data.py b/data_preprocess/no_preprocess/copy_raw_data.py new file mode 100644 index 0000000..c4d60da --- /dev/null +++ b/data_preprocess/no_preprocess/copy_raw_data.py @@ -0,0 +1,9 @@ +import shutil + +source_file = 'data_import/raw_data.csv' + +destination_file = 'data_preprocess/preprocessed_data.csv' + +shutil.copy(source_file, destination_file) + +print(f"File copied from {source_file} to {destination_file}") diff --git a/data_preprocess/rule_base_replacement/1.add_tag_name.ipynb b/data_preprocess/rule_base_replacement/1.add_tag_name.ipynb new file mode 100644 index 0000000..9b021a1 --- /dev/null +++ b/data_preprocess/rule_base_replacement/1.add_tag_name.ipynb @@ -0,0 +1,133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Changes made in ships_idx 1000: 251\n", + "Changes made in ships_idx 1001: 54\n", + "Changes made in ships_idx 1002: 46\n", + "Changes made in ships_idx 1003: 162\n", + "Changes made in ships_idx 1004: 8\n", + "Changes made in ships_idx 1005: 18\n", + "Changes made in ships_idx 1008: 22\n", + "Changes made in ships_idx 1009: 5\n", + "Changes made in ships_idx 1010: 135\n", + "Changes made in ships_idx 1011: 46\n", + "Changes made in ships_idx 1012: 2\n", + "Changes made in ships_idx 1013: 130\n", + "Changes made in ships_idx 1014: 46\n", + "Changes made in ships_idx 1015: 147\n", + "Changes made in ships_idx 1016: 191\n", + "Changes made in ships_idx 1017: 111\n", + "Changes made in ships_idx 1018: 682\n", + "Changes made in ships_idx 
1019: 2\n", + "Changes made in ships_idx 1020: 10\n", + "Changes made in ships_idx 1021: 2\n", + "Changes made in ships_idx 1022: 7\n", + "Changes made in ships_idx 1023: 7\n", + "Changes made in ships_idx 1024: 136\n", + "Changes made in ships_idx 1025: 10\n", + "Changes made in ships_idx 1026: 6\n", + "Changes made in ships_idx 1027: 6\n", + "Changes made in ships_idx 1028: 6\n", + "Changes made in ships_idx 1029: 132\n", + "Changes made in ships_idx 1030: 86\n", + "Changes made in ships_idx 1031: 55\n", + "Changes made in ships_idx 1032: 225\n", + "Changes made in ships_idx 1033: 147\n", + "Changes made in ships_idx 1035: 132\n", + "Changes made in ships_idx 1036: 12\n", + "Changes made in ships_idx 1037: 3\n", + "Changes made in ships_idx 1038: 8\n", + "Changes made in ships_idx 1039: 232\n", + "Changes made in ships_idx 1042: 20\n", + "Changes made in ships_idx 1043: 154\n", + "Changes made in ships_idx 1044: 121\n", + "Changes made in ships_idx 1045: 255\n", + "Changes made in ships_idx 1046: 6\n", + "Changes made in ships_idx 1047: 12\n", + "Changes made in ships_idx 1048: 82\n", + "Changes made in ships_idx 1049: 912\n", + "Changes made in ships_idx 1050: 46\n", + "Changes made in ships_idx 1051: 63\n", + "Total number of changes made: 4951\n", + "Updated data saved to raw_data_add_tag.csv\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Load the preprocessed data CSV file\n", + "file_path = '../../data_import/raw_data.csv' # Adjust this path to your actual file location\n", + "data = pd.read_csv(file_path, dtype=str)\n", + "\n", + "# Initialize a counter for the total number of changes\n", + "total_changes = 0\n", + "\n", + "# Initialize a dictionary to count changes per ships_idx\n", + "ships_idx_changes = {}\n", + "\n", + "# Process each group by ships_idx\n", + "for ships_idx, group in data.groupby('ships_idx'):\n", + " # Find duplicated tag_descriptions within the group\n", + " duplicated_descriptions = group['tag_description'].duplicated(keep=False)\n", + " \n", + " # Count how many tag_descriptions are duplicated within this ships_idx\n", + " num_changes = duplicated_descriptions.sum()\n", + "\n", + " # If there are any duplicates\n", + " if num_changes > 0:\n", + " # Increment the total changes count\n", + " total_changes += num_changes\n", + " \n", + " # Record the number of changes for this ships_idx\n", + " ships_idx_changes[ships_idx] = num_changes\n", + "\n", + " # Apply the concatenation of tag_name to tag_description for duplicates\n", + " data.loc[duplicated_descriptions & (data['ships_idx'] == ships_idx), 'tag_description'] = \\\n", + " data['tag_name'] + ' ' + data['tag_description']\n", + "\n", + "# Output the changes per ships_idx\n", + "for ships_idx, count in ships_idx_changes.items():\n", + " print(f\"Changes made in ships_idx {ships_idx}: {count}\")\n", + "\n", + "# Output the total number of changes\n", + "print(f\"Total number of changes made: {total_changes}\")\n", + "\n", + "# Optionally, save the updated DataFrame back to a CSV\n", + "output_file_path = 'raw_data_add_tag.csv'\n", + "data.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n", + "\n", + "print(f\"Updated data saved to {output_file_path}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_preprocess/rule_base_replacement/2.seperate_number.ipynb b/data_preprocess/rule_base_replacement/2.seperate_number.ipynb new file mode 100644 index 0000000..1f8fce1 --- /dev/null +++ b/data_preprocess/rule_base_replacement/2.seperate_number.ipynb @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updated data saved to raw_data_s.csv\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import re\n", + "\n", + "# Load the data_mapping CSV file\n", + "data_mapping_file_path = '../../data_import/raw_data.csv' # Adjust this path to your actual file location\n", + "# data_mapping_file_path = 'raw_data_add_tag.csv' # Adjust this path to your actual file location\n", + "data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n", + "\n", + "# Backup the original tag_description\n", + "data_mapping['org_tag_description'] = data_mapping['tag_description']\n", + "\n", + "# Ensure all values in the 'tag_description' column are strings\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[()]', ' ', regex=True)\n", + "\n", + "# Function to find tokens containing numbers\n", + "def find_tokens_with_numbers(description):\n", + " tokens = description.split() # Tokenize by spaces\n", + " number_tokens = [token for token in tokens if re.search(r'\\d', token)]\n", + " return number_tokens\n", + "\n", + "# Function to process tokens\n", + "def process_token(token):\n", + " # Step 1: Replace '_' or '-' adjacent to numbers with spaces\n", + " token = re.sub(r'(_|-)(?=\\d)', ' ', token)\n", + " token = re.sub(r'(?<=\\d)(_|-)', ' ', token)\n", + "\n", + " # Step 2: Insert spaces between letters and numbers where no separator exists\n", + " token = re.sub(r'([A-Za-z])(\\d+)', r'\\1 \\2', token)\n", + " token = re.sub(r'(\\d+)([A-Za-z])', r'\\1 \\2', token)\n", + "\n", + " # Step 3: Handle cases like \"NO.1\" or \"No.1\" to become \"No. 1\"\n", + " token = re.sub(r'([A-Za-z]+)\\.(\\d+)', r'\\1. 
\\2', token)\n", + "\n", + " # Clean multiple spaces and strip\n", + " token = re.sub(r'\\s+', ' ', token).strip()\n", + " return token\n", + "\n", + "# Apply the process to each row in the 'tag_description' column\n", + "for index, row in data_mapping.iterrows():\n", + " original_description = row['tag_description']\n", + " number_tokens = find_tokens_with_numbers(original_description)\n", + "\n", + " # Process each token containing numbers\n", + " processed_tokens = [process_token(token) for token in number_tokens]\n", + "\n", + " # Replace the original tokens with processed tokens in the tag_description\n", + " new_description = original_description\n", + " for original_token, processed_token in zip(number_tokens, processed_tokens):\n", + " new_description = new_description.replace(original_token, processed_token)\n", + "\n", + " # Update the data_mapping with the modified description\n", + " data_mapping.at[index, 'tag_description'] = new_description\n", + "\n", + "# Save the updated data_mapping to a new CSV file\n", + "output_file_path = 'raw_data_s.csv'\n", + "data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n", + "\n", + "print(f\"Updated data saved to {output_file_path}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_preprocess/rule_base_replacement/3.replacement.ipynb b/data_preprocess/rule_base_replacement/3.replacement.ipynb new file mode 100644 index 0000000..8aa43bf --- /dev/null +++ b/data_preprocess/rule_base_replacement/3.replacement.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updated data saved to ../preprocessed_data.csv\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import re\n", + "\n", + "# Load the data_mapping CSV file\n", + "data_mapping_file_path = 'raw_data_s.csv' # Adjust this path to your actual file location\n", + "data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n", + " \n", + " # Ensure all values in the 'tag_description' column are strings\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[-]', ' ', regex=True)\n", + "\n", + "# Initial replacement mapping\n", + "initial_replacements = {\n", + " \"MGE\": \"G/E\",\n", + " \"GEN.\": \"G/E\",\n", + " \"GEN\": \"G/E\",\n", + " \"GE\": \"G/E\",\n", + " \"G_E\": \"G/E\",\n", + " \"ME\": \"M/E\",\n", + " \"M_E\": \"M/E\",\n", + " \"S_G\": \"S/G\",\n", + " \"T_C\": \"T/C\",\n", + " \"TC\": \"T/C\",\n", + " \"L_O\": \"L.O\",\n", + " \"LO\": \"L.O\",\n", + " \"F_O\": \"F.O\",\n", + " \"FO\": \"F.O\",\n", + " \"D_G\": \"D/G\",\n", + " \"DG\": \"D/G\",\n", + " \"PP\": \"P/P\"\n", + "}\n", + "\n", + "# Second replacement mapping\n", + "second_replacements = {\n", + " \"_G/E\": \" G/E\",\n", + " \"G/E_\": \"G/E \",\n", + " \"_M/E\": \" M/E\",\n", + " \"M/E_\": \"M/E \",\n", + " \"_S/G\": \" S/G\",\n", + " \"S/G_\": \"S/G \",\n", + " \"_T/C\": \" T/C\",\n", + " \"T/C_\": \"T/C \",\n", + " \"_L.O\": \" 
L.O\",\n", + " \"L.O_\": \"L.O \",\n", + " \"_F.O\": \" F.O\",\n", + " \"F.O_\": \"F.O \",\n", + " \"_D/G\": \" D/G\",\n", + " \"D/G_\": \"D/G \",\n", + " \"DG_\": \"DG \"\n", + "}\n", + "\n", + "# Function to separate numbers from text in a token\n", + "def separate_numbers_from_text(description):\n", + " # This regex pattern finds occurrences where text is followed by numbers or vice versa\n", + " return re.sub(r'(\\d+)(\\D)', r'\\1 \\2', re.sub(r'(\\D)(\\d+)', r'\\1 \\2', description))\n", + "\n", + "# Function to perform replacements using tokens\n", + "def replace_tokens(description, replacements):\n", + " tokens = description.split() # Tokenize by spaces\n", + " tokens = [replacements.get(token, token) for token in tokens] # Replace based on the dictionary\n", + " return ' '.join(tokens)\n", + "\n", + "# Function to perform replacements for substrings\n", + "def replace_substrings(description, replacements):\n", + " for old, new in replacements.items():\n", + " description = description.replace(old, new)\n", + " return description\n", + "\n", + "# Separate numbers from text before applying replacements\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].apply(separate_numbers_from_text)\n", + "\n", + "# Apply initial replacements\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_tokens, replacements=initial_replacements)\n", + "\n", + "# Apply second replacements as substrings\n", + "data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_substrings, replacements=second_replacements)\n", + "\n", + "# Save the updated data_mapping to a new CSV file\n", + "output_file_path = '../preprocessed_data.csv'\n", + "data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n", + "\n", + "print(f\"Updated data saved to {output_file_path}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_preprocess/split_data.ipynb b/data_preprocess/split_data.ipynb new file mode 100644 index 0000000..11fd087 --- /dev/null +++ b/data_preprocess/split_data.ipynb @@ -0,0 +1,441 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final Group Allocation:\n", + "Group 1: Ships_idx = [1003, 1028, 1049, 1044, 1020, 1041, 1045, 1036, 1005, 1006], PD type = 537, PD = 2006, SD = 14719\n", + "Group 2: Ships_idx = [1025, 1035, 1021, 1026, 1002, 1030, 1024, 1037, 1038, 1029], PD type = 537, PD = 1958, SD = 8173\n", + "Group 3: Ships_idx = [1016, 1046, 1031, 1009, 1048, 1043, 1042, 1019, 1018, 1007, 1000], PD type = 534, PD = 2079, SD = 15310\n", + "Group 4: Ships_idx = [1004, 1032, 1039, 1014, 1040, 1017, 1022, 1051, 1008, 1050, 1013], PD type = 532, PD = 2066, SD = 12882\n", + "Group 5: Ships_idx = [1047, 1015, 1027, 1010, 1011, 1001, 1034, 1023, 1012, 1033], PD type = 531, PD = 2064, SD = 10988\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from collections import defaultdict\n", + "\n", + "# Function to calculate the number of unique combinations and total count for each ship\n", + "def 
calculate_ship_count(group):\n", + " ship_count = group.groupby('ships_idx')['thing_property'].agg(['nunique', 'size']).reset_index()\n", + " ship_count.columns = ['ships_idx', 'comb_count', 'total_count']\n", + " return ship_count\n", + "\n", + "# Function to calculate the combination count and total count for a group\n", + "def calculate_group_count(group):\n", + " comb_count = group['thing_property'].nunique()\n", + " total_count = group['thing_property'].size\n", + " return comb_count, total_count\n", + "\n", + "# Function to calculate the increase in combination count when a ship is added to a group\n", + "def calculate_comb_count_increase(groups, g, ship_idx, mdm):\n", + " temp_groups = defaultdict(list, {k: v.copy() for k, v in groups.items()})\n", + " temp_groups[g].append(ship_idx)\n", + " \n", + " group_ships = temp_groups[g]\n", + " group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n", + " \n", + " new_comb_count, _ = calculate_group_count(group_data)\n", + " \n", + " current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n", + " current_comb_count, _ = calculate_group_count(current_group_data)\n", + " \n", + " increase = new_comb_count - current_comb_count\n", + " \n", + " return increase\n", + "\n", + "# Function to calculate the increase in total count when a ship is added to a group\n", + "def calculate_total_count_increase(groups, g, ship_idx, mdm):\n", + " temp_groups = defaultdict(list, {k: v.copy() for k, v in groups.items()})\n", + " temp_groups[g].append(ship_idx)\n", + " \n", + " group_ships = temp_groups[g]\n", + " group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n", + " \n", + " _, new_total_count = calculate_group_count(group_data)\n", + " \n", + " current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n", + " _, current_total_count = calculate_group_count(current_group_data)\n", + " \n", + " increase = new_total_count - current_total_count\n", + " \n", + " return increase\n", + "\n", + "# Function to find the ship that will bring the total count closest to the target\n", + "def find_closest_total_count_ship(groups, g, remaining_ships, mdm, target_total_count):\n", + " total_count_differences = []\n", + "\n", + " current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n", + " _, current_total_count = calculate_group_count(current_group_data)\n", + "\n", + " for ship_idx in remaining_ships:\n", + " increase = calculate_total_count_increase(groups, g, ship_idx, mdm)\n", + " new_total_count = current_total_count + increase\n", + " difference = abs(target_total_count - new_total_count)\n", + " total_count_differences.append((ship_idx, difference, increase))\n", + "\n", + " if not total_count_differences:\n", + " return None, 0\n", + " \n", + " closest_ship = min(total_count_differences, key=lambda x: x[1])\n", + " selected_ship_idx, _, selected_increase = closest_ship\n", + "\n", + " return selected_ship_idx, selected_increase\n", + "\n", + "# Function to find the ship that gives the maximum increase in combination count\n", + "def find_max_increase_ship(groups, g, remaining_ships, mdm):\n", + " comb_count_increase = []\n", + "\n", + " for ship_idx in remaining_ships:\n", + " increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)\n", + " comb_count_increase.append((ship_idx, increase))\n", + "\n", + " max_increase_ship = max(comb_count_increase, key=lambda x: x[1])\n", + " selected_ship_idx, max_increase = max_increase_ship\n", + " \n", + " return selected_ship_idx, max_increase\n", + "\n", + "# Function to find the ship that will bring 
the combination count closest to the target\n",
+ "def find_closest_comb_count_ship(groups, g, remaining_ships, mdm, target_comb_count):\n",
+ "    comb_count_differences = []\n",
+ "\n",
+ "    current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n",
+ "    current_comb_count, _ = calculate_group_count(current_group_data)\n",
+ "\n",
+ "    for ship_idx in remaining_ships:\n",
+ "        increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)\n",
+ "        new_comb_count = current_comb_count + increase\n",
+ "        difference = abs(target_comb_count - new_comb_count)\n",
+ "        comb_count_differences.append((ship_idx, difference, increase))\n",
+ "\n",
+ "    if not comb_count_differences:\n",
+ "        return None, 0\n",
+ "\n",
+ "    closest_ship = min(comb_count_differences, key=lambda x: x[1])\n",
+ "    selected_ship_idx, _, selected_increase = closest_ship\n",
+ "\n",
+ "    return selected_ship_idx, selected_increase\n",
+ "\n",
+ "# Function to find the group with the maximum combination count\n",
+ "def find_group_with_max_comb_count(groups, mdm):\n",
+ "    max_comb_count = -1\n",
+ "    max_group_idx = -1\n",
+ "\n",
+ "    for g in range(len(groups)):\n",
+ "        group_ships = groups[g]\n",
+ "        group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n",
+ "        comb_count, _ = calculate_group_count(group_data)\n",
+ "\n",
+ "        if comb_count > max_comb_count:\n",
+ "            max_comb_count = comb_count\n",
+ "            max_group_idx = g\n",
+ "\n",
+ "    return max_group_idx, max_comb_count\n",
+ "\n",
+ "# Function to find the group with the maximum total count\n",
+ "def find_group_with_max_total_count(groups, mdm):\n",
+ "    max_total_count = -1\n",
+ "    max_group_idx = -1\n",
+ "\n",
+ "    for g in range(len(groups)):\n",
+ "        group_ships = groups[g]\n",
+ "        group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n",
+ "        _, total_count = calculate_group_count(group_data)\n",
+ "\n",
+ "        if total_count > max_total_count:\n",
+ "            max_total_count = total_count\n",
+ "            max_group_idx = g\n",
+ "\n",
+ "    return max_group_idx, max_total_count\n",
+ "\n",
+ "# Load the CSV file\n",
+ "data_file_path = 'preprocessed_data.csv'\n",
+ "data = pd.read_csv(data_file_path)\n",
+ "\n",
+ "# Filter the data where MDM is True\n",
+ "mdm_true = data[data['MDM'] == True].copy()  # use .copy() to create an explicit copy\n",
+ "mdm_all = data.copy()\n",
+ "\n",
+ "# Create a new column combining 'thing' and 'property'\n",
+ "mdm_true.loc[:, 'thing_property'] = mdm_true['thing'] + '_' + mdm_true['property']\n",
+ "mdm_all.loc[:, 'thing_property'] = mdm_all['thing'] + '_' + mdm_all['property']\n",
+ "\n",
+ "# Initial setup for groups\n",
+ "ship_count = calculate_ship_count(mdm_true)\n",
+ "num_groups = 5\n",
+ "groups = defaultdict(list)\n",
+ "\n",
+ "# Sort ships by combination count in descending order\n",
+ "sorted_ships = ship_count.sort_values(by='comb_count', ascending=False)\n",
+ "\n",
+ "# Assign the first 5 ships to the groups\n",
+ "for i in range(num_groups):\n",
+ "    groups[i].append(sorted_ships.iloc[i]['ships_idx'])\n",
+ "\n",
+ "remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values\n",
+ "\n",
+ "# Allocate remaining ships to the groups\n",
+ "while len(remaining_ships) > 0:\n",
+ "    group_comb_counts = []\n",
+ "    for g in range(num_groups):\n",
+ "        group_ships = groups[g]\n",
+ "        group_data = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n",
+ "        comb_count, _ = calculate_group_count(group_data)\n",
+ "        group_comb_counts.append((g, comb_count))\n",
+ "\n",
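+ "    # Greedy allocation: after the sort below, the group currently poorest in\n",
+ "    # unique thing_property combinations picks first and takes the ship with the\n",
+ "    # largest marginal combination gain; every other group instead takes the ship\n",
+ "    # that brings it closest to the current leader's combination count. Groups\n",
+ "    # that cannot gain any new combinations are deferred to the total-count\n",
+ "    # balancing pass further down.\n",
+ "    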
group_comb_counts.sort(key=lambda x: x[1])\n", + " \n", + " remaining_group = []\n", + " for g, _ in group_comb_counts:\n", + " if len(remaining_ships) == 0:\n", + " break\n", + " \n", + " if group_comb_counts.index((g, _)) == 0:\n", + " selected_ship_idx, comb_increase = find_max_increase_ship(groups, g, remaining_ships, mdm_true)\n", + " \n", + " else:\n", + " max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true)\n", + " selected_ship_idx, comb_increase = find_closest_comb_count_ship(groups, g, remaining_ships, mdm_true, max_comb_count)\n", + "\n", + " if comb_increase == 0:\n", + " remaining_group.append(g)\n", + " else:\n", + " groups[g].append(selected_ship_idx)\n", + " remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]\n", + "\n", + " for g in remaining_group:\n", + " if len(remaining_ships) == 0:\n", + " break\n", + " max_group_idx, max_total_count = find_group_with_max_total_count(groups, mdm_true)\n", + " selected_ship_idx, count_increase = find_closest_total_count_ship(groups, g, remaining_ships, mdm_true, max_total_count)\n", + " if selected_ship_idx is not None:\n", + " groups[g].append(selected_ship_idx)\n", + " remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]\n", + "\n", + "# Calculate comb_count for each group and store it in a list\n", + "group_comb_counts = []\n", + "for g in range(num_groups):\n", + " group_ships = groups[g]\n", + " group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n", + " comb_count, total_count = calculate_group_count(group_data_true)\n", + "\n", + " # Calculate total count including MDM=False\n", + " group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]\n", + " _, total_count_all = calculate_group_count(group_data_all)\n", + " \n", + " group_comb_counts.append((g, comb_count, total_count_all))\n", + "\n", + "# Sort the groups by comb_count in descending order\n", + "group_comb_counts.sort(key=lambda x: x[1], reverse=True)\n", + "\n", + "# Reorder the groups dictionary based on the sorted order\n", + "sorted_groups = defaultdict(list)\n", + "for i, (g, _, _) in enumerate(group_comb_counts):\n", + " sorted_groups[i] = groups[g]\n", + "\n", + "# Final output of group allocation\n", + "print(\"Final Group Allocation:\")\n", + "for g in range(num_groups):\n", + " group_ships = sorted_groups[g]\n", + " group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n", + " comb_count, total_count = calculate_group_count(group_data_true)\n", + "\n", + " # Calculate total count including MDM=False\n", + " group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]\n", + " _, total_count_all = calculate_group_count(group_data_all)\n", + "\n", + " print(f\"Group {g + 1}: Ships_idx = {group_ships}, PD type = {comb_count}, PD = {total_count}, SD = {total_count_all}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CSV file has been generated: 'combined_group_allocation.csv'\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import GroupKFold\n", + "\n", + "# Prepare data for custom group allocation (BGKF)\n", + "comb_counts = []\n", + "total_counts = []\n", + "ship_counts = []\n", + "custom_results = []\n", + "\n", + "for g in range(num_groups):\n", + " group_ships = groups[g]\n", + " group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n", + " comb_count, total_count = 
calculate_group_count(group_data_true)\n", + " \n", + " # Calculate total count including MDM=False\n", + " group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]\n", + " _, total_count_all = calculate_group_count(group_data_all)\n", + " \n", + " custom_results.append({\n", + " 'Group': g + 1,\n", + " 'Allocation': 'BGKF',\n", + " 'Comb_count': comb_count,\n", + " 'Total_count': total_count,\n", + " 'Total_count_all': total_count_all,\n", + " 'Ship_count': len(group_ships),\n", + " 'Ships_idx': list(group_ships)\n", + " })\n", + "\n", + "# Sort the custom group allocation by comb_count in descending order\n", + "custom_results.sort(key=lambda x: x['Comb_count'], reverse=True)\n", + "\n", + "# Adjust group numbers after sorting\n", + "for i, result in enumerate(custom_results):\n", + " result['Group'] = i + 1\n", + "\n", + "# Prepare data for GroupKFold allocation (GKF)\n", + "gkf = GroupKFold(n_splits=5)\n", + "gkf_results = []\n", + "\n", + "for i, (train_idx, test_idx) in enumerate(gkf.split(mdm_true, groups=mdm_true['ships_idx'])):\n", + " test_group = mdm_true.iloc[test_idx]\n", + " comb_count, total_count = calculate_group_count(test_group)\n", + " \n", + " # Calculate total count including MDM=False\n", + " test_group_ships = test_group['ships_idx'].unique()\n", + " test_group_all = mdm_all[mdm_all['ships_idx'].isin(test_group_ships)]\n", + " _, total_count_all = calculate_group_count(test_group_all)\n", + " \n", + " gkf_results.append({\n", + " 'Group': i + 1,\n", + " 'Allocation': 'GKF',\n", + " 'Comb_count': comb_count,\n", + " 'Total_count': total_count,\n", + " 'Total_count_all': total_count_all,\n", + " 'Ship_count': test_group['ships_idx'].nunique(),\n", + " 'Ships_idx': list(test_group['ships_idx'].unique())\n", + " })\n", + "\n", + "# Sort the GKF allocation by comb_count in descending order\n", + "gkf_results.sort(key=lambda x: x['Comb_count'], reverse=True)\n", + "\n", + "# Adjust group numbers after sorting\n", + "for i, result in enumerate(gkf_results):\n", + " result['Group'] = i + 1\n", + "\n", + "# Combine BGKF and GKF results into one DataFrame\n", + "combined_results = custom_results + gkf_results\n", + "combined_df = pd.DataFrame(combined_results)\n", + "\n", + "# Output the combined results to a single CSV file\n", + "combined_df.to_csv('combined_group_allocation.csv', index=False)\n", + "\n", + "print(\"CSV file has been generated: 'combined_group_allocation.csv'\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Group 1 datasets saved in dataset/1\n", + "Group 2 datasets saved in dataset/2\n", + "Group 3 datasets saved in dataset/3\n", + "Group 4 datasets saved in dataset/4\n", + "Group 5 datasets saved in dataset/5\n" + ] + } + ], + "source": [ + "import os\n", + "import pandas as pd\n", + "from sklearn.model_selection import KFold\n", + "\n", + "def save_datasets_for_group(groups, mdm, data, output_dir='dataset', n_splits=4):\n", + " for i in range(len(groups)):\n", + " group_folder = os.path.join(output_dir, str(i + 1))\n", + " os.makedirs(group_folder, exist_ok=True)\n", + " \n", + " # Create the test dataset by including only group i\n", + " test_group_ships = groups[i]\n", + " test_data = mdm[mdm['ships_idx'].isin(test_group_ships)]\n", + " \n", + " # Extract corresponding entries from the external test dataset\n", + " test_all_data = data[data['ships_idx'].isin(test_group_ships)]\n", + " \n", + " # Create the train dataset by 
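excluding group i\n",
+ "        # Leave-one-group-out: group i is held out as the test fold, while the\n",
+ "        # remaining four groups are pooled and then re-split 3:1 into train and\n",
+ "        # valid by the inner KFold below (fixed random_state for reproducibility).\n",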
+ "        train_group_ships = []\n",
+ "        for g in range(len(groups)):\n",
+ "            if g != i:\n",
+ "                train_group_ships.extend(groups[g])\n",
+ "        train_data = mdm[mdm['ships_idx'].isin(train_group_ships)]\n",
+ "\n",
+ "        # Use KFold to split train_data into train and valid datasets\n",
+ "        kf_inner = KFold(n_splits=n_splits, shuffle=True, random_state=42)\n",
+ "        train_idx_inner, valid_idx_inner = next(kf_inner.split(train_data))\n",
+ "\n",
+ "        final_train_data = train_data.iloc[train_idx_inner]\n",
+ "        valid_data = train_data.iloc[valid_idx_inner]\n",
+ "\n",
+ "        # Combine train and valid data to create train_all\n",
+ "        train_all_data = pd.concat([final_train_data, valid_data])\n",
+ "\n",
+ "        # Save datasets to CSV files\n",
+ "        train_file_path = os.path.join(group_folder, 'train.csv')\n",
+ "        valid_file_path = os.path.join(group_folder, 'valid.csv')\n",
+ "        test_file_path = os.path.join(group_folder, 'test.csv')\n",
+ "        train_all_file_path = os.path.join(group_folder, 'train_all.csv')\n",
+ "\n",
+ "        final_train_data.to_csv(train_file_path, index=False, encoding='utf-8-sig')\n",
+ "        valid_data.to_csv(valid_file_path, index=False, encoding='utf-8-sig')\n",
+ "        # test.csv deliberately holds all rows (MDM=True and False) for the held-out\n",
+ "        # ships; the MDM-only subset in test_data is recomputable and is not saved.\n",
+ "        test_all_data.to_csv(test_file_path, index=False, encoding='utf-8-sig')\n",
+ "        train_all_data.to_csv(train_all_file_path, index=False, encoding='utf-8-sig')\n",
+ "\n",
+ "        print(f\"Group {i + 1} datasets saved in {group_folder}\")\n",
+ "\n",
+ "# Example usage:\n",
+ "save_datasets_for_group(groups, mdm_true, data, n_splits=4)\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "torch",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
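Functionally, the custom allocation above is a balanced variant of grouped K-fold. For comparison, a minimal, self-contained sketch (toy data, illustrative values only) of scikit-learn's stock GroupKFold, which gives the same hard guarantee that no ship straddles train and test, just without the balancing:

```python
import pandas as pd
from sklearn.model_selection import GroupKFold

# Toy frame: two rows per ship, three ships
df = pd.DataFrame({
    'ships_idx': [1000, 1000, 1001, 1001, 1002, 1002],
    'tag_description': ['a', 'b', 'c', 'd', 'e', 'f'],
})

gkf = GroupKFold(n_splits=3)
for fold, (train_idx, test_idx) in enumerate(gkf.split(df, groups=df['ships_idx'])):
    train_ships = set(df.iloc[train_idx]['ships_idx'])
    test_ships = set(df.iloc[test_idx]['ships_idx'])
    assert train_ships.isdisjoint(test_ships)  # a ship never appears on both sides
    print(fold, sorted(test_ships))
```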
diff --git a/evaluation/check_accuracy.ipynb b/evaluation/check_accuracy.ipynb
new file mode 100644
index 0000000..0a5773e
--- /dev/null
+++ b/evaluation/check_accuracy.ipynb
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Performance for test_s.csv:\n",
+ "TP: 1724, TN: 11907, FP: 919, FN: 272\n",
+ "Precision: 0.6523, Recall: 0.8637, Accuracy: 0.9196\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Set the group number\n",
+ "group_number = 1  # Change this to the desired group number\n",
+ "\n",
+ "# File path for the selection result of the chosen group\n",
+ "test_s_path = f'../post_process/0.result/{group_number}/test_s.csv'\n",
+ "\n",
+ "# Load the CSV file\n",
+ "test_s_csv = pd.read_csv(test_s_path, low_memory=False)\n",
+ "test_s_csv.fillna('', inplace=True)\n",
+ "\n",
+ "def evaluate_performance(test_csv):\n",
+ "    # Initialize counters for TP, TN, FP, FN\n",
+ "    TP = 0\n",
+ "    TN = 0\n",
+ "    FP = 0\n",
+ "    FN = 0\n",
+ "\n",
+ "    # Iterate over the DataFrame rows\n",
+ "    for index, row in test_csv.iterrows():\n",
+ "        # True Positive (TP): s_correct is True and MDM is True\n",
+ "        if row['s_correct'] and row['MDM']:\n",
+ "            TP += 1\n",
+ "        # True Negative (TN): s_thing is null and MDM is False\n",
+ "        elif row['s_thing'] == '' and not row['MDM']:\n",
+ "            TN += 1\n",
+ "        # False Positive (FP):\n",
+ "        # 1) s_thing is not null and MDM is False\n",
+ "        # OR\n",
+ "        # 2) s_thing is not null and s_correct is False and MDM is True\n",
+ "        elif (row['s_thing'] != '' and not row['MDM']) or (row['s_thing'] != '' and not row['s_correct'] and row['MDM']):\n",
+ "            FP += 1\n",
+ "        # False Negative (FN): s_thing is null and MDM is True\n",
+ "        elif row['s_thing'] == '' and row['MDM']:\n",
+ "            FN += 1\n",
+ "\n",
+ "    # Calculate total\n",
+ "    total = TP + TN + FP + FN\n",
+ "\n",
+ "    # Calculate Precision, Recall, and Accuracy\n",
+ "    precision = TP / (TP + FP) if (TP + FP) > 0 else 0\n",
+ "    recall = TP / (TP + FN) if (TP + FN) > 0 else 0\n",
+ "    accuracy = (TP + TN) / total if total > 0 else 0\n",
+ "\n",
+ "    return TP, TN, FP, FN, precision, recall, accuracy\n",
+ "\n",
+ "# Evaluate the dataset\n",
+ "tp_s_results = evaluate_performance(test_s_csv)\n",
+ "\n",
+ "# Print the results\n",
+ "print(\"Performance for test_s.csv:\")\n",
+ "print(f\"TP: {tp_s_results[0]}, TN: {tp_s_results[1]}, FP: {tp_s_results[2]}, FN: {tp_s_results[3]}\")\n",
+ "print(f\"Precision: {tp_s_results[4]:.4f}, Recall: {tp_s_results[5]:.4f}, Accuracy: {tp_s_results[6]:.4f}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/post_process/tfidf_class/1.make_sdl_class_document.py b/post_process/tfidf_class/1.make_sdl_class_document.py
new file mode 100644
index 0000000..377a41c
--- /dev/null
+++ b/post_process/tfidf_class/1.make_sdl_class_document.py
+import pandas as pd
+import re
+import os
+
+# Loop through group numbers from 1 to 5
+for group_number in range(1, 6):
+
+    # Path to the train_all file
+    train_all_path = f'data_preprocess/dataset/{group_number}/train_all.csv'
+
+    # Read the train_all data
+    train_all_csv = pd.read_csv(train_all_path, low_memory=False)
+
+    # Concatenate tag_description based on the combination of thing and property
+    tag_description_concatenated = train_all_csv.groupby(['thing', 'property'])['tag_description'].apply(lambda x: ' '.join(x)).reset_index()
+
+    # Concatenate tag_name based on the combination of thing and property
+    tag_name_concatenated = train_all_csv.groupby(['thing', 'property'])['tag_name'].apply(lambda x: ' '.join(x)).reset_index()
+
+    # Calculate mapping_count
+    mapping_count = train_all_csv.groupby(['thing', 'property']).size().reset_index(name='mapping_count')
+
+    # Merge the three DataFrames: mapping_count, tag_description_concatenated, and tag_name_concatenated
+    thing_property_grouped = pd.merge(mapping_count, tag_description_concatenated, on=['thing', 'property'])
+    thing_property_grouped = pd.merge(thing_property_grouped, tag_name_concatenated, on=['thing', 'property'])
+
+    # Calculate td_token_count by splitting tag_description on whitespace (r'\S+')
+    thing_property_grouped['td_token_count'] = thing_property_grouped['tag_description'].apply(lambda x: len(re.findall(r'\S+', x)))
+
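Each (thing, property) class is thus flattened into one synthetic "document". Downstream, 2.classify_by_tfidf.ipynb retrieves the nearest such document for every test description; a minimal, self-contained sketch of that lookup on toy data (illustrative names only):

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy class documents: concatenated descriptions per (thing, property)
class_docs = pd.DataFrame({
    'thing': ['GE1', 'ME1'],
    'property': ['Load', 'RPM'],
    'tag_description': ['G/E LOAD NO. 1 G/E LOAD', 'M/E RPM M/E REV'],
})
queries = pd.Series(['NO. 2 G/E LOAD', 'M/E SPEED'])

vec = TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 2))
vec.fit(pd.concat([class_docs['tag_description'], queries]))

sims = cosine_similarity(vec.transform(queries), vec.transform(class_docs['tag_description']))
best = sims.argmax(axis=1)  # index of the most similar class document per query
print(class_docs.iloc[best][['thing', 'property']].values, sims.max(axis=1))
```

+    # Create pattern by replacing digits in 'thing' and 'property' with '#'
+    thing_property_grouped['pattern'] = thing_property_grouped['thing'].str.replace(r'\d', '#', regex=True) + " " + thing_property_grouped['property'].str.replace(r'\d', '#', 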
regex=True) + + # Calculate the total number of unique thing_property combinations + total_thing_property_count = thing_property_grouped.shape[0] + + # Specify the output path + output_path = f'post_process/tfidf_class/0.class_document/{group_number}/sdl_class_rdoc.csv' + + # Create the directory if it doesn't exist + output_dir = os.path.dirname(output_path) + os.makedirs(output_dir, exist_ok=True) + + # Save the result to the CSV file + thing_property_grouped.to_csv(output_path, index=False, encoding='utf-8-sig') + + print(f"Concatenated data saved to {output_path}") + print(f"Total number of unique thing_property combinations: {total_thing_property_count}") diff --git a/post_process/tfidf_class/2.classify_by_tfidf.ipynb b/post_process/tfidf_class/2.classify_by_tfidf.ipynb new file mode 100644 index 0000000..ef2344d --- /dev/null +++ b/post_process/tfidf_class/2.classify_by_tfidf.ipynb @@ -0,0 +1,134 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy (MDM=True) for Group 1: 79.41%\n", + "Accuracy (MDM=True) for Group 2: 79.32%\n", + "Accuracy (MDM=True) for Group 3: 82.49%\n", + "Accuracy (MDM=True) for Group 4: 85.61%\n", + "Accuracy (MDM=True) for Group 5: 79.72%\n", + "Average Accuracy (MDM=True) across all groups: 81.31%\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "from tqdm import tqdm\n", + "import os\n", + "\n", + "# Initialize a list to store the accuracies for each group\n", + "accuracies = []\n", + "\n", + "# Loop through group numbers from 1 to 5\n", + "for group_number in range(1, 6):\n", + " \n", + " # Load the CSV files from the specified group\n", + " sdl_class_rdoc_path = f'0.class_document/{group_number}/sdl_class_rdoc.csv'\n", + " test_path = f'../../data_preprocess/dataset/{group_number}/test.csv'\n", + " \n", + " # Check if test file exists, if not, skip this iteration\n", + " if not os.path.exists(test_path):\n", + " print(f\"test file for Group {group_number} does not exist. 
Skipping...\")\n", + " continue\n", + " \n", + " sdl_class_rdoc_csv = pd.read_csv(sdl_class_rdoc_path, low_memory=False)\n", + " test_csv = pd.read_csv(test_path, low_memory=False)\n", + " \n", + " # Replace NaN values with empty strings in relevant columns\n", + " sdl_class_rdoc_csv['tag_description'] = sdl_class_rdoc_csv['tag_description'].fillna('')\n", + " test_csv['tag_description'] = test_csv['tag_description'].fillna('')\n", + " \n", + " # Initialize new columns in test_csv\n", + " test_csv['c_thing'] = ''\n", + " test_csv['c_property'] = ''\n", + " test_csv['c_score'] = ''\n", + " test_csv['c_duplicate'] = 0 # Initialize c_duplicate to store duplicate counts\n", + " \n", + " # Combine both sdl_class_rdoc and test CSVs tag_descriptions for TF-IDF Vectorizer training\n", + " combined_tag_descriptions = sdl_class_rdoc_csv['tag_description'].tolist() + test_csv['tag_description'].tolist()\n", + " \n", + " # Create a TF-IDF Vectorizer\n", + " vectorizer = TfidfVectorizer(\n", + " token_pattern=r'\\S+',\n", + " ngram_range=(1, 6), # Use ngrams from 1 to 6\n", + " )\n", + " \n", + " # Fit the TF-IDF vectorizer on the combined tag_descriptions\n", + " vectorizer.fit(combined_tag_descriptions)\n", + " \n", + " # Transform both sdl_class_rdoc and test CSVs into TF-IDF matrices\n", + " sdl_class_rdoc_tfidf_matrix = vectorizer.transform(sdl_class_rdoc_csv['tag_description'])\n", + " test_tfidf_matrix = vectorizer.transform(test_csv['tag_description'])\n", + " \n", + " # Calculate cosine similarity between test and class-level sdl_class_rdoc vectors\n", + " similarity_matrix = cosine_similarity(test_tfidf_matrix, sdl_class_rdoc_tfidf_matrix)\n", + " \n", + " # Find the most similar class-level tag_description for each test description\n", + " most_similar_indices = similarity_matrix.argmax(axis=1)\n", + " most_similar_scores = similarity_matrix.max(axis=1)\n", + " \n", + " # Assign the corresponding thing, property, and similarity score to the test CSV\n", + " test_csv['c_thing'] = sdl_class_rdoc_csv.iloc[most_similar_indices]['thing'].values\n", + " test_csv['c_property'] = sdl_class_rdoc_csv.iloc[most_similar_indices]['property'].values\n", + " test_csv['c_score'] = most_similar_scores\n", + " \n", + " # Check if the predicted 'c_thing' and 'c_property' match the actual 'thing' and 'property'\n", + " test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']\n", + " test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']\n", + " test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']\n", + " \n", + " # Calculate accuracy based only on MDM = True\n", + " mdm_true_count = len(test_csv[test_csv['MDM'] == True])\n", + " accuracy = (test_csv['ctp_correct'].sum() / mdm_true_count) * 100\n", + " accuracies.append(accuracy)\n", + " \n", + " print(f\"Accuracy (MDM=True) for Group {group_number}: {accuracy:.2f}%\")\n", + " \n", + " # Specify output file paths\n", + " output_path = f'0.class_document/{group_number}/test_p_c.csv'\n", + " test_csv.to_csv(output_path, index=False, encoding='utf-8-sig')\n", + " \n", + " # Filter for rows where MDM is True and ctp_correct is False\n", + " false_positive_rows = test_csv[(test_csv['MDM'] == True) & (test_csv['ctp_correct'] == False)]\n", + " \n", + " # Save false positives to a separate file\n", + " fp_output_path = f'0.class_document/{group_number}/fp_class.csv'\n", + " false_positive_rows.to_csv(fp_output_path, index=False, encoding='utf-8-sig')\n", + "\n", + "# Calculate and print the average 
accuracy across all groups\n", + "average_accuracy = sum(accuracies) / len(accuracies)\n", + "print(f\"Average Accuracy (MDM=True) across all groups: {average_accuracy:.2f}%\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/post_process/tfidf_class/3.refine.ipynb b/post_process/tfidf_class/3.refine.ipynb new file mode 100644 index 0000000..52ee5d0 --- /dev/null +++ b/post_process/tfidf_class/3.refine.ipynb @@ -0,0 +1,144 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'p_correct'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'p_correct'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 22\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;66;03m# Assign c_thing, c_property to p_thing, p_property and set p_MDM to True if conditions are met\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index, row \u001b[38;5;129;01min\u001b[39;00m test_csv\u001b[38;5;241m.\u001b[39miterrows():\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mrow\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mp_correct\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;129;01mand\u001b[39;00m row[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mctp_correct\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m 23\u001b[0m update_count \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m 
\u001b[38;5;66;03m# Increment the counter\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# Check for duplicates within the same ships_idx\u001b[39;00m\n", + "File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/series.py:1121\u001b[0m, in \u001b[0;36mSeries.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[key]\n\u001b[1;32m 1120\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m key_is_scalar:\n\u001b[0;32m-> 1121\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_value\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1123\u001b[0m \u001b[38;5;66;03m# Convert generator to list before going through hashable part\u001b[39;00m\n\u001b[1;32m 1124\u001b[0m \u001b[38;5;66;03m# (We will iterate through the generator there to check for slices)\u001b[39;00m\n\u001b[1;32m 1125\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n", + "File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/series.py:1237\u001b[0m, in \u001b[0;36mSeries._get_value\u001b[0;34m(self, label, takeable)\u001b[0m\n\u001b[1;32m 1234\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[label]\n\u001b[1;32m 1236\u001b[0m \u001b[38;5;66;03m# Similar to Index.get_value, but we do not fall back to positional\u001b[39;00m\n\u001b[0;32m-> 1237\u001b[0m loc \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1239\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(loc):\n\u001b[1;32m 1240\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[loc]\n", + "File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'p_correct'" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "from tqdm import tqdm\n", + "\n", + "# Set the group number\n", + "group_number = 1 # Change this to the desired group number\n", + "\n", + "# Load the CSV files from the specified group\n", + "sdl_class_rdoc_path = f'0.class_document/{group_number}/sdl_class_rdoc.csv'\n", + "test_path = f'0.class_document/{group_number}/test_p_c.csv'\n", + "\n", + "sdl_class_rdoc_csv = pd.read_csv(sdl_class_rdoc_path, low_memory=False)\n", + "test_csv = pd.read_csv(test_path, low_memory=False)\n", + "\n", + "update_count = 0\n", + "duplicate_count = 0\n", + "non_duplicate_count = 0\n", + "\n", + "# Assign c_thing, c_property to p_thing, p_property and set p_MDM to True if conditions are met\n", + "for index, row in test_csv.iterrows():\n", + " if not row['p_correct'] and row['ctp_correct']:\n", + " update_count += 1 # Increment the counter\n", + "\n", + " # Check for duplicates within the same ships_idx\n", + " same_idx_rows = test_csv[(test_csv['ships_idx'] == row['ships_idx']) &\n", + " (test_csv['p_thing'] == row['c_thing']) &\n", + " (test_csv['p_property'] == row['c_property'])]\n", + "\n", + " if len(same_idx_rows) > 0:\n", + " duplicate_count += 1\n", + " else:\n", + " non_duplicate_count += 1\n", + "\n", + "# Print the results\n", + "print(f\"Total updates where p_correct is False and ctp_correct is True: {update_count}\")\n", + "print(f\"Number of rows with duplicates in the same ships_idx: {duplicate_count}\")\n", + "print(f\"Number of rows without duplicates in the same ships_idx: {non_duplicate_count}\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of updates made: 45\n", + "Updated test CSV saved to 0.class_document/1/test_p_c_r.csv\n", + "Refine CSV saved to refine.csv\n" + ] + } + ], + "source": [ + "update_count = 0\n", + "\n", + "# Initialize a list to hold rows that meet the conditions\n", + "refine_rows = []\n", + "\n", + "# Assign c_thing, c_property to p_thing, p_property and set p_MDM to True if conditions are met\n", + "for index, row in test_csv.iterrows():\n", + " if (not row['p_MDM'] and row['c_score'] >= 0.9 and \n", + " (row['p_thing'] != row['c_thing'] or row['p_property'] != row['c_property'])):\n", + " test_csv.at[index, 'p_thing'] = row['c_thing']\n", + " test_csv.at[index, 'p_property'] = row['c_property']\n", + " test_csv.at[index, 'p_MDM'] = True\n", + " update_count += 1 # Increment the counter\n", + " refine_rows.append(row) # Add the row to the refine list\n", + "\n", + "# Convert the list of refine rows into a DataFrame\n", + "refine_df = pd.DataFrame(refine_rows)\n", + "\n", + "# Save the refine DataFrame to a CSV file\n", + "refine_output_path = f'refine.csv'\n", + "refine_df.to_csv(refine_output_path, index=False, encoding='utf-8-sig')\n", + "\n", + "# Print the number of updates made\n", + "print(f\"Number of updates made: {update_count}\")\n", + "\n", + "# Save the updated test CSV\n", + "output_file_path = f'0.class_document/{group_number}/test_p_c_r.csv'\n", + 
"test_csv.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n", + " \n", + "print(f\"Updated test CSV saved to {output_file_path}\")\n", + "print(f\"Refine CSV saved to {refine_output_path}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/post_process/tfidf_class/4.selection_by_tfidf.py b/post_process/tfidf_class/4.selection_by_tfidf.py new file mode 100644 index 0000000..0af7bc6 --- /dev/null +++ b/post_process/tfidf_class/4.selection_by_tfidf.py @@ -0,0 +1,114 @@ +import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics.pairwise import cosine_similarity +from tqdm import tqdm +import os + +group_number = 1 +# Load the CSV files +test_path = f'post_process/tfidf_class/0.class_document/{group_number}/test_p_c.csv' +test_path = f'post_process/tfidf_class/0.class_document/{group_number}/test_p_c_r.csv' +ship_data_list_reference_doc_file_path = f'post_process/tfidf_class/0.class_document/{group_number}/sdl_class_rdoc.csv' + +test_csv = pd.read_csv(test_path, low_memory=False) +sdl_rdoc = pd.read_csv(ship_data_list_reference_doc_file_path) + +# Initialize new columns in test_csv +test_csv['s_score'] = -1 +test_csv['s_thing'] = '' +test_csv['s_property'] = '' +test_csv['s_correct'] = False + +duplicate_filtered = test_csv[(test_csv['p_MDM'] == True)].copy() + +# Create a mapping from thing/property to reference_doc +thing_property_to_reference_doc = sdl_rdoc.set_index(['thing', 'property'])['tag_description'].to_dict() + +# Calculate s_score for duplicate rows +for ships_idx, group in tqdm(duplicate_filtered.groupby('ships_idx'), desc="Processing duplicates"): + for (p_thing, p_property), sub_group in group.groupby(['p_thing', 'p_property']): + sub_group = sub_group.copy() + tag_descriptions = sub_group['tag_description'].tolist() + + # Get the reference document for the corresponding p_thing and p_property + reference_doc = thing_property_to_reference_doc.get((p_thing, p_property), '') + + if reference_doc: + # Combine the tag_descriptions and the reference_doc for fit_transform + combined_descriptions = tag_descriptions + [reference_doc] + + # Create a new TF-IDF Vectorizer for this specific group + vectorizer = TfidfVectorizer( + token_pattern=r'\S+', + norm='l2', # Use L2 normalization + ngram_range=(1, 7), # Use both unigrams and bigrams + ) + + # Fit and transform the combined descriptions + tfidf_matrix = vectorizer.fit_transform(combined_descriptions) + + # Separate the test_tfidf_matrix and reference_vector + test_tfidf_matrix = tfidf_matrix[:-1] # All but the last one + reference_vector = tfidf_matrix[-1] # The last one + + # Calculate the cosine similarity between the test descriptions and the reference_doc + sub_group['s_score'] = cosine_similarity(test_tfidf_matrix, reference_vector).flatten() + else: + sub_group['s_score'] = 0 + + # Update the s_score values back into the original test_csv + duplicate_filtered.loc[sub_group.index, 's_score'] = sub_group['s_score'] + +for ships_idx, group in tqdm(duplicate_filtered.groupby('ships_idx'), desc="Processing duplicates"): + for (p_thing, p_property), sub_group in group.groupby(['p_thing', 
diff --git a/post_process/tfidf_class/4.selection_by_tfidf.py b/post_process/tfidf_class/4.selection_by_tfidf.py
new file mode 100644
index 0000000..0af7bc6
--- /dev/null
+++ b/post_process/tfidf_class/4.selection_by_tfidf.py
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from tqdm import tqdm
+import os
+
+group_number = 1
+# Load the refined predictions produced by 3.refine.ipynb
+test_path = f'post_process/tfidf_class/0.class_document/{group_number}/test_p_c_r.csv'
+ship_data_list_reference_doc_file_path = f'post_process/tfidf_class/0.class_document/{group_number}/sdl_class_rdoc.csv'
+
+test_csv = pd.read_csv(test_path, low_memory=False)
+sdl_rdoc = pd.read_csv(ship_data_list_reference_doc_file_path)
+
+# Initialize new columns in test_csv
+test_csv['s_score'] = -1
+test_csv['s_thing'] = ''
+test_csv['s_property'] = ''
+test_csv['s_correct'] = False
+
+duplicate_filtered = test_csv[(test_csv['p_MDM'] == True)].copy()
+
+# Create a mapping from thing/property to reference_doc
+thing_property_to_reference_doc = sdl_rdoc.set_index(['thing', 'property'])['tag_description'].to_dict()
+
+# Calculate s_score for duplicate rows
+for ships_idx, group in tqdm(duplicate_filtered.groupby('ships_idx'), desc="Scoring duplicates"):
+    for (p_thing, p_property), sub_group in group.groupby(['p_thing', 'p_property']):
+        sub_group = sub_group.copy()
+        tag_descriptions = sub_group['tag_description'].tolist()
+
+        # Get the reference document for the corresponding p_thing and p_property
+        reference_doc = thing_property_to_reference_doc.get((p_thing, p_property), '')
+
+        if reference_doc:
+            # Combine the tag_descriptions and the reference_doc for fit_transform
+            combined_descriptions = tag_descriptions + [reference_doc]
+
+            # Create a new TF-IDF Vectorizer for this specific group
+            vectorizer = TfidfVectorizer(
+                token_pattern=r'\S+',
+                norm='l2',  # Use L2 normalization
+                ngram_range=(1, 7),  # Use n-grams from 1 up to 7 tokens
+            )
+
+            # Fit and transform the combined descriptions
+            tfidf_matrix = vectorizer.fit_transform(combined_descriptions)
+
+            # Separate the test_tfidf_matrix and reference_vector
+            test_tfidf_matrix = tfidf_matrix[:-1]  # All but the last one
+            reference_vector = tfidf_matrix[-1]  # The last one
+
+            # Calculate the cosine similarity between the test descriptions and the reference_doc
+            sub_group['s_score'] = cosine_similarity(test_tfidf_matrix, reference_vector).flatten()
+        else:
+            sub_group['s_score'] = 0
+
+        # Write the s_score values back into duplicate_filtered
+        duplicate_filtered.loc[sub_group.index, 's_score'] = sub_group['s_score']
+
+for ships_idx, group in tqdm(duplicate_filtered.groupby('ships_idx'), desc="Selecting best matches"):
+    for (p_thing, p_property), sub_group in group.groupby(['p_thing', 'p_property']):
+        if (sub_group['s_score'] == -1).any():
+            best_index = sub_group.index.min()
+        else:
+            # Find the index of the row with the highest s_score
+            best_index = sub_group['s_score'].idxmax()
+
+        # Assign s_thing and s_property only to the row with the highest s_score
+        duplicate_filtered.at[best_index, 's_thing'] = sub_group.at[best_index, 'p_thing']
+        duplicate_filtered.at[best_index, 's_property'] = sub_group.at[best_index, 'p_property']
+
+# Now, update the original test_csv with the changes made in duplicate_filtered
+test_csv.update(duplicate_filtered[['s_thing', 's_property', 's_score']])
+
+# Calculate s_correct
+test_csv['s_correct'] = ((test_csv['thing'] == test_csv['s_thing']) &
+                         (test_csv['property'] == test_csv['s_property']) &
+                         (test_csv['MDM']))
+
+# Calculate the percentage of correct s_thing and s_property
+mdm_true_count = test_csv['MDM'].sum()
+s_correct_count = test_csv['s_correct'].sum()
+s_correct_percentage = (s_correct_count / mdm_true_count) * 100
+
+print(f"s_correct count: {s_correct_count}")
+print(f"MDM true count: {mdm_true_count}")
+print(f"s_correct percentage: {s_correct_percentage:.2f}%")
+
+# Save the updated DataFrame to a new CSV file
+output_path = f'post_process/0.result/{group_number}/test_s.csv'
+os.makedirs(os.path.dirname(output_path), exist_ok=True)
+test_csv.to_csv(output_path, index=False, encoding='utf-8-sig')
+
+print(f"Updated data saved to {output_path}")
+
+# Check for duplicates in s_thing and s_property within each ships_idx
+print("\nShips_idx with duplicate s_thing and s_property:")
+duplicate_ships_idx = []
+
+for ships_idx, group in test_csv.groupby('ships_idx'):
+    # Exclude rows with empty s_thing or s_property
+    non_empty_group = group[(group['s_thing'] != '') & (group['s_property'] != '')]
+    duplicate_entries = non_empty_group[non_empty_group.duplicated(subset=['s_thing', 's_property'], keep=False)]
+    if not duplicate_entries.empty:
+        duplicate_ships_idx.append(ships_idx)
+        print(f"Ships_idx: {ships_idx}")
+        print(duplicate_entries[['s_thing', 's_property']])
+
+if not duplicate_ships_idx:
+    print("No duplicates found.")
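The selection pass boils down to: within each ship, when several descriptions claim the same predicted class, keep only the best-scoring claimant. A compact sketch of that dedup rule (toy values; `idxmax` picks the row index of the highest s_score per group):

```python
import pandas as pd

preds = pd.DataFrame({
    'ships_idx':  [1000, 1000, 1000],
    'p_thing':    ['GE1', 'GE1', 'ME1'],
    'p_property': ['Load', 'Load', 'RPM'],
    's_score':    [0.31, 0.87, 0.55],
})

# Keep only the best-scoring row per (ship, predicted class); other rows stay unassigned
best = preds.groupby(['ships_idx', 'p_thing', 'p_property'])['s_score'].idxmax()
preds['s_thing'] = ''
preds.loc[best, 's_thing'] = preds.loc[best, 'p_thing']
print(preds)
```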
exist\n", + " if not os.path.exists(train_file_path) or not os.path.exists(valid_file_path) or not os.path.exists(test_file_path):\n", + " raise FileNotFoundError(f\"One or more files for group {group_number} do not exist.\")\n", + " \n", + " # Load the CSV files into DataFrames\n", + " train_data = pd.read_csv(train_file_path)\n", + " valid_data = pd.read_csv(valid_file_path)\n", + " test_data = pd.read_csv(test_file_path)\n", + " \n", + " return train_data, valid_data, test_data\n", + "\n", + "\n", + "try:\n", + " train_data, valid_data, test_data = load_group_data(group_number)\n", + " print(f\"Loaded data for group {group_number}:\")\n", + " print(f\"Train data shape: {train_data.shape}\")\n", + " print(f\"Valid data shape: {valid_data.shape}\")\n", + " print(f\"Test data shape: {test_data.shape}\")\n", + "except FileNotFoundError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "313f98ef12eb442bac319282e5ffe5d6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Saving the dataset (0/1 shards): 0%| | 0/6125 [00:00{str(row['tag_description'])}\"\n", + " elif mode == 'tn_td':\n", + " input_str = f\"{str(row['tag_name'])}{str(row['tag_description'])}\"\n", + " elif mode == 'tn_td_min_max':\n", + " input_str = f\"{str(row['tag_name'])}{str(row['tag_description'])}{row['min']}{row['max']}\"\n", + " elif mode == 'td_min_max':\n", + " input_str = f\"{str(row['tag_description'])}{row['min']}{row['max']}\" \n", + " elif mode == 'td_unit':\n", + " input_str = f\"{str(row['tag_description'])}{str(row['unit'])}\" \n", + " elif mode == 'tn_td_unit':\n", + " input_str = f\"{str(row['tag_name'])}{str(row['tag_description'])}{str(row['unit'])}\" \n", + " else:\n", + " raise ValueError(\"Invalid mode specified\")\n", + " \n", + " output_list.append({\n", + " 'translation': {\n", + " 'ships_idx': row['ships_idx'],\n", + " 'input': input_str,\n", + " 'thing_property': f\"{str(row['thing'])}{str(row['property'])}\",\n", + " 'answer': f\"{str(row['thing'])} {str(row['property'])}\",\n", + " }\n", + " })\n", + " except Exception as e:\n", + " print(f\"Error processing row at index {idx}: {row}\")\n", + " print(f\"Exception: {e}\")\n", + " return output_list\n", + "\n", + "\n", + "# Combine the mode and group information into a single dictionary\n", + "combined_dict = {\n", + " \"mode\": mode,\n", + " \"fold_group\": group_number\n", + "}\n", + "\n", + "# Save the combined dictionary to a JSON file\n", + "with open(\"mode.json\", \"w\") as json_file:\n", + " json.dump(combined_dict, json_file)\n", + " \n", + "try:\n", + " # Process the data and create a DatasetDict\n", + " combined_data = DatasetDict({\n", + " 'train': Dataset.from_list(process_df(train_data, mode=mode)),\n", + " 'test': Dataset.from_list(process_df(test_data, mode=mode)),\n", + " 'validation': Dataset.from_list(process_df(valid_data, mode=mode)),\n", + " })\n", + " # Save the DatasetDict to disk\n", + " combined_data.save_to_disk(f\"combined_data/{mode}/{group_number}\")\n", + " print(\"Dataset saved to 'combined_data'\")\n", + "except Exception as e:\n", + " print(f\"Error creating DatasetDict: {e}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/translation/t5/2.t5_train.ipynb b/translation/t5/2.t5_train.ipynb new file mode 100644 index 0000000..ce98df6 --- /dev/null +++ b/translation/t5/2.t5_train.ipynb @@ -0,0 +1,477 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# t5 training for combined concatenated outputs (thing + property) \n", + "\n", + "refer to `t5_train_tp.py` and `guide_for_tp.md` for faster training workflow" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The mode has been set to: tn_td_unit\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d8d70681f4594917b7af4583a4237168", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/6125 [00:00\", \"\", \"\", \"\"]\n", + "additional_special_tokens = [\"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\"]\n", + "# Add the additional special tokens to the tokenizer\n", + "tokenizer.add_special_tokens({\"additional_special_tokens\": additional_special_tokens})\n", + "\n", + "max_length = 64\n", + "\n", + "def preprocess_function(examples):\n", + " inputs = [ex[\"input\"] for ex in examples['translation']]\n", + " targets = [ex[\"thing_property\"] for ex in examples['translation']]\n", + " # text_target sets the corresponding label to inputs\n", + " # there is no need to create a separate 'labels'\n", + " model_inputs = tokenizer(\n", + " inputs, text_target=targets, max_length=max_length, truncation=True\n", + " )\n", + " return model_inputs\n", + "\n", + "# map method maps preprocess_function to [train, valid, test] datasets of the datasetDict\n", + "tokenized_datasets = split_datasets.map(\n", + " preprocess_function,\n", + " batched=True,\n", + " remove_columns=split_datasets[\"train\"].column_names,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [3840/3840 42:37, Epoch 80/80]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
5002.812300
10000.699300
15000.440900
20000.332100
25000.276500
30000.245900
35000.229300

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all 
'\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 
0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all 
'\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 
0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all 
'\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 
0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n" + ] + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=3840, training_loss=0.6754856963952383, metrics={'train_runtime': 2559.4201, 'train_samples_per_second': 191.45, 'train_steps_per_second': 1.5, 'total_flos': 3.156037495934976e+16, 'train_loss': 0.6754856963952383, 'epoch': 80.0})" + ] 
+ }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "import os\n", + "import json\n", + "\n", + "# we use the pre-trained t5-base model\n", + "from transformers import AutoModelForSeq2SeqLM\n", + "model_checkpoint = model_name\n", + "model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)\n", + "\n", + "# data collator\n", + "from transformers import DataCollatorForSeq2Seq\n", + "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n", + "\n", + "# evaluation \n", + "import evaluate\n", + "metric = evaluate.load(\"sacrebleu\")\n", + "import numpy as np\n", + "\n", + "\n", + "def compute_metrics(eval_preds):\n", + " preds, labels = eval_preds\n", + " # In case the model returns more than the prediction logits\n", + " if isinstance(preds, tuple):\n", + " preds = preds[0]\n", + "\n", + " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n", + "\n", + " # Replace -100s in the labels as we can't decode them\n", + " labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n", + " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n", + "\n", + " # Some simple post-processing\n", + " decoded_preds = [pred.strip() for pred in decoded_preds]\n", + " decoded_labels = [[label.strip()] for label in decoded_labels]\n", + "\n", + " result = metric.compute(predictions=decoded_preds, references=decoded_labels)\n", + " return {\"bleu\": result[\"score\"]}\n", + "\n", + "from transformers import Seq2SeqTrainingArguments\n", + "\n", + "# load environment variables to disable GPU p2p mode for multi-gpu training without p2p mode\n", + "# not required for single-gpu training\n", + "import os\n", + "os.environ['NCCL_P2P_DISABLE'] = '1'\n", + "os.environ['NCCL_IB_DISABLE'] = '1'\n", + "\n", + "args = Seq2SeqTrainingArguments(\n", + " f\"train_{fold_group}_{model_name}_{mode}_{train_epochs}\",\n", + " evaluation_strategy=\"no\",\n", + " # logging_dir=\"tensorboard-log\",\n", + " # logging_strategy=\"epoch\",\n", + " save_strategy=\"epoch\",\n", + " learning_rate=2e-5,\n", + " per_device_train_batch_size=32,\n", + " per_device_eval_batch_size=64,\n", + " auto_find_batch_size=True,\n", + " ddp_find_unused_parameters=False,\n", + " weight_decay=0.01,\n", + " save_total_limit=1,\n", + " num_train_epochs=train_epochs,\n", + " predict_with_generate=True,\n", + " bf16=True,\n", + " push_to_hub=False,\n", + ")\n", + "\n", + "from transformers import Seq2SeqTrainer\n", + "\n", + "trainer = Seq2SeqTrainer(\n", + " model,\n", + " args,\n", + " train_dataset=tokenized_datasets[\"train\"],\n", + " eval_dataset=tokenized_datasets[\"validation\"],\n", + " data_collator=data_collator,\n", + " tokenizer=tokenizer,\n", + " compute_metrics=compute_metrics,\n", + ")\n", + "\n", + "# Train the model\n", + "trainer.train()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/translation/t5/3.produce_test_predictions.ipynb b/translation/t5/3.produce_test_predictions.ipynb new file mode 100644 index 0000000..6c773c4 --- /dev/null +++ b/translation/t5/3.produce_test_predictions.ipynb @@ -0,0 +1,447 @@ +{ + "cells": [ 
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Goal: end to end inference and evaluation\n", + "\n", + "given a csv, make predictions and evaluate predictions, then return results in a csv" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The mode has been set to: tn_td_unit t5-base\n", + "Using model checkpoint: train_1_t5-base_tn_td_unit_80/checkpoint-3840\n", + "Columns in df_org:\n", + "['thing', 'property', 'ships_idx', 'tag_name', 'tag_description', 'signal_type', 'min', 'max', 'unit', 'data_type', 'thing_pattern', 'property_pattern', 'pattern', 'MDM', 'org_tag_description']\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import os\n", + "import json\n", + "\n", + "# Read the mode from the JSON file\n", + "with open(\"mode.json\", \"r\") as json_file:\n", + " mode_dict = json.load(json_file)\n", + "\n", + "\n", + "# Set the mode variable from the JSON content\n", + "mode = mode_dict.get(\"mode\", \"none\") # 'default_value' is a fallback if 'mode' is not found\n", + "model_name = mode_dict.get(\"model\", \"none\") # 'default_value' is a fallback if 'mode' is not found\n", + "train_epochs = mode_dict.get(\"train_epochs\", \"none\") # 'default_value' is a fallback if 'mode' is not found\n", + "fold_group = mode_dict.get(\"fold_group\", \"none\") # 'default_value' is a fallback if 'mode' is not found\n", + "\n", + "print(f\"The mode has been set to: {mode} {model_name}\")\n", + "\n", + "# Define the base directory where checkpoints are stored\n", + "base_dir = f\"train_{fold_group}_{model_name}_{mode}_{train_epochs}\"\n", + "\n", + "# List all subdirectories in the base directory\n", + "subdirectories = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]\n", + "\n", + "# Filter for checkpoint directories that match the pattern \"checkpoint-\"\n", + "checkpoints = [d for d in subdirectories if d.startswith(\"checkpoint-\")]\n", + "\n", + "# Select the latest checkpoint (the one with the highest number)\n", + "if checkpoints:\n", + " latest_checkpoint = checkpoints[0]\n", + " model_checkpoint = os.path.join(base_dir, latest_checkpoint)\n", + " print(f\"Using model checkpoint: {model_checkpoint}\")\n", + "else:\n", + " print(\"No checkpoints were found.\")\n", + " model_checkpoint = None # Handle this case as needed\n", + "\n", + "# Load the data\n", + "data_path = f\"../../data_preprocess/dataset/{fold_group}/test.csv\" # Adjust the CSV file path as necessary\n", + "\n", + "try:\n", + " df = pd.read_csv(data_path)\n", + "except UnicodeDecodeError:\n", + " df = pd.read_csv(data_path, encoding='ISO-8859-1')\n", + "\n", + "\n", + "# Drop rows where 'tag_description' is NaN and reset the index\n", + "df = df.dropna(subset=['tag_description']).reset_index(drop=True)\n", + "\n", + "# Preserve df_org\n", + "df_org = df.copy()\n", + "\n", + "# Print the column names of df_org\n", + "print(\"Columns in df_org:\")\n", + "print(df_org.columns.tolist())\n", + "\n", + "selected_columns = ['thing', 'property', 'tag_description', 'min', 'max', 'MDM', 'pattern']\n", + "df[selected_columns] = df[selected_columns].astype(\"string\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The test_dataset contains 14718 items.\n" + ] + } + ], + "source": [ + "from datasets import Dataset\n", + "\n", + "def process_df(df, 
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The test_dataset contains 14718 items.\n"
+ ]
+ }
+ ],
+ "source": [
+ "from datasets import Dataset\n",
+ "\n",
+ "def process_df(df, mode='only_td'):\n",
+ "    output_list = []\n",
+ "    for _, row in df.iterrows():\n",
+ "        try:\n",
+ "            if mode == 'only_td':\n",
+ "                input_str = f\"{str(row['tag_description'])}\"\n",
+ "            elif mode == 'tn_td':\n",
+ "                input_str = f\"{str(row['tag_name'])}{str(row['tag_description'])}\"\n",
+ "            elif mode == 'tn_td_min_max':\n",
+ "                input_str = f\"{str(row['tag_name'])}{str(row['tag_description'])}{row['min']}{row['max']}\"\n",
+ "            elif mode == 'td_min_max':\n",
+ "                input_str = f\"{str(row['tag_description'])}{row['min']}{row['max']}\"\n",
+ "            elif mode == 'td_unit':\n",
+ "                input_str = f\"{str(row['tag_description'])}{str(row['unit'])}\"\n",
+ "            elif mode == 'tn_td_unit':\n",
+ "                input_str = f\"{str(row['tag_name'])}{str(row['tag_description'])}{str(row['unit'])}\"\n",
+ "            else:\n",
+ "                raise ValueError(\"Invalid mode specified\")\n",
+ "\n",
+ "            output_list.append({\n",
+ "                'translation': {\n",
+ "                    'ships_idx': row['ships_idx'],\n",
+ "                    'input': input_str,\n",
+ "                    'thing_property': f\"{row['thing']}{row['property']}\",\n",
+ "                    'answer_thing': f\"{row['thing']}\",\n",
+ "                    'answer_property': f\"{row['property']}\",\n",
+ "                    'MDM': f\"{row['MDM']}\",\n",
+ "                }\n",
+ "            })\n",
+ "        except Exception as e:\n",
+ "            print(f\"Error processing row: {row}\")\n",
+ "            print(f\"Exception: {e}\")\n",
+ "    return output_list\n",
+ "\n",
+ "# Process the DataFrame\n",
+ "processed_data = process_df(df, mode=mode)\n",
+ "\n",
+ "# Create a Dataset object\n",
+ "test_dataset = Dataset.from_list(processed_data)\n",
+ "\n",
+ "# Print the number of items in the dataset\n",
+ "print(f\"The test_dataset contains {len(test_dataset)} items.\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers.pipelines.pt_utils import KeyDataset\n",
+ "from transformers import pipeline\n",
+ "from tqdm import tqdm\n",
+ "from transformers import AutoTokenizer\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name, return_tensors=\"pt\")\n",
+ "# Define additional special tokens\n",
+ "# additional_special_tokens = [\"<THING_START>\", \"<THING_END>\", \"<PROPERTY_START>\", \"<PROPERTY_END>\"]\n",
+ "additional_special_tokens = [\"<THING_START>\", \"<THING_END>\", \"<PROPERTY_START>\", \"<PROPERTY_END>\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\"]\n",
+ "\n",
+ "# Add the additional special tokens to the tokenizer\n",
+ "tokenizer.add_special_tokens({\"additional_special_tokens\": additional_special_tokens})\n",
+ "# tokenizer.add_special_tokens({'sep_token': \"\"})\n",
+ "\n",
+ "pipe = pipeline(\"translation_XX_to_YY\", model=model_checkpoint, tokenizer=tokenizer, return_tensors=True, max_length=128, device=0)\n",
+ "\n",
+ "# check what token ids the special tokens received\n",
+ "# tokenizer.encode(\"<THING_START>\")\n",
+ "\n",
+ "def extract_seq(tokens, start_value, end_value):\n",
+ "    if start_value not in tokens or end_value not in tokens:\n",
+ "        return None  # start or end marker missing from the generated sequence\n",
+ "    start_id = tokens.index(start_value)\n",
+ "    end_id = tokens.index(end_value)\n",
+ "    return tokens[start_id + 1:end_id]\n",
+ "\n",
+ "# If either marker is absent, extract_seq returns None and the prediction stays None\n",
+ "def process_tensor_output(output):\n",
+ "    tokens = output[0]['translation_token_ids'].tolist()\n",
+ "    thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = <THING_START>, 32101 = <THING_END>\n",
+ "    property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>\n",
+ "    p_thing = None\n",
+ "    p_property = None\n",
+ "    if thing_seq is not None:\n",
+ "        p_thing = tokenizer.decode(thing_seq)\n",
+ "    if property_seq is not None:\n",
+ "        p_property = tokenizer.decode(property_seq)\n",
+ "    return p_thing, p_property"
+ ]
+ },
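+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`process_tensor_output` recovers the thing and property spans from the generated token ids by slicing between the start/end marker ids (32100-32103). A small sketch of how `extract_seq` behaves, with made-up token ids:\n",
+ "\n",
+ "```python\n",
+ "tokens = [32100, 101, 102, 32101, 32102, 201, 32103]  # toy sequence\n",
+ "extract_seq(tokens, 32100, 32101)  # -> [101, 102] (the thing span)\n",
+ "extract_seq(tokens, 32102, 32103)  # -> [201] (the property span)\n",
+ "extract_seq(tokens, 32100, 99999)  # -> None (end marker missing)\n",
+ "```"
+ ]
+ },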
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "making inference on test set\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "14718it [00:44, 330.24it/s] "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "inference done\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "p_thing_list = []\n",
+ "p_property_list = []\n",
+ "print(\"making inference on test set\")\n",
+ "for out in tqdm(pipe(KeyDataset(test_dataset[\"translation\"], \"input\"), batch_size=256)):\n",
+ "    p_thing, p_property = process_tensor_output(out)\n",
+ "    p_thing_list.append(p_thing)\n",
+ "    p_property_list.append(p_property)\n",
+ "print(\"inference done\")"
+ ]
+ },
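+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Accuracy below is computed only over the rows with `MDM == \"True\"` (2,006 of the 14,718 test rows): predictions on non-MDM rows are marked incorrect, and the denominator is the MDM row count. For example, the thing accuracy works out to 1985 / 2006 ≈ 0.9895."
+ ]
+ },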
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Thing prediction accuracy: 0.9895314057826521\n",
+ "Correct thing predictions: 1985, Incorrect thing predictions: 21\n",
+ "Property prediction accuracy: 0.9661016949152542\n",
+ "Correct property predictions: 1938, Incorrect property predictions: 68\n",
+ "Total accuracy: 0.9596211365902293\n",
+ "Correct total predictions: 1925, Incorrect total predictions: 81\n"
+ ]
+ }
+ ],
+ "source": [
+ "answer_thing = [item['answer_thing'] for item in test_dataset[\"translation\"]]\n",
+ "answer_property = [item['answer_property'] for item in test_dataset[\"translation\"]]\n",
+ "mdm_list = [item['MDM'] for item in test_dataset[\"translation\"]]\n",
+ "\n",
+ "mdm_count = sum(1 for m in mdm_list if m == \"True\")\n",
+ "\n",
+ "def correctness_test(predictions, reference, mdm_list):\n",
+ "    assert len(predictions) == len(reference)\n",
+ "    correctness_list = []\n",
+ "    for i in range(len(predictions)):\n",
+ "        if mdm_list[i] == \"True\":\n",
+ "            correctness_list.append(predictions[i] == reference[i])\n",
+ "        else:\n",
+ "            correctness_list.append(False)\n",
+ "    return correctness_list\n",
+ "\n",
+ "# Compare with the answers to evaluate correctness\n",
+ "thing_correctness = correctness_test(p_thing_list, answer_thing, mdm_list)\n",
+ "property_correctness = correctness_test(p_property_list, answer_property, mdm_list)\n",
+ "correctness_mdm = [t and p for t, p in zip(thing_correctness, property_correctness)]\n",
+ "\n",
+ "# Calculate accuracy over the MDM rows\n",
+ "thing_accuracy = sum(thing_correctness) / mdm_count\n",
+ "property_accuracy = sum(property_correctness) / mdm_count\n",
+ "total_accuracy = sum(correctness_mdm) / mdm_count\n",
+ "\n",
+ "# Count correct/incorrect predictions among the MDM rows\n",
+ "thing_true_count = thing_correctness.count(True)\n",
+ "thing_false_count = mdm_count - thing_true_count\n",
+ "property_true_count = property_correctness.count(True)\n",
+ "property_false_count = mdm_count - property_true_count\n",
+ "total_true_count = correctness_mdm.count(True)\n",
+ "total_false_count = mdm_count - total_true_count\n",
+ "\n",
+ "# Print results\n",
+ "print(\"Thing prediction accuracy:\", thing_accuracy)\n",
+ "print(f\"Correct thing predictions: {thing_true_count}, Incorrect thing predictions: {thing_false_count}\")\n",
+ "print(\"Property prediction accuracy:\", property_accuracy)\n",
+ "print(f\"Correct property predictions: {property_true_count}, Incorrect property predictions: {property_false_count}\")\n",
+ "print(\"Total accuracy:\", total_accuracy)\n",
+ "print(f\"Correct total predictions: {total_true_count}, Incorrect total predictions: {total_false_count}\")\n",
+ "\n",
+ "# Create a DataFrame with the results\n",
+ "pred_dict = {\n",
+ "    'p_thing': p_thing_list,\n",
+ "    'p_property': p_property_list,\n",
+ "    'p_thing_correct': thing_correctness,\n",
+ "    'p_property_correct': property_correctness\n",
+ "}\n",
+ "\n",
+ "df_pred = pd.DataFrame(pred_dict)\n",
+ "\n",
+ "# Read the run configuration from the JSON file\n",
+ "with open(\"mode.json\", \"r\") as json_file:\n",
+ "    mode_dict = json.load(json_file)\n",
+ "\n",
+ "# Record the model and epoch count used for this run\n",
+ "mode_dict[\"model\"] = model_name\n",
+ "mode_dict[\"train_epochs\"] = train_epochs\n",
+ "\n",
+ "# Save the updated dictionary back to the JSON file\n",
+ "with open(\"mode.json\", \"w\") as json_file:\n",
+ "    json.dump(mode_dict, json_file)\n",
+ "\n",
+ "# Check if the file exists and is not empty\n",
+ "if os.path.exists(\"results.json\") and os.path.getsize(\"results.json\") > 0:\n",
+ "    # Read the existing results.json file\n",
+ "    with open(\"results.json\", \"r\") as json_file:\n",
+ "        try:\n",
+ "            results_dict = json.load(json_file)\n",
+ "        except json.JSONDecodeError:\n",
+ "            results_dict = {}\n",
+ "else:\n",
+ "    results_dict = {}\n",
+ "\n",
+ "# Add the new model_checkpoint key with the accuracy values as an object\n",
+ "model_key = model_checkpoint\n",
+ "\n",
+ "results_dict[model_key] = {\n",
+ "    \"thing_accuracy\": thing_accuracy,\n",
+ "    \"thing_true\": thing_true_count,\n",
+ "    \"thing_false\": thing_false_count,\n",
+ "    \"property_accuracy\": property_accuracy,\n",
+ "    \"property_true\": property_true_count,\n",
+ "    \"property_false\": property_false_count,\n",
+ "    \"total_accuracy\": total_accuracy,\n",
+ "    \"total_true\": total_true_count,\n",
+ "    \"total_false\": total_false_count\n",
+ "}\n",
+ "\n",
+ "# Save the updated dictionary back to the results.json file\n",
+ "with open(\"results.json\", \"w\") as json_file:\n",
+ "    json.dump(results_dict, json_file, indent=4)"
+ ]
+ },
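+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The final cell merges the predictions back into `df_org`, masks digits with `#` to form `thing property` patterns, and flags a prediction as in-MDM when its pattern appears in the master list. A minimal sketch of the masking, with a made-up prediction:\n",
+ "\n",
+ "```python\n",
+ "import re\n",
+ "thing, prop = 'ME1TurboCharger1', 'ExhGasInletTemp'  # toy values\n",
+ "pattern = re.sub(r'\\d', '#', thing) + ' ' + re.sub(r'\\d', '#', prop)\n",
+ "# pattern == 'ME#TurboCharger# ExhGasInletTemp'\n",
+ "```"
+ ]
+ },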
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Updated data saved to ../0.result/1/test_p.csv\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "\n",
+ "# Create a DataFrame with the results\n",
+ "df_pred = pd.DataFrame({\n",
+ "    'p_thing': p_thing_list,\n",
+ "    'p_property': p_property_list,\n",
+ "    'p_thing_correct': thing_correctness,\n",
+ "    'p_property_correct': property_correctness,\n",
+ "})\n",
+ "\n",
+ "# Merge predictions with the original DataFrame (df_org)\n",
+ "df_org['p_thing'] = df_pred['p_thing']\n",
+ "df_org['p_property'] = df_pred['p_property']\n",
+ "df_org['p_thing_correct'] = df_pred['p_thing_correct']\n",
+ "df_org['p_property_correct'] = df_pred['p_property_correct']\n",
+ "df_org['p_correct'] = df_org['p_thing_correct'] & df_org['p_property_correct']\n",
+ "\n",
+ "df_master = pd.read_csv('../../data_import/data_model_master_export.csv')\n",
+ "\n",
+ "# Mask digits with '#' to form patterns, for both the ground truth and the predictions\n",
+ "df_org['pattern'] = df_org['thing'].str.replace(r'\\d', '#', regex=True) + \" \" + df_org['property'].str.replace(r'\\d', '#', regex=True)\n",
+ "df_org['p_pattern'] = df_org['p_thing'].str.replace(r'\\d', '#', regex=True) + \" \" + df_org['p_property'].str.replace(r'\\d', '#', regex=True)\n",
+ "df_master['master_pattern'] = df_master['thing'] + \" \" + df_master['property']\n",
+ "\n",
+ "# Create a set of unique patterns from the master list for fast lookup\n",
+ "master_patterns = set(df_master['master_pattern'])\n",
+ "df_org['p_MDM'] = df_org['p_pattern'].apply(lambda x: x in master_patterns)\n",
+ "\n",
+ "output_path = f\"../0.result/{fold_group}/test_p.csv\"\n",
+ "debug_output_path = f\"0.dresult/{fold_group}/test_p.csv\"\n",
+ "\n",
+ "# Create the output folders if they do not exist\n",
+ "os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
+ "df_org.to_csv(output_path, index=False, encoding='utf-8-sig')\n",
+ "\n",
+ "os.makedirs(os.path.dirname(debug_output_path), exist_ok=True)\n",
+ "df_org.to_csv(debug_output_path, index=False, encoding='utf-8-sig')\n",
+ "\n",
+ "print(f\"Updated data saved to {output_path}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}