# Goal: end-to-end inference and evaluation.
# Given a CSV, make predictions, evaluate them, and write the results to a CSV.

import pandas as pd
import os
import json

# Read the run configuration from the JSON file.
with open("mode.json", "r") as json_file:
    mode_dict = json.load(json_file)

# Each setting falls back to the literal string "none" when its key is absent.
mode = mode_dict.get("mode", "none")
model_name = mode_dict.get("model", "none")
train_epochs = mode_dict.get("train_epochs", "none")
fold_group = mode_dict.get("fold_group", "none")

print(f"The mode has been set to: {mode} {model_name}")

# Directory in which the checkpoints of this run are stored.
base_dir = f"train_{fold_group}_{model_name}_{mode}_{train_epochs}"

# Sub-directories of base_dir whose name starts with "checkpoint-".
subdirectories = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
checkpoints = [d for d in subdirectories if d.startswith("checkpoint-")]


def _checkpoint_step(dirname):
    """Return the numeric suffix of a "checkpoint-<n>" name, or -1 if it is not numeric."""
    suffix = dirname[len("checkpoint-"):]
    return int(suffix) if suffix.isdecimal() else -1


# Select the checkpoint with the highest number. os.listdir() returns entries
# in arbitrary order, so the first element is not guaranteed to be the latest;
# the suffix is compared as an integer so that e.g. 3840 ranks above 960.
if not checkpoints:
    # There is nothing to evaluate without a checkpoint, so stop here instead
    # of carrying model_checkpoint = None into the following cells.
    raise FileNotFoundError(f"No checkpoints were found in {base_dir!r}.")

latest_checkpoint = max(checkpoints, key=_checkpoint_step)
model_checkpoint = os.path.join(base_dir, latest_checkpoint)
print(f"Using model checkpoint: {model_checkpoint}")

# Load the test split of the selected fold group.
data_path = f"../../data_preprocess/dataset/{fold_group}/test.csv"  # Adjust the CSV file path as necessary

try:
    df = pd.read_csv(data_path)
except UnicodeDecodeError:
    # Retry with a single-byte encoding when the default decoding fails.
    df = pd.read_csv(data_path, encoding='ISO-8859-1')

# Drop rows where 'tag_description' is NaN and reset the index so that the
# frame keeps a contiguous 0..n-1 index.
df = df.dropna(subset=['tag_description']).reset_index(drop=True)

# Keep an untouched copy (original dtypes) for the final export.
df_org = df.copy()

print("Columns in df_org:")
print(df_org.columns.tolist())

# Cast the columns used for input construction and answers to pandas' string dtype.
selected_columns = ['thing', 'property', 'tag_description', 'min', 'max', 'MDM', 'pattern']
df[selected_columns] = df[selected_columns].astype("string")
def process_df(df, mode='only_td'):
    """Convert every row of ``df`` into a ``{'translation': {...}}`` record.

    The model input string is the concatenation (with no separator, as in the
    code this replaces) of the columns selected by ``mode``:

    * ``'only_td'``       -> tag_description
    * ``'tn_td'``         -> tag_name, tag_description
    * ``'tn_td_min_max'`` -> tag_name, tag_description, min, max
    * ``'td_min_max'``    -> tag_description, min, max
    * ``'td_unit'``       -> tag_description, unit
    * ``'tn_td_unit'``    -> tag_name, tag_description, unit

    Args:
        df: DataFrame holding the columns required by ``mode`` plus
            ``ships_idx``, ``thing``, ``property`` and ``MDM``.
        mode: one of the mode names listed above.

    Returns:
        A list with exactly one dict per row of ``df``, in row order. Each dict
        has a single ``'translation'`` key whose value holds ``ships_idx``,
        ``input``, ``thing_property``, ``answer_thing``, ``answer_property``
        and ``MDM``.

    Raises:
        ValueError: if ``mode`` is not a known mode.
        KeyError: if ``df`` lacks a required column.
    """
    input_columns_by_mode = {
        'only_td': ('tag_description',),
        'tn_td': ('tag_name', 'tag_description'),
        'tn_td_min_max': ('tag_name', 'tag_description', 'min', 'max'),
        'td_min_max': ('tag_description', 'min', 'max'),
        'td_unit': ('tag_description', 'unit'),
        'tn_td_unit': ('tag_name', 'tag_description', 'unit'),
    }

    # Validate once, up front. Previously an invalid mode was raised and
    # swallowed inside the per-row loop, printing one error per row and
    # returning an empty list.
    if mode not in input_columns_by_mode:
        raise ValueError("Invalid mode specified")
    input_columns = input_columns_by_mode[mode]

    # A missing column would fail identically on every row; report it once
    # instead of silently dropping rows, which would leave the output shorter
    # than df and break any positional alignment with it.
    required_columns = input_columns + ('ships_idx', 'thing', 'property', 'MDM')
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        raise KeyError(f"DataFrame is missing required columns: {missing_columns}")

    output_list = []
    for _, row in df.iterrows():
        input_str = "".join(f"{row[name]}" for name in input_columns)
        output_list.append({
            'translation': {
                'ships_idx': row['ships_idx'],
                'input': input_str,
                'thing_property': f"{row['thing']}{row['property']}",
                'answer_thing': f"{row['thing']}",
                'answer_property': f"{row['property']}",
                'MDM': f"{row['MDM']}",
            }
        })
    return output_list
from datasets import Dataset
from transformers.pipelines.pt_utils import KeyDataset
from transformers import pipeline
from tqdm import tqdm
import os
from transformers import AutoTokenizer

# Build the evaluation dataset from the prepared DataFrame.
processed_data = process_df(df, mode=mode)
test_dataset = Dataset.from_list(processed_data)
print(f"The test_dataset contains {len(test_dataset)} items.")

tokenizer = AutoTokenizer.from_pretrained(model_name, return_tensors="pt")

# Additional special tokens registered on the tokenizer.
# NOTE(review): every entry is an empty string in this copy of the notebook,
# while the decoding below depends on the ids 32100-32103 - confirm the real
# token strings and their ids against the training setup.
additional_special_tokens = ["", "", "", "", "", "", "", "", "", "", "", "", "", ""]
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

pipe = pipeline("translation_XX_to_YY", model=model_checkpoint, tokenizer=tokenizer, return_tensors=True, max_length=128, device=0)

# Token ids that delimit the two predicted fields in the generated sequence.
THING_START_ID, THING_END_ID = 32100, 32101
PROPERTY_START_ID, PROPERTY_END_ID = 32102, 32103


def extract_seq(tokens, start_value, end_value):
    """Return the tokens strictly between ``start_value`` and the next ``end_value``.

    Args:
        tokens: list of token ids.
        start_value: id marking the start of the span.
        end_value: id marking the end of the span.

    Returns:
        The list of ids between the first ``start_value`` and the first
        ``end_value`` that follows it, or ``None`` when ``start_value`` is
        absent or no ``end_value`` occurs after it.
    """
    try:
        start_id = tokens.index(start_value)
        # Search for the end marker only after the start marker: an end marker
        # that appears earlier in the sequence does not close this span.
        end_id = tokens.index(end_value, start_id + 1)
    except ValueError:
        return None
    return tokens[start_id + 1:end_id]


def process_tensor_output(output):
    """Decode the predicted thing and property from one pipeline output.

    Args:
        output: one pipeline result; ``output[0]['translation_token_ids']``
            is read and converted with ``.tolist()``.

    Returns:
        ``(p_thing, p_property)``; each element is the decoded text of its
        span, or ``None`` when the span's markers were not found.
    """
    tokens = output[0]['translation_token_ids'].tolist()
    thing_seq = extract_seq(tokens, THING_START_ID, THING_END_ID)
    property_seq = extract_seq(tokens, PROPERTY_START_ID, PROPERTY_END_ID)
    p_thing = tokenizer.decode(thing_seq) if thing_seq is not None else None
    p_property = tokenizer.decode(property_seq) if property_seq is not None else None
    return p_thing, p_property


# Run batched inference over the "input" field of every record.
p_thing_list = []
p_property_list = []
print("making inference on test set")
inference_outputs = pipe(KeyDataset(test_dataset["translation"], "input"), batch_size=256)
# Passing total lets tqdm show a real progress bar instead of a bare counter.
for out in tqdm(inference_outputs, total=len(test_dataset)):
    p_thing, p_property = process_tensor_output(out)
    p_thing_list.append(p_thing)
    p_property_list.append(p_property)
print("inference done")
def correctness_test(input, reference, mdm_list):
    """Compare predictions with references, counting only MDM rows.

    Args:
        input: predicted values, one per row.
        reference: expected values, one per row.
        mdm_list: per-row flags; a row is scored only when its flag equals
            the string ``"True"``.

    Returns:
        A list with one entry per row: ``input[i] == reference[i]`` for rows
        flagged ``"True"``, and ``False`` for every other row.

    Raises:
        AssertionError: if the three sequences do not have the same length.
    """
    assert len(input) == len(reference), "input and reference differ in length"
    # A flag list of a different length means the rows are misaligned, which
    # would gate the wrong predictions; reject it instead of indexing blindly.
    assert len(mdm_list) == len(input), "mdm_list and input differ in length"

    correctness_list = []
    for predicted, expected, flag in zip(input, reference, mdm_list):
        if flag == "True":
            correctness_list.append(predicted == expected)
        else:
            correctness_list.append(False)
    return correctness_list
# ---- Evaluation: score the predictions against the reference answers ----
translations = test_dataset["translation"]
answer_thing = [item['answer_thing'] for item in translations]
answer_property = [item['answer_property'] for item in translations]
mdm_list = [item['MDM'] for item in translations]

# Number of rows whose MDM flag is the string "True"; only these are scored.
mdm_count = mdm_list.count("True")

# Per-row correctness (always False for rows that are not flagged as MDM).
thing_correctness = correctness_test(p_thing_list, answer_thing, mdm_list)
property_correctness = correctness_test(p_property_list, answer_property, mdm_list)

# A row is fully correct only when both the thing and the property match.
correctness_mdm = [
    bool(thing_ok and property_ok)
    for thing_ok, property_ok in zip(thing_correctness, property_correctness)
]

# Accuracy is measured over the MDM rows only.
thing_accuracy = sum(thing_correctness) / mdm_count
property_accuracy = sum(property_correctness) / mdm_count
total_accuracy = sum(correctness_mdm) / mdm_count

# Correct / incorrect counts. "Incorrect" is always restricted to MDM rows:
# non-MDM rows are stored as False in the correctness lists, so counting every
# False would report them as wrong predictions (the property counter
# previously did exactly that).
thing_true_count = thing_correctness.count(True)
thing_false_count = mdm_count - thing_true_count
property_true_count = property_correctness.count(True)
property_false_count = mdm_count - property_true_count
total_true_count = correctness_mdm.count(True)
total_false_count = mdm_count - total_true_count

# Print results
print("Thing prediction accuracy:", thing_accuracy)
print(f"Correct thing predictions: {thing_true_count}, Incorrect thing predictions: {thing_false_count}")
print("Property prediction accuracy:", property_accuracy)
print(f"Correct property predictions: {property_true_count}, Incorrect property predictions: {property_false_count}")
print("total accuracy:", total_accuracy)
print(f"Correct total predictions: {total_true_count}, Incorrect total predictions: {total_false_count}")

# Per-row predictions and correctness flags. Built directly from a literal so
# that the builtin name `dict` is not shadowed in the kernel namespace.
df_pred = pd.DataFrame({
    'p_thing': p_thing_list,
    'p_property': p_property_list,
    'p_thing_correct': thing_correctness,
    'p_property_correct': property_correctness
})

# Re-read mode.json, record the model name and epoch count used for this run,
# and write the file back.
with open("mode.json", "r") as json_file:
    mode_dict = json.load(json_file)

mode_dict["model"] = model_name
mode_dict["train_epochs"] = train_epochs

with open("mode.json", "w") as json_file:
    json.dump(mode_dict, json_file)

# Load previously stored results; start from an empty dict when results.json
# is missing, empty, or not valid JSON.
results_dict = {}
if os.path.exists("results.json") and os.path.getsize("results.json") > 0:
    with open("results.json", "r") as json_file:
        try:
            results_dict = json.load(json_file)
        except json.JSONDecodeError:
            results_dict = {}

# Store this run's metrics under the checkpoint path (replaces any earlier
# entry for the same checkpoint).
model_key = model_checkpoint

results_dict[model_key] = {
    "thing_accuracy": thing_accuracy,
    "thing_true": thing_true_count,
    "thing_false": thing_false_count,
    "property_accuracy": property_accuracy,
    "property_true": property_true_count,
    "property_false": property_false_count,
    "total_accuracy": total_accuracy,
    "total_true": total_true_count,
    "total_false": total_false_count
}

# Save the updated dictionary back to the results.json file
with open("results.json", "w") as json_file:
    json.dump(results_dict, json_file, indent=4)
import os

# ---- Export: attach the predictions to the original rows and save them ----

# The columns below are attached to df_org by index alignment, so a length
# mismatch would silently produce NaN / shifted rows instead of an error.
if len(p_thing_list) != len(df_org):
    raise ValueError(
        f"Number of predictions ({len(p_thing_list)}) does not match "
        f"the number of rows in df_org ({len(df_org)})"
    )

# Per-row predictions and correctness flags.
df_pred = pd.DataFrame({
    'p_thing': p_thing_list,
    'p_property': p_property_list,
    'p_thing_correct': thing_correctness,
    'p_property_correct': property_correctness,
})

# Merge predictions with the original DataFrame (df_org)
for column in ('p_thing', 'p_property', 'p_thing_correct', 'p_property_correct'):
    df_org[column] = df_pred[column]
df_org['p_correct'] = df_pred['p_thing_correct'] & df_pred['p_property_correct']

df_master = pd.read_csv('../../data_import/data_model_master_export.csv')


def _digits_to_hash(series):
    """Replace every digit in a string Series with '#'."""
    return series.str.replace(r'\d', '#', regex=True)


# Digit-agnostic "thing property" patterns for the answers and the predictions.
df_org['pattern'] = _digits_to_hash(df_org['thing']) + " " + _digits_to_hash(df_org['property'])
df_org['p_pattern'] = _digits_to_hash(df_org['p_thing']) + " " + _digits_to_hash(df_org['p_property'])
df_master['master_pattern'] = df_master['thing'] + " " + df_master['property']

# Create a set of unique patterns from master for fast lookup; a prediction is
# flagged as MDM when its pattern occurs in the master list.
master_patterns = set(df_master['master_pattern'])
df_org['p_MDM'] = df_org['p_pattern'].apply(lambda x: x in master_patterns)

output_path = f"../0.result/{fold_group}/test_p.csv"
debug_output_path = f"0.dresult/{fold_group}/test_p.csv"

# Create the target folders if they do not exist, then write both copies.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_org.to_csv(output_path, index=False, encoding='utf-8-sig')

os.makedirs(os.path.dirname(debug_output_path), exist_ok=True)
df_org.to_csv(debug_output_path, index=False, encoding='utf-8-sig')

print(f"Updated data saved to {output_path}")