hipom_data_mapping/post_process/tfidf_class/3.refine.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'p_correct'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
      "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: 'p_correct'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[11], line 22\u001b[0m\n\u001b[1;32m     20\u001b[0m \u001b[38;5;66;03m# Assign c_thing, c_property to p_thing, p_property and set p_MDM to True if conditions are met\u001b[39;00m\n\u001b[1;32m     21\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index, row \u001b[38;5;129;01min\u001b[39;00m test_csv\u001b[38;5;241m.\u001b[39miterrows():\n\u001b[0;32m---> 22\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mrow\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mp_correct\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;129;01mand\u001b[39;00m row[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mctp_correct\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m     23\u001b[0m         update_count \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m  \u001b[38;5;66;03m# Increment the counter\u001b[39;00m\n\u001b[1;32m     25\u001b[0m         \u001b[38;5;66;03m# Check for duplicates within the same ships_idx\u001b[39;00m\n",
      "File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/series.py:1121\u001b[0m, in \u001b[0;36mSeries.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   1118\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[key]\n\u001b[1;32m   1120\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m key_is_scalar:\n\u001b[0;32m-> 1121\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_value\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1123\u001b[0m \u001b[38;5;66;03m# Convert generator to list before going through hashable part\u001b[39;00m\n\u001b[1;32m   1124\u001b[0m \u001b[38;5;66;03m# (We will iterate through the generator there to check for slices)\u001b[39;00m\n\u001b[1;32m   1125\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n",
      "File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/series.py:1237\u001b[0m, in \u001b[0;36mSeries._get_value\u001b[0;34m(self, label, takeable)\u001b[0m\n\u001b[1;32m   1234\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[label]\n\u001b[1;32m   1236\u001b[0m \u001b[38;5;66;03m# Similar to Index.get_value, but we do not fall back to positional\u001b[39;00m\n\u001b[0;32m-> 1237\u001b[0m loc \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1239\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(loc):\n\u001b[1;32m   1240\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[loc]\n",
      "File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3807\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m   3808\u001b[0m         \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m   3809\u001b[0m         \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m   3810\u001b[0m     ):\n\u001b[1;32m   3811\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m   3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m   3814\u001b[0m     \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m   3815\u001b[0m     \u001b[38;5;66;03m#  InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m   3816\u001b[0m     \u001b[38;5;66;03m#  the TypeError.\u001b[39;00m\n\u001b[1;32m   3817\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
      "\u001b[0;31mKeyError\u001b[0m: 'p_correct'"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Set the group number\n",
    "group_number = 1  # Change this to the desired group number\n",
    "\n",
    "# Load the CSV files from the specified group\n",
    "sdl_class_rdoc_path = f'0.class_document/{group_number}/sdl_class_rdoc.csv'\n",
    "test_path = f'0.class_document/{group_number}/test_p_c.csv'\n",
    "\n",
    "sdl_class_rdoc_csv = pd.read_csv(sdl_class_rdoc_path, low_memory=False)\n",
    "test_csv = pd.read_csv(test_path, low_memory=False)\n",
    "\n",
    "update_count = 0\n",
    "duplicate_count = 0\n",
    "non_duplicate_count = 0\n",
    "\n",
    "# Assign c_thing, c_property to p_thing, p_property and set p_MDM to True if conditions are met\n",
    "for index, row in test_csv.iterrows():\n",
    "    if not row['p_correct'] and row['ctp_correct']:\n",
    "        update_count += 1  # Increment the counter\n",
    "\n",
    "        # Check for duplicates within the same ships_idx\n",
    "        same_idx_rows = test_csv[(test_csv['ships_idx'] == row['ships_idx']) &\n",
    "                                 (test_csv['p_thing'] == row['c_thing']) &\n",
    "                                 (test_csv['p_property'] == row['c_property'])]\n",
    "\n",
    "        if len(same_idx_rows) > 0:\n",
    "            duplicate_count += 1\n",
    "        else:\n",
    "            non_duplicate_count += 1\n",
    "\n",
    "# Print the results\n",
    "print(f\"Total updates where p_correct is False and ctp_correct is True: {update_count}\")\n",
    "print(f\"Number of rows with duplicates in the same ships_idx: {duplicate_count}\")\n",
    "print(f\"Number of rows without duplicates in the same ships_idx: {non_duplicate_count}\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of updates made: 45\n",
      "Updated test CSV saved to 0.class_document/1/test_p_c_r.csv\n",
      "Refine CSV saved to refine.csv\n"
     ]
    }
   ],
   "source": [
    "update_count = 0\n",
    "\n",
    "# Initialize a list to hold rows that meet the conditions\n",
    "refine_rows = []\n",
    "\n",
    "# Assign c_thing, c_property to p_thing, p_property and set p_MDM to True if conditions are met\n",
    "for index, row in test_csv.iterrows():\n",
    "    if (not row['p_MDM'] and row['c_score'] >= 0.9 and \n",
    "        (row['p_thing'] != row['c_thing'] or row['p_property'] != row['c_property'])):\n",
    "        test_csv.at[index, 'p_thing'] = row['c_thing']\n",
    "        test_csv.at[index, 'p_property'] = row['c_property']\n",
    "        test_csv.at[index, 'p_MDM'] = True\n",
    "        update_count += 1  # Increment the counter\n",
    "        refine_rows.append(row)  # Add the row to the refine list\n",
    "\n",
    "# Convert the list of refine rows into a DataFrame\n",
    "refine_df = pd.DataFrame(refine_rows)\n",
    "\n",
    "# Save the refine DataFrame to a CSV file\n",
    "refine_output_path = f'refine.csv'\n",
    "refine_df.to_csv(refine_output_path, index=False, encoding='utf-8-sig')\n",
    "\n",
    "# Print the number of updates made\n",
    "print(f\"Number of updates made: {update_count}\")\n",
    "\n",
    "# Save the updated test CSV\n",
    "output_file_path = f'0.class_document/{group_number}/test_p_c_r.csv'\n",
    "test_csv.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n",
    "    \n",
    "print(f\"Updated test CSV saved to {output_file_path}\")\n",
    "print(f\"Refine CSV saved to {refine_output_path}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}