hipom_data_mapping/post_process/tfidf_class/5.analysis.ipynb

106 lines
8.3 KiB
Plaintext
Raw Normal View History

2024-09-25 08:52:30 +09:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'p_map'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
"File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 'p_map'",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 21\u001b[0m\n\u001b[1;32m 18\u001b[0m combined_data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat(all_data, ignore_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 20\u001b[0m \u001b[38;5;66;03m# -1인 s_score 값을 제외하고 s_thing이 null이 아닌 데이터만 필터링\u001b[39;00m\n\u001b[0;32m---> 21\u001b[0m filtered_data \u001b[38;5;241m=\u001b[39m combined_data[(combined_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124ms_thing\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mnotna() \u001b[38;5;241m&\u001b[39m (\u001b[43mcombined_data\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mp_map\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m))]\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m# s_correct가 True인 경우와 False인 경우로 나눔\u001b[39;00m\n\u001b[1;32m 24\u001b[0m true_data \u001b[38;5;241m=\u001b[39m filtered_data[filtered_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124ms_correct\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m]\n",
"File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n",
"File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
"\u001b[0;31mKeyError\u001b[0m: 'p_map'"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"# 그룹 번호 목록 설정\n",
"group_numbers = [1]\n",
"\n",
"# 데이터를 저장할 리스트 초기화\n",
"all_data = []\n",
"\n",
"# 각 그룹의 데이터를 읽어서 합침\n",
"for group_number in group_numbers:\n",
" file_path = f'../0.result/tfidf/{group_number}/test_s.csv'\n",
" data = pd.read_csv(file_path)\n",
" all_data.append(data)\n",
"\n",
"# 모든 그룹 데이터를 하나의 DataFrame으로 합침\n",
"combined_data = pd.concat(all_data, ignore_index=True)\n",
"\n",
"# -1인 s_score 값을 제외하고 s_thing이 null이 아닌 데이터만 필터링\n",
"filtered_data = combined_data[(combined_data['s_thing'].notna() & (combined_data['p_map'] > 0))]\n",
"\n",
"# s_correct가 True인 경우와 False인 경우로 나눔\n",
"true_data = filtered_data[filtered_data['s_correct'] == True]\n",
"false_data = filtered_data[filtered_data['s_correct'] == False]\n",
"\n",
"# 공통된 bins 설정\n",
"bins = np.linspace(0, 1, 31) # 0부터 1까지 30개의 구간으로 나눔\n",
"\n",
"# 히스토그램 그리기\n",
"plt.figure(figsize=(14, 7))\n",
"\n",
"# s_correct가 True인 경우\n",
"plt.hist(true_data['s_score'], bins=bins, color='green', edgecolor='black', alpha=0.5, label='s_correct=True')\n",
"\n",
"# s_correct가 False인 경우\n",
"plt.hist(false_data['s_score'], bins=bins, color='red', edgecolor='black', alpha=0.5, label='s_correct=False')\n",
"\n",
"# 그래프 제목과 라벨 설정\n",
"plt.title('Distribution of s_score by s_correct (s_thing is not null)', fontsize=20)\n",
"plt.xlabel('s_score', fontsize=16)\n",
"plt.ylabel('Frequency', fontsize=16)\n",
"plt.xticks(fontsize=14)\n",
"plt.yticks(fontsize=14)\n",
"\n",
"# 범례 추가\n",
"plt.legend(fontsize=14)\n",
"\n",
"# 그래프 출력\n",
"plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}