hipom_data_mapping/post_process/tfidf_class/2z.plot_distribution.ipynb

269 lines
160 KiB
Plaintext
Raw Normal View History

2024-09-25 08:52:30 +09:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKAAAAGACAYAAACazRotAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAACEzElEQVR4nOzde1xUdf7H8ddwUy6iCHgjFdfAuxXqlma5mmWZlWJlW1m6bpRWm5vbrps/y2q9tLtZluWlNa3MsvWSZSZewworUUoR74U3QAFFQYbbML8/aGYZuQgMw3jo/Xw8ejic8/2e7+ecvgzDh+/FZLVarYiIiIiIiIiIiLiIh7sDEBERERERERGRhk0JKBERERERERERcSkloERERERERERExKWUgBIREREREREREZdSAkpERERERERERFxKCSgREREREREREXEpJaBERERERERERMSllIASERERERERERGX8nJ3AFJziYmJWK1WvL293R2KiIiIiIiIiPxKFRUVYTKZuOaaay5ZViOgDMhqtWK1Wt0dRq1ZrVYKCwsNfQ8ilVH/loZOfVwaMvVvacjUv6WhUx93j5rkJzQCyoBsI5969Ojh5khqJy8vj3379nHllVfi5+fn7nBE6pT6tzR06uPSkKl/S0Om/i0Nnfq4e+zZs6faZTUCSkREREREREREXMowI6BOnDjBTTfdVK2yS5cupU+fPg7H4uPjWbx4Mbt378ZsNtOmTRuGDBlCTEwM/v7+lV7r6NGjzJs3j/j4eM6cOUPz5s3p168fjz/+OG3btq20Xm5uLgsXLiQ2Npa0tDR8fX256qqrGDt2LH379q3eTYuIiIiIiIiINACGSUD5+fkxYsSISs8fPnyYPXv24O/vT7du3RzOLVmyhJkzZ2IymejduzfBwcHs3LmT+fPnExsby7Jly2jevHm5a+7cuZNx48ZhNpuJiIigV69eHDp0iNWrVxMbG8vixYu5+uqry9XLysri/vvvJyUlhdDQUAYOHEhWVhbbtm1j27ZtTJkyhdGjRzv9TEREREREREREjMAwCajmzZsza9asSs8/8sgjANx+++0O8z2Tk5OZNWsWnp6ezJs3jwEDBgBgNpsZP34827dvZ9q0abz++usO1zObzUycOBGz2cyjjz7K008/bT83e/ZsFixYwMSJE1m/fj2NGzd2qDt16lRSUlLo27cv8+bNw9fXF4C4uDjGjx/PjBkz6NOnD507d3buoYiIiIiIiIiIGECDWAPq1KlTfP311wDcfffdDucWLFiA1WolOjrannwC8PX1Zfr06Xh4eBAbG8uRI0cc6q1atYrTp08THh7OxIkTHc5NnDiR8PBw0tLS+OSTTxzOHT58mM2bN+Pp6cn06dPtySeAAQMGMGLECEpKSli4cGEd3LmIiIiIiIiIyOWvQSSgVq1aRUlJCREREVx11VX244WFhcTFxQEwbNiwcvXCwsKIiooCYNOmTQ7nbF/ffvvteHg4PiYPDw+GDh0KwMaNGx3O2b6OiooiLCysXJu2OLZu3UpRUVH1b1JERERERERExKAaRAJq9erVQPnRTykpKZjNZgC6d+9eYV3b8eTkZIfjtq9rWm/fvn1V1uvRowdQukXk0aNHKywjIiIiIiIiItKQGD4B9f3333P06FG8vb258847Hc6dOHECgMDAQAICAiqs37p1a4eyULqDXXZ2NgBt2rSpst6ZM2fIy8sr16bt/MUCAgLssZRtU0RERERERESkoTLMIuSVWblyJQCDBg0qt5PdhQsXABzWYbqYbcHy3NzccvWqqlt2ofPc3Fz717a6Zc9XVDc3N9ehzZqyWq0OiS8jsY1Ks/0r0pCof0tDpz4uDZn6tzRk6t/S0KmPu4fVasVkMlWrrKETULm5ucTGxgIwcuRIN0dTv4qKiuzT/YwqJSXF3SGIuIz6tzR06uPSkKl/S0Om/i0Nnfp4/fPx8alWOUMnoD7//HPMZjOtWrXihh
tuKHfe398fqDoDahtFVHaKnq1eVXXLjj6qqG5Vo5MqarOmvL29ufLKK2td353MZjMpKSmEh4dXOTpNxIjUv6WhUx+Xhkz9Wxoy9W9p6NTH3ePw4cPVLmvoBJRt+t2IESPK7VQH2HehO3/+PLm5uRUmfNLS0hzKQmliqFmzZmRnZ5Oamkrnzp0rrRcUFOQw3S4sLIy9e/faz1+s7NS7inbJqy6TyVTlNL/qsFqtFBUVUVJS4tR1asr2/8rDw6PC/28iRmbr040bN3b6e1Tkcubr66s+LlU6duwYmZmZ7g6jRsxmM9nZ2XTp0kX9WxosvX9LQ6c+Xr+qO/0ODJyAOnz4MD/++CMmk6nS6XcdOnTA19cXs9lMUlIS1113XbkySUlJAHTr1s3heNeuXYmPjycpKYlBgwbVqN6GDRvs5y+2Z88eoHQdqPDw8Kpv0kUsFguZmZnk5ORQVFRU7+2XlJTg5eVFamqqElDS4JSUlODp6cmxY8cICAggMDCQpk2bujssEZF6dezYMbp07kyeAdfhaNyoET/8+COdOnVydygiIiINimETUCtWrADg2muvpW3bthWW8fHxYcCAAaxfv561a9eWS0CdPHmSxMREAAYPHuxwbvDgwcTHx/P555/zxBNPOCRKSkpKWLduHQA333xzuXqvvfYau3btIjU1tdwuemvXrgVg4MCBeHt71/S2nWaxWDh+/DgFBQU0bdqUgIAAPD09a5S1rIsYCgoKaNSoEZ6envXWrkh9sFgsmM1m+0YBqampmM1mWrZsWa/fZyIi7pSZmUme2czSESPoEhrq7nCqbU96OmM+/ZTMzEwloEREROqYIRNQRUVFfPrppwDcfffdVZaNiYkhNjaWVatWccstt3DjjTcCpUOsp0yZgsViYciQIXTs2NGhXnR0NPPnzyclJYU5c+bw5z//2X5uzpw5pKSk0KpVK4YPH+5QLyIigptuuonNmzczZcoU5s2bR+PGjQGIi4tj9erVeHh4EBMT4+xjqJXMzEwKCgpo166d2+bFWiwWoHSKkhJQ0tCU7d+hoaGcPXuW9PR0fHx8yu3UKSLS0HUJDSWqdWt3h1FtxcXF7g5BRESkwTJkAurLL78kKyuLwMBAbrnllirLduvWjcmTJzNz5kxiYmLo06cPwcHBJCQkkJGRQYcOHZg2bVq5er6+vrz22muMGzeO+fPns2XLFiIiIjh06BAHDx7Ez8+POXPm2JNLZb300kscOXKE+Ph4Bg8eTO/evcnKymLHjh1YrVamTJlS4bpSrma1WsnJyaFp06ZalE2kngQFBXHhwgWys7MJCgrSKCgREREREflVMuQCPLbFx4cNG0ajRo0uWX7MmDEsXryY/v37c/DgQTZv3oy/vz+PPvooK1asqHRUQq9evVizZg3Dhw8nOzubDRs2kJ2dzfDhw1mzZg1XX311hfWCg4NZuXIlMTEx+Pv7s3nzZg4ePEj//v1ZsmQJDz30UK3v3RlFRUUUFRU5tfueiNRc06ZNKSgo0F/WRURERETkV8uQI6Dmz59f4zr9+vWjX79+Na7Xvn17Xn755RrXCwgIYNKkSUyaNKnGdV3Fttudpr2J1C8vr9K3WovF4pa130RERERERNzNkAkocY6mAInUL33PiYiIiIi4Xnp6Ovn5+YZbciYkJIR27dq5OwyXUwJKRERERERERAzt+PHjjLx7JAX5Be4OpcZ8/XzZv29/g09CKQElIiIiIiIiIoaWmZlJQX4Bd/7tTlr9ppW7w6m2jKMZrJ6xmszMTCWgRERERERERESMIKRdCK0jW7s7DKmAIXfBExERERERERER41ACSkREREREREREXEpT8ETqSWFhIRs2bGDbtm3s3r2bs2fPkpubS0BAAGFhYfTo0YMhQ4Zw3XXX4eGh3LCIiIiIiIg0HEpASaWOHTtGZmZmnV/XYrFQWFiIj48Pnp6edX796qjvbS43bNjArFmzOHnyZLlz2dnZZGdns3fvXj766CPCw8
P5+9//zu9+97t6i8/IOnXqBMATTzzBk08+6eZoREREREREpCJKQEmFjh07RucunTHnmd0dikvU5zaXb775Jq+//rr
"text/plain": [
"<Figure size 1200x400 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np\n",
"\n",
"# Score distribution of the KNN TF-IDF classifier: overlaid histograms of\n",
"# `c_score`, split by the value of `ctp_correct`, pooled over all groups.\n",
"\n",
"# Sub-directory of 0.class_document/ that holds this model's per-group CSVs.\n",
"MODEL_DIR = 'knn_tfidf'\n",
"\n",
"# Group numbers 1 through 5.\n",
"group_numbers = range(1, 6)\n",
"\n",
"# Read every group's CSV and concatenate once; a single concat avoids\n",
"# re-copying the accumulated frame on each loop iteration.\n",
"combined_data = pd.concat(\n",
"    [\n",
"        pd.read_csv(f'0.class_document/{MODEL_DIR}/{group_number}/test_p_c.csv')\n",
"        for group_number in group_numbers\n",
"    ],\n",
"    ignore_index=True,\n",
")\n",
"\n",
"# NOTE(review): the earlier comment here said rows with p_MDM == True are\n",
"# selected, but no filter is applied - every row is used. Confirm the intent.\n",
"filtered_data = combined_data\n",
"\n",
"# c_score values for the rows where ctp_correct is True / False.\n",
"is_correct = filtered_data['ctp_correct']\n",
"correct_scores = filtered_data.loc[is_correct.eq(True), 'c_score']\n",
"incorrect_scores = filtered_data.loc[is_correct.eq(False), 'c_score']\n",
"\n",
"# Plot style.\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"# Bin edges 0.00, 0.05, ..., 1.05 (22 edges, width 0.05). These are the same\n",
"# edges np.arange(0, 1.1, 0.05) produced; linspace fixes the edge count\n",
"# explicitly instead of relying on floating-point rounding of the stop value.\n",
"# Note the top edge is 1.05, so there is one bin above 1.0.\n",
"bins = np.linspace(0, 1.05, 22)\n",
"\n",
"# Draw both distributions on one axes; alpha < 1 keeps the overlap visible.\n",
"# Correct: green, Incorrect: red, both with a black bar outline.\n",
"fig, ax = plt.subplots(figsize=(12, 4))\n",
"sns.histplot(correct_scores, bins=bins, kde=False, color='green', alpha=0.5, label='Correct', edgecolor='black', ax=ax)\n",
"sns.histplot(incorrect_scores, bins=bins, kde=False, color='red', alpha=0.5, label='Incorrect', edgecolor='black', ax=ax)\n",
"\n",
"# Axis labels and tick label sizes.\n",
"ax.set_xlabel('Score', fontsize=18)\n",
"ax.set_ylabel('Frequency', fontsize=18)\n",
"ax.tick_params(axis='both', labelsize=16)\n",
"\n",
"# Large legend in the upper-left corner.\n",
"ax.legend(fontsize=20, loc='upper left')\n",
"\n",
"# Tighten the layout and render.\n",
"fig.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKAAAAGACAYAAACazRotAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAB2IUlEQVR4nO3de1jUZf7/8dcMBwEBRcATaboKisdE3dRM10PampbSlq5l2Vq0miW/tXYr17JaD+1Wamt5aM1Ks2w9ZJmFx7TCTNRUwHPiGQUUFRlggPn94c58mTgIDsMw+HxcV1cw931/7vfn0z3EvLkPBovFYhEAAAAAAADgJEZXBwAAAAAAAICajQQUAAAAAAAAnIoEFAAAAAAAAJyKBBQAAAAAAACcigQUAAAAAAAAnIoEFAAAAAAAAJyKBBQAAAAAAACcigQUAAAAAAAAnMrT1QGg6uzevVsWi0VeXl6uDgUAAAAAALg5s9ksg8GgTp06XbcuM6BuIhaLRRaLxdVh3DCLxaK8vDy3vgegKMY0aiLGNWoixjVqIsY1aiLGddWrSJ6BGVA3EevMp/bt27s4khuTnZ2t/fv3q2XLlvLz83N1OIDDGNOoiRjXqIkY16iJGNeoiRjXVW/fvn3lrssMKAAAAAAAADgVCSgAAAAAAAA4FQkoAAAAAAAAOBUJKAAAAAAAADgVCSgAAAAAAAA4FQkoAAAAAAAAOBUJKAAAAAAAADgVCSgAAAAAAAA4FQkoAAAAAAAAOBUJKAAAAAAAADgVCSgAAAAAAAA4laerA4D7sVgsMpvNKiwsrNJ+c3Nzbf82Gsmdwv39ekwbjUZ5enoyvgEAAOByJ06cUHp6uqvDqBCTyaTMzExFRka6OhSUgAQUyq2goEDp6em6cuWKzGZzlfdfWFgoT09PnTlzhg/oqBFKGtNGo1F+fn4KDAxUnTp1XBwhAAAAbkYnTpxQZOvWyjaZXB1KhfnUqqWf9+xRq1atXB0KfoUEFMqloKBAJ0+eVG5ururUqSN/f395eHjIYDBUaQy5ubmqVauWPDw8qqxfwFmKjmmj0ajCwkLl5OQoKytLZ86ckclkUoMGDar0fQYAAACkp6cr22TSkmHDFBka6upwym1faqpGf/GF0tPTSUBVQySgUC7p6enKzc1V06ZN5evr65IYCgoKJEk+Pj4koFAjlDSma9eureDgYF28eFGpqany9vZWvXr1XBkmAAAAblKRoaGKatTI1WGUW35+vqtDQBlYx4TrslgsunLliurUqeOy5BNwswkKClJAQIAyMzNlsVhcHQ4AAAAAOIQEFK7LbDbLbDbL39/f1aEAN5U6deooNzeXv+QAAAAAcHskoHBd1tPuWPYGVC1Pz2urpK1L9QAAAADAXZGAQrmxETJQtXjPAQAAAKgpSEABAAAAAADAqUhAAQAAAAAAwKlIQAEAAAAAAMCpSEABAAAAAADAqUhAAQAAAAAAwKk8XR0AgLLl5eVp3bp12rp1q/bu3auLFy8qKytL/v7+CgsLU/v27TVw4EB169ZNRiM5ZQAAAABA9UMCCpXuxIkTSk9Pr/TrFhQUKC8vT97e3vLw8Kj065dHSEiImjZtWmX9rVu3TjNmzNDp06eLlWVmZiozM1NJSUn69NNP1axZM73wwgv63e9+V2XxubNWrVpJksaPH6+nn37axdEAAAAAQM3mNgmoX375RT/88IOSkpKUlJSko0ePqqCgQBMmTNC4cePKbBsfH69FixZp7969MplMaty4sQYOHKiYmBjVrl271HbHjx/X3LlzFR8frwsXLqhevXrq0aOHnnrqKTVp0qTUdllZWVqwYIHi4uJ09uxZ+fr6qmPHjnrsscfUvXv3UtsVFhbqs88+04oVK3TkyBFJUsuWLfWHP/xBDz74oAwGw3WekuudOHFCrSNby5RtcnUoTuHr56sD+w9USRLqnXfe0dtvv237/o477lDfvn3VokULBQYG6tKlSzp27J
g2bdqk+Ph4paSkaObMmSSgAAAAAADVjtskoD755BN99NFHFW73wQcfaPr06TIYDOrSpYuCg4O1c+dOzZs3T3FxcVq6dKnq1atXrN3OnTs1ZswYmUwmhYeHq3Pnzjp8+LBWrVqluLg4LVq0SLfddluxdhkZGRo5cqRSUlIUGhqqPn36KCMjQ1u3btXWrVs1adIkjRo1qli7goICxcbGat26dfL19VW3bt0kSdu2bdNLL72k+Ph4zZw5s9ovsUpPT5cp26RhLw5T6K2hlXrtwsJCFRQUyMPDwyXPIe14mlZNW6X09HSnJ6BWrFhhSz4FBwdr1qxZ+u1vf1usXo8ePfTQQw/p0KFDmj59ui5cuODUuAAAAAAAuBFuk4CKiIjQn/70J7Vp00Zt2rTR/PnztXr16jLbJCcna8aMGfLw8NDcuXPVu3dvSZLJZNLYsWO1bds2TZkyxW6WibU8NjZWJpNJTz75pP7yl7/Yyt566y3Nnz9fsbGx+uabb+Tj42PXdvLkyUpJSVH37t01d+5c+fr6SpK2bNmisWPHatq0aeratatat25t127x4sVat26dGjRooI8//tg2w+rkyZMaOXKkvvnmG3Xt2lUPP/zwjT3AKhZ6a6gaRTSq1GsWFhYqPz9fnp6e1T4R54hz587ptddekyT5+flp8eLFatGiRZltIiIitHDhQn355ZdVESIAAAAAABXiNp/iH3jgAf3tb3/TkCFD1KJFi3IlIObPny+LxaLo6Ghb8kmSfH19NXXqVBmNRsXFxeno0aN27VauXKnz58+rWbNmio2NtSuLjY1Vs2bNdPbsWX3++ed2ZUeOHNHGjRvl4eGhqVOn2pJPktS7d28NGzZMhYWFWrBggV27wsJC/ec//5EkPfvss3bL+5o0aaJnn33Wdj+FhYXXvW+4tw8++EAm07UljM8888x1k09WRqNR9913X4llCQkJeu6559S3b1+1b99eXbp00dChQzVz5swyZ01t375drVq1UqtWrbR9+3YVFhZq+fLlGjVqlHr06KHWrVvr+eefr3DdopKSkvTSSy9p4MCB6tSpk2677TYNHDhQL7/8so4dO1auez906JBee+01DRkyRF27dlXbtm11xx13aPTo0Xrvvfd0/vx5W92+ffva9n+SpDlz5tjitv5TUpwAAAAAgBvnNjOgKiovL09btmyRJA0ePLhYeVhYmKKiopSQkKANGzbYfcjfsGGDJOmee+4plugyGo0aNGiQ3n33Xa1fv14jRoywla1fv16SFBUVpbCwsGJ9Dh48WMuXL9fmzZtlNpvl5eUlSdq9e7fS0tLk7e2tgQMHFms3cOBATZo0SefPn9eePXvUqVOnij4OuAmLxaJVq1ZJujb76YEHHnDoeoWFhfrHP/6hjz/+2O71vLw87d+/X/v379fHH3+s2bNn64477ijzWrm5uRozZozi4+Ov22956hYWFur111/Xhx9+KIvFYleWkpKilJQULV++XC+99JKGDx9e4jUKCgr0z3/+s8RrpKenKz09Xdu2bdPRo0c1Y8aM68YNAAAAAHCOGpuASklJsc0iadeuXYl12rVrp4SEBCUnJ9u9bv2+rHZF61nt37+/zHbt27eXJGVnZ+v48eNq2bKlXbvw8HDVqlWrWDsfHx+Fh4crOTlZycnJJKBqsMOHD+vixYuSpM6dO8vf39+h673xxhu25NMtt9yiJ554Qm3atJHJZNKmTZv08ccf68qVK3ryySe1fPnyYktDf32tgwcPqm/fvoqOjlbjxo2Vnp6uq1ev3lDd1157TUuXLpUkde3aVcOGDVOTJk3k4+OjgwcP6sMPP9Thw4f10ksvKSQkRP369SvWz+TJk7VixQpJUmhoqB5++GF16tRJAQEBunDhgvbu3au4uDi7NgsXLpTZbNaQIUMkSX/84x81cuRIuzp16tQpz+MFAAAAAJRTjU1AnTp1SpIUGBhY6of4Ro0a2dWVrp1gl5mZKUlq3Lhxme0uXLig7O
xs+fn52V3HWv5r/v7+8vf3V1ZWlk6dOmVLQF2vnSQ1bNhQycnJdrGi5jlw4IDt67Zt2zp0rYMHD2rRokWSru0R9fH
"text/plain": [
"<Figure size 1200x400 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np\n",
"\n",
"# Score distribution of the DistilBERT classifier: overlaid histograms of\n",
"# `c_score`, split by the value of `ctp_correct`, pooled over all groups.\n",
"\n",
"# Sub-directory of 0.class_document/ that holds this model's per-group CSVs.\n",
"MODEL_DIR = 'distilbert'\n",
"\n",
"# Group numbers 1 through 5.\n",
"group_numbers = range(1, 6)\n",
"\n",
"# Read every group's CSV and concatenate once; a single concat avoids\n",
"# re-copying the accumulated frame on each loop iteration.\n",
"combined_data = pd.concat(\n",
"    [\n",
"        pd.read_csv(f'0.class_document/{MODEL_DIR}/{group_number}/test_p_c.csv')\n",
"        for group_number in group_numbers\n",
"    ],\n",
"    ignore_index=True,\n",
")\n",
"\n",
"# NOTE(review): the earlier comment here said rows with p_MDM == True are\n",
"# selected, but no filter is applied - every row is used. Confirm the intent.\n",
"filtered_data = combined_data\n",
"\n",
"# c_score values for the rows where ctp_correct is True / False.\n",
"is_correct = filtered_data['ctp_correct']\n",
"correct_scores = filtered_data.loc[is_correct.eq(True), 'c_score']\n",
"incorrect_scores = filtered_data.loc[is_correct.eq(False), 'c_score']\n",
"\n",
"# Plot style.\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"# Bin edges 0.00, 0.05, ..., 1.05 (22 edges, width 0.05). These are the same\n",
"# edges np.arange(0, 1.1, 0.05) produced; linspace fixes the edge count\n",
"# explicitly instead of relying on floating-point rounding of the stop value.\n",
"# Note the top edge is 1.05, so there is one bin above 1.0.\n",
"bins = np.linspace(0, 1.05, 22)\n",
"\n",
"# Draw both distributions on one axes; alpha < 1 keeps the overlap visible.\n",
"# Correct: green, Incorrect: red, both with a black bar outline.\n",
"fig, ax = plt.subplots(figsize=(12, 4))\n",
"sns.histplot(correct_scores, bins=bins, kde=False, color='green', alpha=0.5, label='Correct', edgecolor='black', ax=ax)\n",
"sns.histplot(incorrect_scores, bins=bins, kde=False, color='red', alpha=0.5, label='Incorrect', edgecolor='black', ax=ax)\n",
"\n",
"# Axis labels and tick label sizes.\n",
"ax.set_xlabel('Score', fontsize=18)\n",
"ax.set_ylabel('Frequency', fontsize=18)\n",
"ax.tick_params(axis='both', labelsize=16)\n",
"\n",
"# Large legend in the upper-left corner.\n",
"ax.legend(fontsize=20, loc='upper left')\n",
"\n",
"# Tighten the layout and render.\n",
"fig.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKAAAAKsCAYAAADbS8X9AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAADEsklEQVR4nOzdeXxU9b3/8fckM1kmG2SBRCCigiCLS8CKVqUirYhglatSFST1VyhSWuxtrdrSIrWo9161Uq1FtFYBUdsr7qKCIlUBlUUJq4hECQRIJntmn8zvD25iYvbMnBwm83o+Hj6Ic77nez5zvpnMmfec8z2WYDAYFAAAAAAAAGCQGLMLAAAAAAAAQM9GAAUAAAAAAABDEUABAAAAAADAUARQAAAAAAAAMBQBFAAAAAAAAAxFAAUAAAAAAABDEUABAAAAAADAUARQAAAAAAAAMJTV7AJ6sm3btikYDMpms5ldCgAAAAAAiHA+n08Wi0XnnHOO2aV0GmdAGSgYDCoYDJpdRo8RDAbl9XrZpyZiDMzHGJiL/W8+xsBc7H/zMQbmYwzMxf43H2NgrkjOGTgDykD1Zz6NHDnS5Ep6BqfTqd27d2vQoEGy2+1mlxOVGAPzMQbmYv+bjzEwF/vffIyB+RgDc7H/zccYmGv79u2yWCxml9ElnAEFAAAAAAAAQxFAAQAAAAAAwFAEUAAAAAAAADAUARQAAAAAAAAMRQAFAAAAAAAAQxFAAQAAAAAAwFAEUAAAAAAAADAUARQAAAAAAAAMRQAFAAAAAAAAQxFAAQAAAAAAwFAEUAAAAAAAADAUARQAAAAAAAAMRQAFAAAAAAAAQxFAAQAAAAAAwFAEUAAAAAAAADAUARQAAAAAAAAMRQAFAAAAAAAAQ1nNLqCjvvzyS3344YfauXOndu7cqf379ysQCGjevHmaM2dOl/vdsGGD/vGPf2j79u1yuVw66aSTdNlll2nWrFlKSkoK4zMAAAAAAACIThETQD377LNatmxZWPt86qmndO+998pisWj06NHKyMjQli1btGTJEr311ltauXKl0tPTw7pNAAAAAACAaBMxAdTpp5+um2++WcOGDdOwYcP02GOP6eWXX+5yf7t27dJ9992n2NhY/e1vf9PYsWMlSS6XS7fccos2btyou+66S3/5y1/C9RQAAAAAAACiUsQEUNdee22T/4+JCW36qscee0zBYFBTpkxpCJ8kKTExUYsWLdL48eP11ltvaf/+/TrttNNC2hYAAAAAAEA0i8pJyL1er9avXy9JmjRpUrPl/fr1U15eniRp7dq13VobAAAAAABATxOVAVRhYaFcLpckacSIES22qX98165d3VYXAAAAAABATxSVAVRRUZEkKTU1VcnJyS22ycnJadIWAAAAAAAAXRMxc0CFU21traTj8z21xm63S5JqampC2lYwGJTT6Qypj+5UU1Mjt9ttdhktcrvdqqioUFFRkRISErp12w6HI+TfhZ7A4/GouLhY5eXlio+PN7ucqMQYmIv9bz7GwFwej0eFhYXsfxMZ/RqIj49vOA5Gy9xutw4dOqRgMNjtx6Rg/9dLSUlRVlaWKduuv5qo/l90r2AwKIvFYnYZXRKVAVR38vl82r17t9lldIjT6dS7q1ZJlZVml3JCcbvdeu+D9+Txe8wuBQAAU7l9ATlqapSdmqLY2Kg8kb7Hq4mNlfr0UgzjC5zQ0hLTtOj3i9S7d2/TaigsLDRt29EuLi7O7BK6JCoDqKSkJEltJ7b1Zy21doleR9lsNg0aNCikPrpLaWmpMmw2jRswQGltnB1mFp/Xq/LycvXu3Vu2bnzBlTkcClol6+lJstlt3bbdE1FdXVB+n09Wm00xMZGZukc6xsBc7H/zMQbm+vzrKm3aW6eLB1qV2TfV7HKikpGvAZcvoA99dQpcnClbUnQf87Slrq5ObpdbCYkJId+ZG53H/pfcZW55/u1Rdna2Tj311G7fvsvlUmFhoQ
YOHNjmVUUwxr59+8wuocuiMoDq16+fJKmqqko1NTUthkzFxcVN2naVxWKJmNOY7Xa74mw29UlLU2aIwZsRPF6vLB6PstPTFd+NAZQtEFCy1arEdLviUyMzaQ6Xuro6eTwexcfHR+0bvtkYA3Ox/83HGJgrqfz4ZfrJafHK6JNkcjXRycjXQI3br7gan6x9UxQX5cc8bQkEArJUW5SckqzY2Fizy4k67H8pNjZWvhifEhMTTf2safb2o1WkXn4nRekk5KecckpDUrtjx44W29Q/Pnz48G6rCwAAAAAAoCeKygAqLi5OY8eOlSS99tprzZYfOnRI27ZtkySNHz++W2sDAAAAAADoaXp0ALVixQpNmDBBv/nNb5otmzVrliwWi1atWqV///vfDY+7XC797ne/UyAQ0GWXXabTTjutO0sGAAAAAADocSJmDqidO3dq4cKFDf//9ddfS5Kef/55vffeew2PP/LII+rTp48kqby8XAcOHGjx9pTDhw/XHXfcoXvvvVezZs3Sueeeq4yMDG3evFklJSU65ZRTdNdddxn6nAAAAAAAAKJBxARQNTU1+uyzz5o9fuTIER05cqTh/71eb4f7zM/P1+mnn64nn3xSBQUFcjqdOumkkzRlyhTNmjUr5DvgAQAAAAAAIIICqPPOO0979+7t1Do///nP9fOf/7zNNhdccIEuuOCCUEoDAAAAAABAG3r0HFAAAAAAAAAwHwEUAAAAAAAADEUABQAAAAAAAEMRQAEAAAAAAMBQBFAAAAAAAAAwFAEUAAAAAAAADEUABQAAAAAAAEMRQAEAAAAAAMBQBFAAAAAAAAAwFAEUAAAAAAAADEUABQAAAAAAAEMRQAEAAAAAAMBQBFAAAAAAAAAwFAEUAAAAAAAADEUABQAAAAAAAEMRQAEAAAAAAMBQBFAAAAAAAAAwFAEUAAAAAAAADGU1uwCceCqcTrNLaJHH61WZ0ylrTY3i4+K6bbuO2lrVBgLylLtk8/i7bbsnomAwKK/Hq7j4OlksFrPLiUqMgbnY/+FT569TMBDs9HrBYFBer1dxcT7GoBsE6upUVxdo+P9Kh1PBYFAVJU5Z6tpfPyYmVrEx5n3faYm1KMbas75vNfLvkMtXJ5+vTp6SWvlcvrD23ZMEAgG5al2yOC2KjY01u5yow/6XXA6X2SUAXUIAhQYJCQmyZmToXYdDcp14f9S8Pp8clZXKsNkUZ7N123ZrXS5tjImTa69LkqfbtnsiqgsGFQj4FRvrVQwf/EzBGJiL/R8edcE61VW7FFfXgQTjW4KSgsE6WSwxYgSMVRcMyuPxKKhvgsIaX1Aui7RhR7lkqWi3D4ssio+PN+314o2JUUxKomIsPSeEMvrvUI3VKqurOmo/2HdEXV2d3B63fPE+xZgYsEYr9v9xGSkZSk1NNbsMoFMIoNAgOTlZ182aJbfbbXYpLXI6ndq3b58GDx4su93erdu+/Kc/VXV1dbdu80TkdrtVWFiogQMHKiEhwexyohJjYC72f3hUVFTo3ytX6sKEBKXEx3dqXb/Pp7KyMqWnp8vajV9GRKPKigq9/vYbiukfI1vC8UPGL4/WalthhUad3ku9eye1ub7P7VddUZ2u+MFEpfXq1Q0VN1Xt8egDt1sX33CDepmwfaMY/XcoPj5eSUltj220c7lc2r9/v0477TQlJiaaXU7UYf8fl5qaqqysLLPLADqFAApNJCcnKzk52ewyWuR0OlVSUqLMzMxuD6AyMzO7dXsnKqfTKbvdrjPOOKPbxwDHMQbmYv+HR2lpqQ68955GZWQos5PvOR6vV0eKi5Wdk9Otl2NHo2MlJdqyIUGJA+2KTz2+r6uDQcV+Xam+OUnql5ve5vqeKq9cFU6NGjxYfUz4kFRaU6OvHA6dc845Pep9nL9D5nM6nfJ4PDr11FMZAxOw/4HIFb3nLAIAAAAAAKBbEEABAAAAAADAUARQAAAAAAAAMBQBFAAAAAAAAAxFAAUAAAAAAABDEU
ABAAAAAADAUARQAAAAAAAAMBQBFAAAAAAAAAxFAAUAAAAAAABDEUABAAAAAADAUARQAAAAAAAAMBQBFAAAAAAAAAx
"text/plain": [
"<Figure size 1200x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np\n",
"\n",
"# Butterfly (back-to-back) chart comparing the `c_score` distributions of\n",
"# DistilBERT (bars drawn to the left) and KNN TF-IDF (bars drawn to the right).\n",
"\n",
"# Group numbers 1 through 5.\n",
"group_numbers = range(1, 6)\n",
"\n",
"# Pool each model's per-group CSVs with a single concat; this avoids\n",
"# re-copying the accumulated frame on each loop iteration.\n",
"combined_data_tfidf = pd.concat(\n",
"    [\n",
"        pd.read_csv(f'0.class_document/knn_tfidf/{group_number}/test_p_c.csv')\n",
"        for group_number in group_numbers\n",
"    ],\n",
"    ignore_index=True,\n",
")\n",
"combined_data_bert = pd.concat(\n",
"    [\n",
"        pd.read_csv(f'0.class_document/distilbert/{group_number}/test_p_c.csv')\n",
"        for group_number in group_numbers\n",
"    ],\n",
"    ignore_index=True,\n",
")\n",
"\n",
"# No row filtering is applied; these aliases only keep the existing names.\n",
"filtered_data_tfidf = combined_data_tfidf\n",
"filtered_data_bert = combined_data_bert\n",
"\n",
"# c_score values for the rows where ctp_correct is True / False, per model.\n",
"correct_scores_tfidf = filtered_data_tfidf.loc[filtered_data_tfidf['ctp_correct'].eq(True), 'c_score']\n",
"incorrect_scores_tfidf = filtered_data_tfidf.loc[filtered_data_tfidf['ctp_correct'].eq(False), 'c_score']\n",
"\n",
"correct_scores_bert = filtered_data_bert.loc[filtered_data_bert['ctp_correct'].eq(True), 'c_score']\n",
"incorrect_scores_bert = filtered_data_bert.loc[filtered_data_bert['ctp_correct'].eq(False), 'c_score']\n",
"\n",
"# Plot style.\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"# Bin edges 0.00, 0.05, ..., 1.00 (21 edges, width 0.05). linspace fixes the\n",
"# edge count explicitly instead of relying on float rounding in np.arange.\n",
"bins = np.linspace(0, 1, 21)\n",
"\n",
"# Per-bin counts for each model and outcome.\n",
"tfidf_correct_hist, _ = np.histogram(correct_scores_tfidf, bins=bins)\n",
"tfidf_incorrect_hist, _ = np.histogram(incorrect_scores_tfidf, bins=bins)\n",
"\n",
"bert_correct_hist, _ = np.histogram(correct_scores_bert, bins=bins)\n",
"bert_incorrect_hist, _ = np.histogram(incorrect_scores_bert, bins=bins)\n",
"\n",
"# Draw the butterfly chart: one horizontal bar per bin, centred on the bin.\n",
"fig, ax = plt.subplots(figsize=(12, 7))\n",
"\n",
"bin_centers = 0.5 * (bins[1:] + bins[:-1])\n",
"\n",
"# Left side: DistilBERT. Counts are negated so the bars extend leftwards.\n",
"bert_correct_bars = ax.barh(bin_centers, -bert_correct_hist, height=0.05, color='green', alpha=0.7, label='DistilBERT Correct', edgecolor='black')\n",
"bert_incorrect_bars = ax.barh(bin_centers, -bert_incorrect_hist, height=0.05, color='red', alpha=0.4, label='DistilBERT Incorrect', edgecolor='black')\n",
"\n",
"# Right side: TF-IDF.\n",
"tfidf_correct_bars = ax.barh(bin_centers, tfidf_correct_hist, height=0.05, color='green', alpha=0.7, label='TF-IDF Correct', edgecolor='black')\n",
"tfidf_incorrect_bars = ax.barh(bin_centers, tfidf_incorrect_hist, height=0.05, color='red', alpha=0.4, label='TF-IDF Incorrect', edgecolor='black')\n",
"\n",
"# Axis labels and tick label sizes.\n",
"ax.set_xlabel('Frequency', fontsize=18)\n",
"ax.set_ylabel('Score', fontsize=18)\n",
"ax.tick_params(axis='x', labelsize=16)\n",
"ax.tick_params(axis='y', labelsize=16)\n",
"\n",
"# Score ticks every 0.1 from 0.0 to 1.0.\n",
"ax.set_yticks(np.linspace(0, 1, 11))\n",
"\n",
"# Frequency ticks every 2000 from -10000 to 10000. The left side is plotted\n",
"# with negative counts, so the labels show absolute values.\n",
"x_ticks = np.arange(-10000, 10001, 2000)\n",
"ax.set_xticks(x_ticks)\n",
"ax.set_xticklabels([str(abs(x)) for x in x_ticks])\n",
"\n",
"# Two legends: DistilBERT at lower left, TF-IDF at lower right. Handles are\n",
"# passed explicitly; a labels-only ax.legend([...]) call binds the labels to\n",
"# the first plotted artists, which would attach the TF-IDF labels to the\n",
"# DistilBERT bars.\n",
"legend_bert = ax.legend(handles=[bert_correct_bars, bert_incorrect_bars], fontsize=20, loc='lower left')\n",
"legend_tfidf = ax.legend(handles=[tfidf_correct_bars, tfidf_incorrect_bars], fontsize=20, loc='lower right')\n",
"\n",
"# The second ax.legend call replaced the first legend; add it back so both show.\n",
"ax.add_artist(legend_bert)\n",
"\n",
"# Tighten the layout and render.\n",
"fig.tight_layout()\n",
"plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "torch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}