hipom_data_mapping/data_import/plot_count.ipynb

219 lines
140 KiB
Plaintext
Raw Normal View History

2024-08-26 19:51:11 +09:00
{
"cells": [
{
"cell_type": "code",
2024-09-25 08:52:30 +09:00
"execution_count": 10,
2024-08-26 19:51:11 +09:00
"metadata": {},
"outputs": [
2024-09-25 08:52:30 +09:00
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total SD: 62071\n",
"Total PD: 10530\n"
]
},
2024-08-26 19:51:11 +09:00
{
"data": {
2024-09-25 08:52:30 +09:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA28AAAIhCAYAAADZ6oJUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABYP0lEQVR4nO3deXgUVfr//U8n6ewLO2EVENkRUcBxgBGGTVQUQUdHERUXxnFUBFFRB4KjMgii87iNwiiouAE6bgjGCIjiBuIoCIiQsAQBIYRskHTS9fzBN/VLSCck6a6uruT9uq5cV6XqnDqn6ZtK7pzqul2GYRgCAAAAAIS0MLsnAAAAAAA4NZI3AAAAAHAAkjcAAAAAcACSNwAAAABwAJI3AAAAAHAAkjcAAAAAcACSNwAAAABwAJI3AAAAAHCACLsn4HRer1f79u1TQkKCXC6X3dMBAAAAYBPDMJSbm6uWLVsqLCzw62Qkb37at2+f2rRpY/c0AAAAAISIPXv2qHXr1gE/L8mbnxISEiRJ6enpatSokc2zQV3i8Xj08ccfa/jw4XK73XZPB3UM8QWrEFuwEvEFqwQqtnJyctSmTRszRwg0kjc/ld4qefsNByVvYVDHfv+p14M6Xp1wZordM6g2j8ej2NhYJSYm8gMKAUd8wSrEFqxEfMEqgY4tqz5OxQNLAAAAAMABSN4AAAAAwAFI3gAAAADAARyVvC1cuFAul+uUX5988kml59ixY4cmTpyo9u3bKzo6Wk2bNtWIESO0bNmyIL4SAAAAAKgZRz6wJCwsTE2bNq30eFRUlM/9y5cv1xVXXKGCggJJUmJiorKysvTxxx/r448/1g033KD//Oc/1GsDAAAAEHIctfJWqk2bNtq/f3+lXwMHDqzQJz09XX/6059UUFCg/v37a9u2bTp69KiOHj2q6dOnS5JeeuklzZkzJ9gvBwAAAABOyZErb7Uxffp05efnKzk5WR988IEaNGggSYqPj9fMmTO1f/9+vfDCC3rkkUd08803q2HDhjU6/8Klp6tx48YWzLwqKUEeDwAAAIBdHLnyVlP5+fnmZ9puvfVWM3Era9q0aZJOFNb773//G8TZAQAAAMCp1YuVt88//1zHjh2TJI0cOdJnm3bt2qlr167asmWL+fm3mrj+8h2S95Dfc62uel+g20HFtgEAAIBAcOTK22+//aZzzjlH8fHxiomJUYcOHTRu3DitXr3aZ/tNmzaZ2z169Kj0vKXHNm/eHND5AgAAAIC/HLnyVlBQoO+++04NGzZUfn6+0tPTlZ6ersWLF+uGG27QCy+8oIiI//fS9u3bJ0lq2LChYmJiKj1vq1atyrX3pbCwUIWFheb3OTk5kiR3RImkEn9eVo14vI7MuwPH47F7Bpbz/N9r9NSD14rgI75gFWILViK+YJVAxZbVsemo5K1ly5aaMWOGxowZo86dOysqKkolJSX6+uuvNWPGDH3yySd66aWXFBcXp6eeesrsl5ubK0mKjY2t8vylx0vb+zJr1izNnDmzwv4rJmSc8vyBtHxXr6CNFZJ2Lbd7BkGTmppq9xRQhxFfsAqxBSsRX7CKv7FVWpLMKi7DMAxLRwgSr9erMWPG6N1331VYWJi2bt2qM844Q5J0yy23aP78+WrVqpX27t1b6TkeeOABPfroo4qMjCy3ulaWr5W3Nm3a6LKh6yQlBfQ1VeXNOUuDNlZI6j7N7hlYzuPxKDU1VcOGDZPb7bZ7OqhjiC9YhdiClYgvWCVQsZWTk6MmTZro6NGjSkxMDOAMT3DUyltVwsLCNHfuXL377rvyer16//33NXnyZElSQkKCpFNnwqXHS9v7EhUV5bMIuKc4XPKG13b6NeYO8wZtrJBUjy7YbrebH1CwDPEFqxBbsBLxBav4G1tWx2Wd+uBUx44d1aRJE0nSzp07zf0tW7aUJB05csR86qQvmZmZ5doDAAAAQKioMytvVSn7hMlNmzapb9++PtuVPp
Wye/fuNR4j+EW6U4I4FgAAAAC71amVtx07dujQoRO11tq3b2/uHzBggPmUyRUrVvjsu2vXLm3ZskWSNHz4cItnCgAAAAA145iVN8Mw5HK5qjw+depUSSc+/3bxxRebx+Li4jR27Fi9+uqreu6553THHXcoKan8w0Vmz54t6cTn3UaPHl3j+QW7SHdV6n0B77rCGyapl7R5llTfP+OIwCO+YBViC1YivnCyM1PsnkFQOWblbdeuXerXr5+ef/557dy5U6UPyfR6vfrqq680cuRIvfPOO5KkiRMnqnPnzuX6P/TQQ4qLi9Ovv/6qUaNGafv27ZKk/Px8PfTQQ/r3v/8tSXrwwQfVsGHDIL4yAAAAADg1x6y8SdK3336rb7/9VtKJpz4mJCQoNze33KP7b7jhBv1//9//V6Fv+/bt9dZbb+mKK67Q2rVr1alTJyUlJSkvL08lJSVm39LVOwAAAAAIJY5J3po3b66nnnpKX375pb7//nv99ttvOnLkiKKjo9W+fXv9/ve/14QJE9S/f/9Kz3HhhRfqhx9+0OzZs5Wamqpff/1VDRs2VO/evTVx4kSNHTs2iK8IAAAAAKrPMclbTEyM/va3v+lvf/ubX+c5/fTT9cILLwRoVgAAAAAQHI75zBsAAAAA1GckbwAAAADgAI65bTLUBb9Id1VS7J4AAsHjkXYtl7pPk9xuu2eDuob4glWILViJ+EI9R/IWIKFQ5436bnUMtWyco57VmAEAAPbgtkkAAAAAcACSNwAAAABwAJI3AAAAAHAAkjcAAAAAcACSNwAAAABwAJI3AAAAAHAASgUESGjUeUuxeXwEFLVsAAAAUAYrbwAAAADgAKy8BUgoFOk+WUgU7aZ4MQAAABAQrLwBAAAAgAOQvAEAAACAA5C8AQAAAIADkLwBAAAAgAOQvAEAAACAA5C8AQAAAIADUCogQEKjSPfJUuyeAAAAAIAAYeUNAAAAAByAlbcACXaR7pAowB0oFPIGAAAATomVNwAAAABwAJI3AAAAAHAAkjcAAAAAcACSNwAAAABwAJI3AAAAAHAAkjcAAAAAcABKBQRI8It0pwRxLAAAAAB2Y+UNAAAAAByAlbcACXaRbl/qVOFuSN4wSb2kzbOkMK/ds6m7KBIPAAAcgpU3AAAAAHAAkjcAAAAAcACSNwAAAABwAJI3AAAAAHAAkjcAAAAAcACSNwAAAABwAEoFBEjwi3T7kmLz+Agoj0fatVzqPk1yu+2eDQAAAGzGyhsAAAAAOAArbwESCkW6S1Gsu46gSDesVJP4opA5AAAhgZU3AAAAAHAAkjcAAAAAcACSNwAAAABwAJI3AAAAAHAAkjcAAAAAcACSNwAAAABwAEoFBEhoFOkulWL3BBAIFOmGlYgvAAAch5U3AAAAAHAAVt4CxM4i3RTlrqMo0g0rEV+oLYq2A4BtWHkDAAAAAAcgeQMAAAAAByB5AwAAAAAHIHkDAAAAAAcgeQMAAAAAByB5AwAAAAAHoFRAgNhbpDvFpnFhKYoow0rEFwAAjkPyFiB21HmjvlsdF6p1uKjxBAAAYAtumwQAAAAAByB5AwAAAAAHIHkDAAAAAAcgeQMAAAAAByB5AwAAAAAHIHkDAAAAAAcgeQMAAAAAB6DOW4DYU6Q7JcjjIagoogwAAIAySN4CxOoi3dUqyE3xZAAAAKDO4rZJAAAAAHAAkjcAAAAAcACSNwAAAABwAMcnb//85z/lcrnMr6rk5uYqJSVFPXv2VHx8vJKSktS3b189/vjjKioqCtKMAQAAAKDmHP3Akm3btmnmzJnVartr1y4NGjRIGRkZkqTY2FgVFhZq/fr1Wr9+vRYvXqy0tDQ1bNjQwhkDAAAAQO04duXN6/VqwoQJOn78uM4777wq2xYXF2vUqFHKyMhQixYtlJqaqvz8fBUUFOiNN95QQkKCNm7cqHHjxgVp9gAAAABQM45deXvqqae0bt06XXPNNerYsaO+/PLLStsuWrRIP/74oy
Rp2bJlZrIXFhamK6+8Ul6vV1dffbWWL1+utLQ0DRkypMbzsb7OW4qF5wYAAAAQ6hy58paenq4HHnhAjRs31hNPPHH
2024-08-26 19:51:11 +09:00
"text/plain": [
2024-09-25 08:52:30 +09:00
"<Figure size 1000x600 with 1 Axes>"
2024-08-26 19:51:11 +09:00
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"# Use a larger font for every figure element\n",
"plt.rcParams.update({'font.size': 18})\n",
"\n",
"# Load the raw data\n",
"df = pd.read_csv('raw_data.csv')\n",
"\n",
"# Number of rows per ships_idx (plotted as 'SD'), ordered by ships_idx\n",
"total_counts = df['ships_idx'].value_counts().sort_index()\n",
"\n",
"# Number of rows per ships_idx whose MDM flag is True (plotted as 'PD')\n",
"mdm_true_counts = df.loc[df['MDM'].eq(True), 'ships_idx'].value_counts().sort_index()\n",
"\n",
"# Combine both counts into one frame indexed by ships_idx.\n",
"# A ship without any MDM=True row is missing from mdm_true_counts, which makes\n",
"# the aligned 'PD' column NaN/float; fill with 0 and cast back to int so the\n",
"# counts (and the printed totals) stay integral.\n",
"summary_df = pd.DataFrame({\n",
"    'SD': total_counts,\n",
"    'PD': mdm_true_counts\n",
"}).fillna(0).astype(int)\n",
"\n",
"# Overall totals of SD and PD\n",
"total_SD = summary_df['SD'].sum()\n",
"total_PD = summary_df['PD'].sum()\n",
"\n",
"print(f\"Total SD: {total_SD}\")\n",
"print(f\"Total PD: {total_PD}\")\n",
"\n",
"# Plot: one horizontal bar per ship\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"\n",
"# Draw the per-ship totals first, then overlay the MDM=True subset on the same bars\n",
"summary_df['SD'].plot(kind='barh', ax=ax, color='orange', alpha=0.5, label='SD', width=0.8)\n",
"summary_df['PD'].plot(kind='barh', ax=ax, color='blue', alpha=0.7, label='PD', width=0.8)\n",
"\n",
"# Show a y tick on every 10th bar. pandas draws bar i at y position i, so a tick\n",
"# position is a row offset into summary_df, not a ships_idx value; take the label\n",
"# from the index so it stays correct even if ships_idx is not exactly 0..n-1.\n",
"tick_positions = np.arange(0, len(summary_df), 10)\n",
"ax.set_yticks(tick_positions)\n",
"ax.set_yticklabels(summary_df.index[tick_positions])\n",
"\n",
"ax.grid(True)\n",
"\n",
"# Legend and axis labels\n",
"ax.legend(prop={'size': 18})\n",
"ax.set_xlabel('Counts')\n",
"ax.set_ylabel('Ships')\n",
"\n",
"plt.show()\n"
]
2024-09-25 08:52:30 +09:00
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tag Description의 평균 글자수: 27.38\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA5EAAAJACAYAAAAUx+KBAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAACmTElEQVR4nOzdd3xUVf7/8fckpDdCDT0BiVQVRF0EFJAqoKwIoiBIFxRFBVdWpNmW7u6qIEUERKmKLFKCCArYAAuCCAhJiNRISSdt7u8PfpnvhEwmk+Smkdfz8ZjHYzLnfM49c3PmZj45955rMQzDEAAAAAAALnAr6Q4AAAAAAMoOkkgAAAAAgMtIIgEAAAAALiOJBAAAAAC4jCQSAAAAAOAykkgAAAAAgMtIIgEAAAAALiOJBAAAAAC4jCQSAAAAAOAykkigHLBYLLJYLGrfvn1Jd6VAnnjiCdt7iIqKylG+a9cuW/nUqVOLvX9muVHeR3GJiYnRs88+qyZNmsjf39+272677baS7hrKqdDQUFksFoWGhpZ0V3KYOnWq7TOya9euku4OSrm8/u4CFUq6AwCusVgsDl/39PRUYGCggoKCVK9ePbVs2VJ33XWXevToIR8fn2LupWNZCU9oaKieeOKJEu1LaXLlyhW99dZbkqTbbrtNvXv3LtH+3Eh+//133X333bp8+XKB4j/44AMNGTLElL5ERkaWyqTBmbJ8vME1P//8szZs2CBJ6t27d7n/54n9mDYMowR7UjoxXmA2kkiglEtLS9Nff/2lv/76SydOnNCXX34pSapYsaIGDx6sadOmKSgoqET7OG3aNEnSvffeSxJp58qVK7Z9M3jwYJJIE7344ou2BLJnz5568MEHVaVKFUkq8c9DWVYWjje45ueff7YdX0JDQ0kK4BTjBWYjiQRKoU8//dT23DAMxcXF6fLly/r555/19ddfKyoqSleuXNG///1vrV+/Xh9//LHatm2ba3tl/b+yH3zwgT744IOS7kaRa9++fZn/XRWH9PR0bd++XZLUuHFjbdy4MdeZtdx07Ngx2+fsev/5z3+0c+dOSdLYsWPVsWPHXOtWq1YtX9subcw+3pRnpfm0v6lTp3KaPADTkEQCpZCzGSvDMLRlyxaNGzdOx48f159//qmePXtq7969atq0afF1Eighf/31l65evSpJuvXWW/OdQEpS3bp1Vbdu3VzLs077kqSWLVve0LPIHG8AAPnFwjpAGWOxWHT//fdr//79ttmAuLg49e3bV1artYR7BxS91NRU23MvL68S7MmNj+MNAMARkkigjAoMDNSaNWtUsWJFSdKRI0e0evVqh3VdWZ31zJkzmjx5slq3bq1KlSrJw8NDwcHBatiwodq1a6fnn39eu3fvdthulq+++sr2mv3DfiVARyuQHj16VOPGjVPjxo0VGBgoi8WS7fTVgqwS9+uvv2rkyJFq0KCBfHx8VLVqVXXq1Ekff/yx07gPPvjAtq28TqHNrW5UVJQsFovCwsJsry1btszhvrF/P/lZnTU5OVnz5s1Thw4dFBISIi8vL1WrVk1t27bVm2++qbi4OKfxjlZq/O677zRgwADVq1fP1l7Pnj21detWp23l16VLl/Tqq6+qdevWqlq1qjw9PVWjRg116tRJ//3vf22zjNfLGgd57dfiOqUwPj5eK1eu1LBhw9SiRQtVrFhRHh4eqlSpklq2bKkXXnhBJ06ccLm9EydOaMyYMbrpppvk4+OjatWq6Z577tHChQuVmZkpqeRWWs7P8SZLWlqalixZogceeEB16tSRt7e3KlasqFtuuUUvvPCCS7+nghyXrmcYhjZs2KDHH39cDRs2VGBgYLYx98Ybbzjsi6PPyI4dO/Too48qLCxM3t7eOcZbXquzOmpz8+bNevDBB1W7dm15eXmpdu3aevTRR/Xtt986bCPruGO/MNSQIUNyfA6u70N+VmeNiYnRSy+9pJYtW6pSpUry8vJSrV
q11KtXL33wwQe28Zib9u3b5/j7sGrVKnXu3Nl2vKpXr56eeOIJ/f77707bKm5Wq1Vr1qzRI488orCwMPn6+iogIECNGjXS6NGj9euvvzqNL4pja1pamt566y397W9/U3BwsPz9/dW4cWNNmDBBp06dkpT738mCjhdHtm3bpt69e9vGas2aNdW3b199//33ecaa8VlGKWMAKBUk2R758eKLL9riOnXq5LTte++912H5pk2bDH9//2x9cPQICgrKtc/OHjt37rTF7Ny50/b6lClTjGXLlhk+Pj45YpYuXWqLGTx4sO31yMjIHP2/vs3ly5cbXl5eufanR48eRkpKisN9sXTpUod9yE/dyMhIl/eN/fu5/n3k5ttvvzVq1qzptN3KlSsb27Zty7WNKVOmZPv9vPbaa4abm1uu7U2ePNnpvnDVhg0bjIoVKzrte926dY0ff/wxR6z9OHB1nxaU/bYcjYPU1FSnYyzrUaFCBePtt9/Oc3sfffSRw8+B/Wf3ypUreX6WXVWUxxvDMIx9+/YZYWFhTveNp6ensWDBglzbKOhxyd4ff/xhtGzZMs82QkNDc8Taf0a+/PJL46mnnspzvNWrV8+QZNSrV89hf67/3I0ZMybXPrm5uRlTp07N0Yb9ccfZ4/o+XL/t3CxYsMDpWJRkNG/e3Onn7N5777XVTUlJMXr37p1rW15eXsbmzZtzbctVBR3T9v744w/jtttuc/re3dzcjFdeeSXXNsw+tv75559G06ZNc40PDg42duzYkevfyYKOF/v2Tpw4YYwePdrpPlm8eHGu78GMzzJKH66JBMq4xx57TDNnzpQkffPNN0pPT5eHh4fL8adPn1b//v2VmJgoSerRo4c6d+6smjVrymq16sKFC/rll1+0ffv2HLNbWQty/P3vf5ckNW3aVK+99lqObTRr1szhtvfu3avXX39d7u7uGjZsmNq0aSNvb28dPXpUISEhLr8He/v27dMbb7whSRo6dKjuueceubu7a9++fVqyZImSkpL0+eefa+DAgVq3bl2BtpGXatWq6dNPP9WFCxc0atQoSVKHDh30zDPPOKybHz/99JM6duyolJQUSVKLFi302GOPqW7dujp37pzWrFmjvXv36uLFi+rZs6ciIiLynLVauHChPv74Y9WqVUtPPPGEmjZtqrS0NG3dulWrV6+WYRiaPn267r33XqcLzORl8+bN6tOnj20W45577tHDDz+s6tWrKzo6WitWrNCvv/6qU6dO6d5779UPP/ygRo0a2eKfeeYZ9e7dO8/9WhwL3VitVqWmpqpmzZrq3LmzbrnlFlWvXl1ubm6KiYnRN998o40bNyojI0NPP/20atasafucXG/Hjh16/PHHbfvl3nvv1cMPP6xq1arp1KlTWrFihb766iuNGDGiyN9XXlw53nz77bfq1KmTkpOTZbFY1LVrV3Xp0kW1atVSSkqKvv32W61YsULJycl68skn5eXllWNV58Icl7IcO3ZMrVu31qVLlyRJNWrU0COPPKJbb71Vfn5+unDhgvbv369NmzbluaDVrFmztGXLFoWEhOiJJ55Qs2bNlJGRoR9++KHAp1T/+9//1oYNG1SlShUNHz5ct9xyi5KTk7V161atX79eVqtVU6dOVeXKlfX000/b4rIWhfryyy/13//+V5LjxZ98fX3z3af33ntPTz75pO3nXr16qUePHqpYsaKOHTumpUuXKjIyUr/++qvatm2rn376SVWrVnXa5tChQ7Vhwwbdfvvt6t+/v+rWrau//vpLK1eu1DfffKPU1FQNHDhQR48eta2wXBJOnDihv/3tb/rrr78kSW3btlXPnj1Vr149ZWZm6sCBA/rggw90+fJlvfrqq3Jzc8vzjJHCHltTUlLUuXNnHTlyRJJUs2ZNDR06VE2bNlVSUpK++OILrVmzRn379lWLFi0c9sGM8TJp0iR9/PHHCg8P16BBg3TTTTcpISFBn3zyibZs2SKr1aoxY8aoTZs22Y
7ZkjmfZZRSJZzEAvj/VMD/omZkZBh+fn622J9//jnXth3NXsyaNctWPmPGjFy3Y7Vaja+//tpp312ZHbGfbZNkhIS
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tag Description의 평균 토큰 수: 5.29\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA5EAAAJACAYAAAAUx+KBAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAACQJklEQVR4nOzdd3xUVf7/8fckpHdKCAQkQUGqCrq6QABBaSoaRRARRXpR1KUpuwgBsdB010ak2lCpoiJVigiiC6KiCBZIIgsCgZAOqff3B7/c74RMkhuYlIHX8/GYx2Nmzvmce+ae3Ek+OfeeazMMwxAAAAAAABa4VXYHAAAAAACugyQSAAAAAGAZSSQAAAAAwDKSSAAAAACAZSSRAAAAAADLSCIBAAAAAJaRRAIAAAAALCOJBAAAAABYRhIJAAAAALCMJBKAJMlms8lms+nWW2+t7K5clEcffdT8DPHx8UXKt23bZpbHxMRUeP+c5XL5HBXlyJEjevLJJ9WsWTP5+/ub++6GG26o7K7hChURESGbzaaIiIjK7koRMTEx5jGybdu2yu7OZeXWW2819y1wOahW2R0AYF1xv3w8PT0VGBiooKAgNWjQQK1bt9Ytt9yiO++8Uz4+PhXcS8cKEp6IiAg9+uijldqXqiQ5OVn//ve/JUk33HCDoqOjK7U/l5ODBw+qbdu2OnPmzEXFv/322xo4cKBT+hIXF1clk4aSuPL3Dc774YcftHr1aklSdHT0FffPk/j4eEVGRjqlrcWLF/O7C7BDEglcBrKzs3Xq1CmdOnVKhw4d0pYtWyRJwcHBGjBggKZOnaqgoKBK7ePUqVMlSR07duQXsZ3k5GRz3wwYMIAk0okmTJhgJpB33XWX7rnnHtWsWVOSKv14cGWu8H2D83744Qfz+yUiIuKKSyIBlB+SSMBFffzxx+ZzwzCUkpKiM2fO6IcfftD27dsVHx+v5ORk/ec//9HKlSv14YcfKioqqtj2DMOoiG6Xm7fffltvv/12ZXej3N16660uP1YVIScnR5s2bZIkNW3aVJ9++mmZTyPr3LlzoePsQq+++qq2bt0qSRo9erQ6d+5cbN3Q0NAybbuqcfb3zZXM0en2VUVMTMxldZp8aGhoicfwli1b9Nprr0mSOnXqpCeeeKLYuq1bt3Z6/wBXRhIJuKiSZqwMw9C6dev01FNP6ffff9f//vc/3XXXXdq5c6eaN29ecZ0EKsmpU6d07tw5SdL1119/UdchXXXVVbrqqquKLS84TVA6/wfm5TyLzPcNXJGvr2+JP7vJycnm86uuuuqyPoYBZ2NhHeAyZLPZdMcdd2jPnj3mbEBKSop69+6t/Pz8Su4dUP6ysrLM515eXpXYk8sf3zcAcOUhiQQuY4GBgVq2bJmCg4MlSQcOHNDSpUsd1rWyOuuxY8c0efJktWnTRtWrV5eHh4dCQkLUqFEjtW/fXmPGjNFXX33lsN0CX375pfme/cN+JUBHK5D++uuveuqpp9S0aVMFBgbKZrMVOn21tNVZHfnpp580bNgwXX311fLx8VGtWrV0++2368MPPywx7u233za3VdoptMXVjY+Pl81mK7TowzvvvONw39h/nrKszpqZmalXXnlFnTp1UlhYmLy8vBQaGqqoqCi9+OKLSklJKTHe0UqN33zzjR566CE1aNDAbO+uu+7S+vXrS2yrrJKSkvTcc8+pTZs2qlWrljw9PVWnTh3dfvvteu2118xZxgsV/ByUtl8r6pTC1NRULVmyRIMHD1arVq0UHBwsDw8PVa9eXa1bt9bYsWN16NAhy+0dOnRIo0aN0jXXXCMfHx+FhoaqQ4cOmjdvnvLy8iRV3krLZfm+KZCdna2FCxfq7rvvVv369eXt7a3g4GBdd911Gjt2rKVxupjvpQsZhqHVq1fr4YcfVqNGjRQYGFjoZ+6FF15w2BdHx8jmzZv14IMPKjIyUt7e3kV+3kpbndVRm2vXrtU999yjevXqyc
vLS/Xq1dODDz6oXbt2OWyj4HvHfmGogQMHFjkOLuxDWVZnPXLkiJ555hm1bt1a1atXl5eXl8LDw9WzZ0+9/fbb5s9jcRytVvrRRx+pS5cu5vdVgwYN9Oijj+rgwYMltlURLvXzWmU/Bi1atNDRo0eL1Dl16pSef/55tW/fXmFhYfL09FStWrXUvn17zZw5U+np6SVu48KfwdzcXM2bN09RUVGqWbOmfHx81KhRIz3++OP63//+V2qf9+7dqxEjRqhly5YKDAyUh4eHQkND1axZM3Xv3l3PPfecfv/994vaH6jCDAAuQ5L5KIsJEyaYcbfffnuJbXfs2NFh+Zo1awx/f/9CfXD0CAoKKrbPJT22bt1qxmzdutV8f8qUKcY777xj+Pj4FIlZvHixGTNgwADz/bi4uCL9v7DNd9991/Dy8iq2P3feeadx9uxZh/ti8eLFDvtQlrpxcXGW943957nwcxRn165dRt26dUtst0aNGsaGDRuKbWPKlCmFxmf69OmGm5tbse1Nnjy5xH1h1erVq43g4OAS+37VVVcZe/fuLRJr/3NgdZ9eLPttOfo5yMrKKvFnrOBRrVo14/XXXy91ex988IHD48D+2E1OTi71WLaqPL9vDMMwdu/ebURGRpa4bzw9PY3Y2Nhi27jY7yV7f/zxh9G6detS24iIiCgSa3+MbNmyxXjsscdK/Xlr0KCBIclo0KCBw/5ceNyNGjWq2D65ubkZMTExRdqw/94p6XFhHy7cdnFiY2NL/FmUZLRs2bLE46xjx45m3bNnzxrR0dHFtuXl5WWsXbu22LYulv1+GjBgQIV9Xkfy8vKMESNGmHXatm1rJCUlOexzQEBAiX2pXbu28fXXXxfbF/ufwcTERKNdu3bFthUSEmLs2bOn2LamTJli2Gy2Un/W7rnnnmLbgGvimkjgCtCvXz/NnDlTkvT1118rJydHHh4eluOPHj2qvn37mv/dvPPOO9WlSxfVrVtX+fn5OnnypH788Udt2rSpyOxWwaIG9957rySpefPmmj59epFttGjRwuG2d+7cqeeff17u7u4aPHiw2rVrJ29vb/36668KCwuz/Bns7d69Wy+88IIkadCgQerQoYPc3d21e/duLVy4UBkZGfr888/Vv39/rVix4qK2UZqCBR9Onjyp4cOHSyp+YYeyLsry/fffq3Pnzjp79qwkqVWrVurXr5+uuuoqHT9+XMuWLdPOnTt1+vRp3XXXXdq4cWOps1bz5s3Thx9+qPDwcD366KNq3ry5srOztX79ei1dulSGYWjatGnq2LFjiQvMlGbt2rXq1auX+V/9Dh066P7771ft2rWVkJCg9957Tz/99JP+/PNPdezYUf/973/VpEkTM/6JJ55QdHR0qfu1Iha6yc/PV1ZWlurWrasuXbrouuuuU+3ateXm5qYjR47o66+/1qeffqrc3Fw9/vjjqlu3rnmcXGjz5s16+OGHzf3SsWNH3X///QoNDdWff/6p9957T19++aWGDh1a7p+rNFa+b3bt2qXbb79dmZmZstls6tatm7p27arw8HCdPXtWu3bt0nvvvafMzEyNGDFCXl5eRVZ1vpTvpQK//fab2rRpo6SkJElSnTp19MADD+j666+Xn5+fTp48qT179mjNmjWlLmg1a9YsrVu3TmFhYXr00UfVokUL5ebm6r///e9Fn1L9n//8R6tXr1bNmjU1ZMgQXXfddcrMzNT69eu1cuVK5efnKyYmRjVq1NDjjz9uxhUsCmW/cIyjxZ98fX3L3Ke33npLI0aMMF/37NlTd955p4KDg/Xbb79p8eLFiouL008//aSoqCh9//33qlWrVoltDho0SKtXr9aNN96ovn376qqrrtKpU6e0ZMkSff3118rKylL//v3166+/missV5Ty+LwXysrK0kMPPaSVK1dKku644w4tX768yPj85z//0VNPPSXp/Njdf//9atu2rWrUqKFTp05p/fr1+vTTT3XixAndfvvt2r17t5
o1a1bsdnNzc9WrVy/t3LlTnTp1UnR0tOrUqaOjR49qwYIF2r9/v86cOaO+fftq//798vT0LBT/ySefmKv/+vj46ME
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"전체 토큰 수: 328251\n",
"고유 토큰 수: 8283\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"def plot_histogram(values, title, xlabel, color):\n",
"    \"\"\"Show a 30-bin frequency histogram of `values` on a new 10x6 figure.\"\"\"\n",
"    fig, ax = plt.subplots(figsize=(10, 6))\n",
"    ax.hist(values, bins=30, color=color, edgecolor='black', alpha=0.7)\n",
"    ax.set_title(title)\n",
"    ax.set_xlabel(xlabel)\n",
"    ax.set_ylabel('Frequency')\n",
"    ax.grid(True)\n",
"    plt.show()\n",
"\n",
"\n",
"# String form of every tag description.\n",
"# NOTE(review): astype(str) turns a missing description into the literal 'nan'\n",
"# (3 characters, 1 token) - confirm whether raw_data.csv has empty descriptions.\n",
"descriptions = df['tag_description'].astype(str)\n",
"\n",
"# Character count per description and its mean\n",
"df['tag_description_length'] = descriptions.str.len()\n",
"mean_tag_description_length = df['tag_description_length'].mean()\n",
"\n",
"print(f\"Tag Description의 평균 글자수: {mean_tag_description_length:.2f}\")\n",
"\n",
"plot_histogram(\n",
"    df['tag_description_length'],\n",
"    title='Distribution of Tag Description Lengths',\n",
"    xlabel='Tag Description Length (characters)',\n",
"    color='skyblue',\n",
")\n",
"\n",
"# Tokenize once by splitting on a single space; the lists are reused below.\n",
"# NOTE(review): splitting on ' ' yields empty-string tokens for repeated,\n",
"# leading or trailing spaces - switch to split() if those should not count.\n",
"token_lists = descriptions.str.split(' ')\n",
"\n",
"# Token count per description and its mean\n",
"df['tag_description_tokens'] = token_lists.str.len()\n",
"mean_tag_description_tokens = df['tag_description_tokens'].mean()\n",
"\n",
"print(f\"Tag Description의 평균 토큰 수: {mean_tag_description_tokens:.2f}\")\n",
"\n",
"plot_histogram(\n",
"    df['tag_description_tokens'],\n",
"    title='Distribution of Tag Description Tokens',\n",
"    xlabel='Number of Tokens',\n",
"    color='lightgreen',\n",
")\n",
"\n",
"# Flatten the per-row token lists in one linear pass (summing a Series of lists\n",
"# concatenates them pairwise, which is quadratic in the number of rows).\n",
"all_tokens = [token for tokens in token_lists for token in tokens]\n",
"unique_tokens = set(all_tokens)\n",
"\n",
"# Corpus-wide token count and vocabulary size\n",
"total_token_count = len(all_tokens)\n",
"unique_token_count = len(unique_tokens)\n",
"\n",
"print(f\"전체 토큰 수: {total_token_count}\")\n",
"print(f\"고유 토큰 수: {unique_token_count}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
2024-08-26 19:51:11 +09:00
}
],
"metadata": {
"kernelspec": {
"display_name": "torch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}