hipom_data_mapping/data_import/plot_class_token.ipynb

177 lines
219 KiB
Plaintext
Raw Normal View History

2024-09-25 08:52:30 +09:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Concatenated data saved to output/thing_property_grouped.csv\n",
"Total number of unique thing_property combinations: 691\n"
]
}
],
"source": [
"import pandas as pd\n",
"import os\n",
"import re\n",
"\n",
"# Load the raw tag table (relative path, resolved against the current working directory).\n",
"df = pd.read_csv('raw_data.csv')\n",
"\n",
"# Keep only the rows whose MDM flag equals True.\n",
"mdm_true_df = df[df['MDM'].eq(True)]\n",
"\n",
"\n",
"def _join_non_null(values):\n",
"    \"\"\"Join the non-null entries of one group into a single space-separated string.\"\"\"\n",
"    return ' '.join(values.dropna().astype(str))\n",
"\n",
"\n",
"# Single pass over the (thing, property) groups instead of three groupbys plus two merges:\n",
"#   mapping_count   - number of rows in the group\n",
"#   tag_description - space-joined descriptions of the group\n",
"#   tag_name        - space-joined tag names of the group\n",
"# NaN entries are dropped (not replaced) before joining, so a group without any text\n",
"# yields an empty string.\n",
"thing_property_grouped = (\n",
"    mdm_true_df\n",
"    .groupby(['thing', 'property'])\n",
"    .agg(\n",
"        mapping_count=('tag_description', 'size'),\n",
"        tag_description=('tag_description', _join_non_null),\n",
"        tag_name=('tag_name', _join_non_null),\n",
"    )\n",
"    .reset_index()\n",
")\n",
"\n",
"# Tokenise each concatenated description once (maximal runs of non-whitespace characters)\n",
"# and derive both token statistics from the same token lists.\n",
"description_tokens = thing_property_grouped['tag_description'].apply(lambda text: re.findall(r'\\S+', text))\n",
"\n",
"# Total number of tokens per group.\n",
"thing_property_grouped['td_token_count'] = description_tokens.map(len)\n",
"\n",
"# Number of distinct tokens per group.\n",
"thing_property_grouped['unique_token_count'] = description_tokens.map(lambda tokens: len(set(tokens)))\n",
"\n",
"# Pattern column: every digit in 'thing' and 'property' is replaced by '#',\n",
"# then the two masked strings are joined with a single space.\n",
"thing_pattern = thing_property_grouped['thing'].str.replace(r'\\d', '#', regex=True)\n",
"property_pattern = thing_property_grouped['property'].str.replace(r'\\d', '#', regex=True)\n",
"thing_property_grouped['pattern'] = thing_pattern + \" \" + property_pattern\n",
"\n",
"# One row per (thing, property) pair, so the row count is the number of unique combinations.\n",
"total_thing_property_count = len(thing_property_grouped)\n",
"\n",
"# Destination of the grouped table.\n",
"output_path = 'output/thing_property_grouped.csv'\n",
"\n",
"# Create the output directory if needed; os.makedirs('') raises, so skip it when the\n",
"# path has no directory component.\n",
"output_dir = os.path.dirname(output_path)\n",
"if output_dir:\n",
"    os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"# Write the result as UTF-8 with a BOM ('utf-8-sig').\n",
"thing_property_grouped.to_csv(output_path, index=False, encoding='utf-8-sig')\n",
"\n",
"# Report where the file went and how many combinations it holds.\n",
"print(f\"Concatenated data saved to {output_path}\")\n",
"print(f\"Total number of unique thing_property combinations: {total_thing_property_count}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKYAAAMVCAYAAABN5/yzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1hT59sH8O9Jwt4iQwEFrThq3ataFffee1drHbW2rtpad2u1jtrW1vFz4bauDuuouKhatXXvLSAgUyGEMLLO+wfvOSaQhAAJyYH7c11c4pnPc3LncHLnGQzLsiwIIYQQQgghhBBCCCllImsXgBBCCCGEEEIIIYSUT5SYIoQQQgghhBBCCCFWQYkpQgghhBBCCCGEEGIVlJgihBBCCCGEEEIIIVZBiSlCCCGEEEIIIYQQYhWUmCKEEEIIIYQQQgghVkGJKUIIIYQQQgghhBBiFZSYIoQQQgghhBBCCCFWQYkpQgghhBBCCCGEEGIVlJgihBBCCCGEEEIIIVZRphJTWVlZOH78OJYsWYL+/fujatWqYBgGDMNg0aJFJh8nMTER8+fPR+PGjVGhQgU4OTmhatWq6Nq1K7799lsolUqD+yYlJWHmzJmoWbMmnJycUKFCBbRu3RqbN28Gy7JmqCUhhBBCCCGEEEJI2cCwZShbEhkZiXbt2uldt3DhQpOSU/v27cOECROQkZEBAHB0dIS9vT3/fwBIS0uDp6dngX2vXbuGLl264NWrVwAAV1dX5OTkQKVSAQC6dOmCw4cPw97evog1I4QQQgghhBBCCCl7ylSLKQDw8vJChw4d8Nlnn2Hv3r3w9/c3ed8DBw5g+PDhyMjIwIQJE3Dv3j1kZ2dDKpUiIyMD586dw/Tp02FnZ1dgX6lUip49e+LVq1eoVasWrly5AplMBrlcjp9//hl2dnY4ceIEpk2bZsbaEkIIIYQQQgghhAhXmWoxpVarIRaLdZYFBwcjJiam0BZTCQkJePvtt5GWlobvvvsOM2bMKNK558+fjyVLlsDJyQn37t1DSEiIzvply5bhyy+/hFgsxv379xEaGlqk4xNCCCGEEEIIIYSUNRJrF8Cc8ielimLNmjVIS0tDw4YNMX369CLvv2PHDgDA0KFDCySlAGDq1KlYunQpMjMzsXv3bixevNjkY6tUKty4cQN+fn4QicpcIzdCCCGEEEIIIcSmaDQaJCUloWHDhpBIylTqxObQ1f1/XGJp5MiRYBimSPs+evQIL168AAB069ZN7zaurq5o3bo1jh8/joiIiCIlpm7cuIFmzZoVqUyEEEIIIYQQQggpmf/++w9Nmza1djHKNEpMAYiKisLLly8BAI0bN8adO3ewbNkynD17Fq9fv4aPjw9atWqFTz75BK1atSqw/927d/nf69ata/A8devWxfHjx3H//v0ilc/Pzw9A3huiUqVKRdq3qFQqFW7fvo169epRVpgIFsUxETqKYSJ0FMNE6CiGidBRDJdcQkICmjVrxn8eJ5ZDEQrg8ePH/O///PMPFi9eDIVCAScnJzg6OiI+Ph779+/HgQMHsHjxYsyfP19nfy6pBQABAQEGz8Oty8jIQGZmJlxdXfVul5ubi9zcXP7/crkcAODj48MP5i4SiSASiaDRaKDRaPhtueVqtRraw4cZWi4Wi8EwDD9zoEajgUgkQkBAAL+9Nq67ZP7lEokELMvqLGcYBmKxuEAZDS23VJ0KKzvVqezViWVZSCSSApMfCLlOZfF1ojoZrhN3fD8/P51WvEKuU1l8nahOhuvEnTf/MARCrlNZfJ2oTobrpC+GhV6nsvg6UZ0M14n719/fX+f/Qq5Tab9O3L40nI7lUWIKQFpaGv/7/PnzUaVKFWzatAnt27eHSCTC/fv3MWXKFERGRmLBggV4++230b9/f34fmUzG/+7s7GzwPNrrZDKZwcTUsmXL9Hb1u337NhITEwHkJamqV6+OqKgopKSk8NsEBgYiMDAQjx8/hl
Qq5ZdXq1YNvr6+uHv3LrKzs/nltWrVgqenJ27cuKHzJqxYsSLs7e1x9epVnTI0adIECoUCt2/f5peJxWI0bdoUUqkUDx8+5Jc7OTmhfv36SE1NxfPnz/nlHh4eqF27Nl6+fIm4uDh+uaXrVK9ePapTOatTcnJymatTWXydqE6G6/TgwYMyV6ey+DpRnfTXqVKlSmWuTmXxdaI6Ga6Tg4MDrl+/XqbqVBZfJ6qT4TrJ5XKdGC4LdSrN1yk1NRWkdJSpWfn0MWVWvj179mDEiBEA8jK3V69eRaNGjXS2yczMRI0aNZCYmIh69erh1q1b/LqlS5di7ty5AAClUmmwqeSmTZswYcIEAHmtrAx1y8vfYio+Ph516tRBVFQUAgMDAVi2xdSTJ09Qs2ZNiMXicpsdpzoJu04ajQbPnj3DW2+9pdPaRMh1KouvE9XJcJ1YlsXTp09RvXp1nW/phFynsvg6UZ0M1wnIa5H+1ltvUYspqpMg6wTkjSOrHcNCr1NZfJ2oTobrxLIsHj58iBo1alCLqWLWKS4uDiEhIYiNjeU/hxPLoBZTANzc3PjfO3ToUCApBeQNXj5lyhTMnz8ft2/fRlJSEt/XVHv/rKwsuLu76z1PVlaW3nPm5+DgAAcHB/7/GRkZAPLe6PmTXtwbMT/ujWXqcu64KpWKbwHGMIzBJJu+5Ya2N1TGoi4vbp1KspzqJMw6qVQqSKVSs9XVFupUWBmpTmWrTlwMi0QivccXYp04Zel14lCdCpZRpVIhIyPDYAwLsU7FWU51Em6djMWwUOtkrIxUp7JXJ+5znb4YFmqdgNJ9nWhsrtJDnSWhOy5U7dq1DW5Xp04d/veYmBj+98qVK/O/x8fHG9yfW+fu7m6wGx8hhBBCCCGEEEJIeUGJKeQlnAxlWLVpNw3U7h6kPROf9gx9+XHrtBNchBBCCCGEEEIIIeUVJaYAODo6ok2bNgCABw8eGNzu/v37APKSUsHBwfzy0NBQVKlSBQDw119/6d1XLpfj/PnzAIDOnTubo9gWIRKJUK1aNb3NJgkRCopjInQUw0ToKIaJ0FEME6GjGCZCQlH6/8aOHQsAOH36tM7MBZzMzEysW7cOANC8eXP4+Pjw6xiGwejRowEAv/zyC6Kjowvsv3btWmRmZkIsFvMDrdsikUgEX19fuoERQaM4JkJHMUyEjmKYCB3FMBE6imEiJGUuStPS0pCamsr/cCP5Z2Vl6SzPzMzU2W/EiBFo1qwZWJbFgAEDcPr0aX7fBw8eoHfv3khMTIRIJMI333xT4LyzZs2Cv78/srKy0KNHD1y7dg0AoFAosH79esyfPx8AMGHCBISGhlryEpSIWq3GrVu3CsxSQIiQUBwToaMYJkJHMUyEjmKYCB3FMBGSMjfMfMOGDXUGJuesXLkSK1eu5P8/ZswYbNu2jf+/SCTCH3/8gQ4dOuD+/fvo2LEjnJ2dYWdnB6lUCgCws7PD2rVr0b59+wLH9/DwwJEjR9ClSxfcv38fTZo0gZubG3JycqBUKgHkdeH7/vvvzVxj82JZFtnZ2TrjaREiNBTHROgohonQUQwToaMYJkJHMUyEpMwlpkrC398f169fx88//4x9+/bh8ePHyM7ORnBwMNq3b4/p06frDHSeX+PGjXHv3j0sX74cR44cQWxsLFxcXFC3bl2MGTMG48aNs1pTSpZloVQq+VZghqhUKgBATk4OTY9JBMvScSwSiWBnZ6czCQIhhAiJqc8FxUXPE0ToKIaJ0FEM0zO7kDAspVBtXlxcHIKCghAbG4vAwMAi7atWq5GamgqZTMa33DKGZVkoFArY29vTG5gIVmnEsZ2dHdzc3FCxYkWTZvUkpChUKhWuXr2KJk2alNuHSWIZRX0uKC56niBCRzFMhI5iOE9JntlL8jmcFA097ZZharUasbGxyM3NhYeHB1xdXSEWi43emFiWhVqtLnQ7QmyZJeOYO3ZmZibS09ORnZ2NoKAgSk4RsxKLxahVqxbFFT
Gr4jwXFBc9TxChoxgmQlfeY5ie2YWFElNlWGpqKnJzc1GlShU4OTlZuziElCmurq7w8PDAixcvkJqaCj8/P2sXiZQ
"text/plain": [
"<Figure size 1200x800 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"# One figure with two y-axes that share the mapping_count x-axis.\n",
"fig, ax1 = plt.subplots(figsize=(12, 8))\n",
"\n",
"# Left axis: histogram of mapping_count across the rows of thing_property_grouped.\n",
"ax1.hist(thing_property_grouped['mapping_count'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)\n",
"ax1.set_xlabel('Mapping Count', fontsize=24, color='black')\n",
"ax1.set_ylabel('Frequency', fontsize=24, color='black')\n",
"ax1.grid(True, linestyle='--', alpha=0.7)\n",
"\n",
"# Draw all four borders of the plot area in black.\n",
"for side in ('bottom', 'top', 'right', 'left'):\n",
"    ax1.spines[side].set_color('black')\n",
"\n",
"# Larger, black tick labels on both axes of the left plot.\n",
"for axis_name in ('x', 'y'):\n",
"    ax1.tick_params(axis=axis_name, colors='black', labelsize=18)\n",
"\n",
"# Right axis: min, max and mean of unique_token_count for each mapping_count value.\n",
"ax2 = ax1.twinx()\n",
"\n",
"grouped_token_stats = thing_property_grouped.groupby('mapping_count')['unique_token_count'].agg(['min', 'max', 'mean']).reset_index()\n",
"\n",
"# Shaded band between the per-mapping_count minimum and maximum.\n",
"ax2.fill_between(grouped_token_stats['mapping_count'],\n",
"                 grouped_token_stats['min'],\n",
"                 grouped_token_stats['max'],\n",
"                 color='lightgray', alpha=0.5, label='Min-Max Range')\n",
"\n",
"# Mean unique_token_count as a marked line.\n",
"ax2.plot(grouped_token_stats['mapping_count'],\n",
"         grouped_token_stats['mean'],\n",
"         color='red', marker='o', linestyle='-', label='Average Unique Token Count')\n",
"\n",
"ax2.set_ylabel('Unique Token Count (Min/Max/Avg)', fontsize=24, color='black')\n",
"ax2.tick_params(axis='y', colors='black', labelsize=18)\n",
"\n",
"# Logarithmic trendline: degree-1 least-squares fit of the mean against ln(mapping_count),\n",
"# i.e. mean ~ z[0] * ln(mapping_count) + z[1].\n",
"# A straight-line fit is under-determined with fewer than two points, so skip it then.\n",
"if len(grouped_token_stats) >= 2:\n",
"    log_mapping_count = np.log(grouped_token_stats['mapping_count'])\n",
"    z = np.polyfit(log_mapping_count, grouped_token_stats['mean'], 1)\n",
"    p = np.poly1d(z)\n",
"\n",
"    # Evaluate the fitted curve on 500 evenly spaced x values over the observed range.\n",
"    x_vals = np.linspace(grouped_token_stats['mapping_count'].min(), grouped_token_stats['mapping_count'].max(), 500)\n",
"    log_x_vals = np.log(x_vals)\n",
"    y_vals = p(log_x_vals)\n",
"\n",
"    ax2.plot(x_vals, y_vals, color='green', linestyle='--', label='Logarithmic Trendline')\n",
"\n",
"# Build each legend exactly once, after every artist has been added, so the right-axis\n",
"# legend includes the trendline without a second, overriding legend() call.\n",
"ax1.legend(['Frequency'], loc='upper left', fontsize=18)\n",
"ax2.legend(loc='upper right', fontsize=18)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "torch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}