hipom_data_mapping/post_process/tfidf_class/2z.plot_cluster.ipynb

208 lines
1.5 MiB
Plaintext
Raw Normal View History

2024-09-25 08:52:30 +09:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABW0AAAKyCAYAAACuWPzHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxU9b3/8dfsSyaTdbKRnYSwE/ZNRRBFUbFQN6jr9WJr1d+9cntta3t729tquV6LtqWtlVbFJSpqsS4UFIOoIDthhxCy75N1Mpl95vz+CIxEwipLpn6ej4ePxyRz5sx3zrxzPHzmO5+vSlEUBSGEEEIIIYQQQgghhBD9gvpSD0AIIYQQQgghhBBCCCHEl6RoK4QQQgghhBBCCCGEEP2IFG2FEEIIIYQQQgghhBCiH5GirRBCCCGEEEIIIYQQQvQjUrQVQgghhBBCCCGEEEKIfkSKtkIIIYQQQgghhBBCCNGPSNFWCCGEEEIIIYQQQggh+hEp2gohhBBCCCGEEEIIIUQ/or3UA/gmCIVC1NfXEx0djUqlutTDEUIIIYQQQgghhBBCXAKKotDV1UVaWhpq9cnn00rR9iKor68nIyPjUg9DCCGEEEIIIYQQQgjRD9TU1JCenn7S+6VoexFER0cDPW+G1Wq9xKMR/wzq6+tJS0u71MMQ4qxJdkWkkuyKSCXZFZFM8isilWRXRCrJ7sXhcDjIyMgI1wtPRoq2F8GxlghWq1WKtuK8iIqKQqPRXOphCHHWJLsiUkl2RaSS7IpIJvkVkUqyKyKVZPfiOl0LVVmITIgIVFVVdamHIMQ5keyKSCXZFZFKsisimeRXRCrJrohUkt3+RYq2QgghhBBCCCGEEEII0Y9I0VaICBQXF3ephyDEOZHsikgl2RWRSrIrIpnkV0Qqya6IVJLd/kWKtkJEIOkxIyKVZFdEKsmuiFSSXRHJJL8iUkl2RaSS7PYvEVe0/eKLL/j+97/PmDFjiI+PR6fTYbVayc/P59Zbb6WoqAiv13vG+1MUhZUrV3LzzTczcOBATCYTNpuNcePG8Ytf/ILq6uoL+GqEODctLS2XeghCnBPJrohUkl0RqSS7IpJJfkWkkuyKSCXZ7V+0l3oAZ6q1tZX77ruPv//97yfc19XVRVdXF2VlZbz55pv87Gc/Y/ny5UydOvWU+6yvr+fOO++kuLi41+89Hg8tLS1s376dp556it///vfcc8895/PlCCGEEEIIIYQQQgghRJ9UiqIol3oQp+N2u5kyZQolJSXh39lsNkaPHk16ejp2u519+/ZRXl4evt9sNlNcXMzEiRP73KfD4WDq1Kns3bs3/LsJEyYwbNgwOjs7KS4upqOjI3zf8uXLueuuu85p/A6Hg5iYGDo7O7Faree0DyGO5/P50Ov1l3oYQpw1ya6IVJJdEakkuyKSSX5FpJLsikgl2b04zrROGBHtEf73f/83XLBVqVT86le/oqqqijVr1vDXv/6Vd999l7KyMl577TViYmIAcLlcLFy48KT7fOihh8IF2/j4eD7++GM2b97M888/z9tvv01tbS3f+c53wtsvXLiQsrKyC/cihTgLbW1tl3oIQpwTya6IVJJdEakkuyKSSX5FpJLsikgl2e1fIqJo++KLL4Zv/7//9//4yU9+gslk6rWNSqXi9ttv5y9/+Uv4d3v27GHPnj0n7G/v3r28+uqr4Z+LioqYMWNGr22ioqJ46aWXmDJlCtDzacPPfvaz8/FyhPjaXC7XpR6CEOdEsisilWRXRCrJrohkkl8RqSS7IlJJdvuXfl+0dTgcVFVVhX+eP3/+Kbf/1re+hdlsDv9cWlp6wjZ/+tOfCIVCAFx99dXMmjWrz32p1WqefPLJ8M8rVqyQpsyiX9DpdJd6CEKcE8muiFSSXRGpJLsikkl+RaSS7IpIJdntX/p90dbpdPb6OS4u7pTba7XaXv0gjhVnj1EUhXfffTf887333nvK/U2dOpW8vD
wAgsFgr8cKcakMGDDgUg9BiHMi2RWRSrIrIpVkV0Qyya+IVJJdEakku/1Lvy/a2mw2jEZj+Od9+/adcnu73U5zc3P451GjRvW6//Dhw9TW1oZ/vvLKK087hunTp4dvFxcXn3Z7IS60ysrKSz0EIc6JZFdEKsmuiFSSXRHJJL8iUkl2RaSS7PYv/b5oq9PpuO6668I//+pXvzplj40f/vCH4dm1V111FYMGDep1/4EDB8K3U1JSSE1NPe0YxowZ0+fjhRBCCCGEEEIIIYQQ4nzr90VbgCeeeAKLxQLAjh07GDlyJMuXL6esrAyPx0NNTQ0ffPABl19+OS+88AIAQ4cODd8+3qFDh8K3s7Kyzuj5MzMzw7cPHjz4dV6KEOdFbGzspR6CEOdEsisilWRXRCrJrohkkl8RqSS7IlJJdvsX7aUewJkYPHgwGzZs4MYbb6S6upojR45wzz339LltbGwsd955J48//jjR0dEn3N/a2hq+nZycfEbPn5KSEr7tcrnwer0YDIazexFCnEd6vf5SD0GIcyLZFZFKsisilWRXRDLJr4hUkl0RqSS7/UtEzLQFGDlyJKWlpSxdupSoqKiTbjdr1izmz5/fZ8EWei9sZjKZzui5v7rdVxdH+yqv14vD4ej1nxDn0/F9m4WIJJJdEakkuyJSSXZFJJP8ikgl2RWRSrLbv0TETFuAlpYWHn30UV555RX8fj8pKSlMmTKFxMREOjo62Lx5M1VVVbzxxhu88cYb3H///fzxj39Eo9H02o/H4wnfPtNPEL46q9btdp9y+1//+tf84he/OOH3FRUVREdHk5WVRUNDAz6fD5PJRGJiIjU1NQAkJCSgKAptbW1AT2uG5uZmPB4PBoOB5ORkqqurAYiPj0etVtPS0gJAeno6ra2tuN1u9Ho9aWlp4SbSsbGx6HQ67HY70LMiYEdHB93d3Wi1WjIyMqioqAAgJiYGg8EQ/mNNS0vD4XDgdDrRaDRkZWVRUVGBoihYrVbMZjONjY1Az6zk7u5uurq6UKvVZGdnU1VVRTAYxGKxEB0dTUNDA9Az09nj8dDZ2QlAbm4u1dXVBAIBoqKiiI2Npa6uDoCkpCR8Ph8dHR0AZGdnU1dXh9/vx2w2Ex8fH15gLjExkWAwSHt7O9DTBqOxsRGv14vRaMRms/U63vDlDOyMjAzsdnv4eKekpFBVVQVAXFwcGo2m1/Fua2vD5XKh0+kYMGBAr+Ot1+vDx/CrxzszM5Py8vLw8TYajTQ1NQGQmppKV1dXr+NdWVlJKBQiOjqaqKio8HhTUlJwuVw4HA5UKhU5OTm9jrfVaqW+vj58DL1eb/h45+TkUFNT0+fxttls+P3+Xse7vr4+nNmEhIRexzsUCvXKbFNTU/h4JyUl9cqsSqXqdbxbWlrCmU1NTT3j452enh7O7FeP9+kyazKZeh1vp9PZK7PHH2+LxdIrs263+4yP9/GZzcnJoba29owz+896jvD5fNjtdjlHXOBzxPHnZDlHnJ9zRGtrq5wj5DoiIs8RgJwj+jjech0RGecIp9OJy+WSc4RcR0TcOeLYGOQc8WVm5ToiMs4Rfr+f5uZmOUdc4HPEsceejkpRFOWMtryEDh8+zIwZM6itrcVgMPCb3/yG7373u2i1X9acFUXh9ddf53vf+154ZusDDzzAH//4x177evDBB8O/u+2223j99ddP+/wHDhxg6NCh4Z9bWlrCf1x98Xq9eL3e8M8Oh4OMjAw6OzuxWq1n9qKFOAVp0SEilWRXRCrJrohUkl0RySS/IlJJdkWkkuxeHA6Hg5iYmNPWCfv9TNtAIMC8efPCVfRnn322z362KpWK+fPnk5iYyDXXXAPAn/70J+655x4mTJgQ3u7YgmZw+hmzJ9vu+H30xWAwSMjFBdXR0XHGPZmF6E8kuyJSeL1e6uvr8Xg8GI3G8CfqQkSaS3HetdvtFBcX09bWRnx8PDNmzMBms52XfdfW1rJy5Ursdjs2m425c+fK3+
Y/MbluEJFKsisilWS3f+n3Rdu3336bvXv3AlBQUMDdd999yu2vvvpqZs6cydq1awF44YUXehVtj58he2z69+kcm/o
"text/plain": [
"<Figure size 1400x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"# Locations of the two CSV inputs that are combined below.\n",
"train_all_path = '../../data_preprocess/dataset/1/train_all.csv'\n",
"test_path = '../../translation/0.result/1/test_p.csv'\n",
"\n",
"train_df, test_df = (pd.read_csv(path) for path in (train_all_path, test_path))\n",
"\n",
"# Stack both frames and keep only the rows whose 'MDM' flag is True.\n",
"data = (\n",
"    pd.concat([train_df, test_df], ignore_index=True)\n",
"    .loc[lambda frame: frame['MDM'] == True]\n",
")\n",
"\n",
"# Fit TF-IDF on every remaining description; r'\\S+' makes each run of\n",
"# non-whitespace characters a token.\n",
"vectorizer = TfidfVectorizer(token_pattern=r'\\S+')\n",
"tfidf_matrix = vectorizer.fit_transform(data['tag_description'])\n",
"\n",
"# Count the rows of each (thing, property) pair and keep the first pattern value of the pair.\n",
"grouped_data = (\n",
"    data.groupby(['thing', 'property'])\n",
"    .agg(tp_count=('thing', 'size'), pattern=('pattern', 'first'))\n",
"    .reset_index()\n",
")\n",
"\n",
"# For each pattern keep the (thing, property) pair with the largest row count.\n",
"largest_pair_rows = grouped_data.groupby('pattern')['tp_count'].idxmax()\n",
"result = grouped_data.loc[largest_pair_rows].reset_index(drop=True)\n",
"\n",
"# The top_n patterns ranked by that count are the ones that get plotted.\n",
"top_n = 20\n",
"top_n_patterns = result.nlargest(top_n, 'tp_count')\n",
"\n",
"in_top_patterns = data['pattern'].isin(top_n_patterns['pattern'])\n",
"filtered_data = data[in_top_patterns]\n",
"# Persist the selected rows so they can be re-read from disk later.\n",
"filtered_data.to_csv('filtered_data_plot.csv', index=False)\n",
"filtered_tfidf_matrix = vectorizer.transform(filtered_data['tag_description'])\n",
"\n",
"# Project the dense TF-IDF rows to 2-D; random_state fixes the t-SNE layout.\n",
"tsne = TSNE(n_components=2, random_state=42)\n",
"tsne_results = tsne.fit_transform(filtered_tfidf_matrix.toarray())\n",
"\n",
"# Give every distinct pattern its own index into the colormap.\n",
"unique_patterns = filtered_data['pattern'].unique()\n",
"color_map = plt.get_cmap('tab20', len(unique_patterns))\n",
"pattern_to_color = dict(zip(unique_patterns, range(len(unique_patterns))))\n",
"\n",
"plt.figure(figsize=(14, 7))\n",
"\n",
"# One scatter call per pattern so that each pattern is drawn in its own color.\n",
"marker_style = dict(marker='o', s=100, alpha=0.6, edgecolor='k', linewidth=1.2)\n",
"for pattern, color_idx in pattern_to_color.items():\n",
"    points = tsne_results[(filtered_data['pattern'] == pattern).to_numpy()]\n",
"    plt.scatter(points[:, 0], points[:, 1], color=color_map(color_idx), **marker_style)\n",
"\n",
"# Axis cosmetics: large tick labels and a light dashed grid.\n",
"for set_ticks in (plt.xticks, plt.yticks):\n",
"    set_ticks(fontsize=24)\n",
"plt.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.6)\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABW0AAAKwCAYAAADjkF3MAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeZhU5Zn///c5ta/dXb1X702zNfuiyCbQoigGDbggJGgSNYmTZDLjTCaayTeJSWbiL8mYzJiJmWjcUFyRIEpEoLHZdxBoaOh9q16q1+rat/P7o6FCCypiIdD9vK4rV6q6Tp16zqlPn2pun7ofSVEUBUEQBEEQBEEQBEEQBEEQBOGKIF/uAQiCIAiCIAiCIAiCIAiCIAh/J4q2giAIgiAIgiAIgiAIgiAIVxBRtBUEQRAEQRAEQRAEQRAEQbiCiKKtIAiCIAiCIAiCIAiCIAjCFUQUbQVBEARBEARBEARBEARBEK4gomgrCIIgCIIgCIIgCIIgCIJwBRFFW0EQBEEQBEEQBEEQBEEQhCuIKNoKgiAIgiAIgiAIgiAIgiBcQUTRVhAEQRAEQRAEQRAEQRAE4QoiiraCIAiCIAiCIAiCIAiCIAhXkKuuaLtr1y7+4R/+gcmTJ2Oz2dBoNFitVoYPH87dd9/NqlWrCAQCF7w/RVFYs2YNd955J8OGDcNgMJCamsrUqVN57LHHaGhouIRHIwiCIAiCIAiCIAiCIAiCMJCkKIpyuQdxITo7O7n//vtZu3btp247bNgwXnjhBWbOnPmJ2zkcDlasWEFpaenHbmM2m3nyySf52te+9lmHHBONRnE4HFgsFiRJuuj9CIIgCIIgCIIgCIIgCIJw9VIUhb6+Pux2O7L88fNpr4qirc/nY8aMGRw+fDj2s9TUVCZNmkR2djZOp5Py8nJqampijxuNRkpLS5k2bdp59+lyuZg5cybHjh2L/ezaa69lzJgx9Pb2UlpaSk9PT+yxF154gXvvvfeixt/U1EROTs5FPVcQBEEQBEEQBEEQBEEQhMGlsbGR7Ozsj338qija/uxnP+Oxxx4DQJIkfvGLX/Dwww9jMBhi2yiKwmuvvca3v/1tent7ARg3bhxHjhw57z7vvfdeVq5cCYDNZuONN96gpKQk9rjH4+Fb3/oWL7/8MgBarZby8nKKioo+8/h7e3tJTEyksbERq9X6mZ9/tXO5XEPyuIX4E1kS4kVkSYgXkSUhXkSWhHgRWRLiRWRJiBeRJSFeBkuWXC4XOTk59PT0kJCQ8LHbqb/AMV20559/Pnb7H//xH/n3f//3c7aRJIl77rkHtVrNXXfdBcDRo0c5evQo48aNG7DtsWPHYsVYgFWrVg0o2AKYTCZefPFFamtr2blzJ8FgkJ/85CesWrXqM4//TEsEq9U6KML1WXV0dAzJ4xbiT2RJiBeRJSFeRJaEeBFZEuJFZEmIF5ElIV5EloR4GWxZ+rQWqlf8QmQul4v6+vrY/WXLln3i9l/+8pcxGo2x+6dOnTpnm6eeeopoNArAjTfeyIIFC867L1mW+fWvfx27//rrr9PR0fGZxi8IgiAIgiAIgiAIgiAIgvBZXPFFW7fbPeB+UlLSJ26vVqsHVN3PFGfPUBSFt99+O3b/61//+ifub+bMmbGWCJFIZMBzhQvzSf05BOGzEFkS4kVkSYgXkSUhXkSWhHgRWRLiRWRJiBeRJSFehlqWrviibWpqKnq9Pna/vLz8E7d3Op20t7fH7k+YMGHA45WVlTQ1NcXuz50791PHMG/evNjt0tLST91eGKirq+tyD0EYJESWhHgRWRLiRWRJiBeRJSFeRJaEeBFZEuJFZEmIl6GWpSu+aKvRaLjlllti93/5y1/i9Xo/dvsf/vCHsdm1N9xwAyNGjBjw+IkTJ2K3MzIyyMzM/NQxTJ48+bzPFy7MJ71fgvBZiCwJ8SKyJMSLyJIQLyJLQryILAnxIrIkxIvIkhAvQy1LV3zRFuA///
M/MZvNABw8eJDx48fzwgsvUFVVhd/vp7GxkXfffZfZs2fz3HPPAVBcXBy7fbaTJ0/Gbufl5V3Q6+fm5sZuV1RUfJ5DGZI0Gs3lHoIwSIgsCfEisiTEi8iSEC8iS0K8iCwJ8SKyJMSLyJIQL0MtS+rLPYALMWrUKHbs2MGiRYtoaGigurqar33ta+fdNjExkRUrVvAf//EfWCyWcx7v7OyM3U5PT7+g18/IyIjd9nq9BAIBdDrdx24fCAQIBAKx+y6X64JeZ7DKysq63EMQBgmRJSFeRJaEeBFZEuJFZEmIF5ElIV5EloR4EVkS4mWoZemqKNoCjB8/nlOnTvHMM8/wwx/+EI/Hc97tFixYwLJly85bsIWBC5sZDIYLeu2Pbud2uz+xaPurX/2Kxx577Jyf19bWYrFYyMvLo6WlhWAwiMFgICUlhcbGRgCSk5NRFCXWpyM3N5f29nb8fj86nY709HQaGhoAsNlsyLJMR0cH0N+QubOzE5/Ph1arxW63U1dXB/QXszUaDU6nE+gPek9PDx6PB7VaTU5ODrW1tQAkJCSg0+livYHtdjsulwu3241KpSIvL4/a2loURcFqtWI0GmltbQX6C9wej4e+vj5kWSY/P5+DBw9is9kwm81YLBZaWlqA/qK53++nt7cXgMLCQhoaGgiHw5hMJhITE2lubgYgLS2NYDBIT08PAPn5+TQ3NxMKhTAajdhstliv4pSUFCKRCN3d3UD/jOrW1lYCgQB6vZ7U1NQB5xv+XszPycnB6XTGzndGRgb19fVA/yJ4KpVqwPnu6urC6/Wi0WjIysoacL61Wm3sHH70fOfm5lJTUxM733q9nra2NgAyMzPp6+sbcL7r6uqIRqNYLBZMJtOA8+31enG5XEiSREFBAfX19UQiEcxmM1arFYfDETuHgUAgdr4LCgpobGw87/lOTU0lFAoNON8OhyOW2eTk5AHnOxqNDshsW1tb7HynpaUNyKwkSQPOd0dHRyyzmZmZn3i+jx8/jtVqRaPRkJ2dHcvsR8/3p2XWYDAMON9ut3tAZs8+32azeUBmfT7fBZ/vszNbUFBAU1PTBWd2KF0jzj6HX9Q1oqGhAaPRGDvf4hoxOK4RZ5/vL+oa0dnZiVarjWVWXCMGxzXicvwdEQwGkSTpvJkV14ir9xpxOf6OiEaj6PV6cY0YZNeIy/F3RDAYJCUlRVwjBtk14nL8HeH1esnLyxPXiEF2jbgcf0e4XC6Ki4uv+mvEmed+GklRFOWCtrzMOjo6+Ld/+zdeeuklQqEQGRkZzJgxg5SUFHp6etizZ0/s5AJ885vf5I9//CMqlWrAfu6//36effZZAFasWMGLL774qa9dU1PDsGHDYvcbGxs/ccW68820zcnJobe3F6vVesHHPFjU1NRQWFh4uYchDAIiS0K8iCwJ8SKyJMSLyJIQLyJLQryILAnxIrIkxMtgyZLL5SIhIeFT64RXxUzbyspKSkpKaGpqQqfT8Yc//IFvfetbqNV/H76iKLz66qt8+9vfxuVy8ec//xmVSsUf//jHAfvS6/Wx28Fg8IJe/+wCLHz6DF2dTveJM3GHmsTExMs9BGGQEFkS4kVkSYgXkSUhXkSWhHgRWRLiRWRJiBeRJSFehlqWrviFyMLhMEuWLIlNff7Tn/7Ed77znQEFWwBJkli2bBlvvvlm7GdPPfUUe/fuHbDdmQXNAHw+3wWN4aPbnb0P4dOd+dqoIHxeIktCvIgsCfEisiTEi8iSEC8iS0K8iCwJ8SKyJMTLUMvSFV+0Xb16NceOHQNg5MiR3HfffZ+4/Y033sj8+fNj95977rkBj5/pGQLEenZ8mjP9OgCMRqOYRfsZnenrIQifl8iSEC8iS0K8iCwJ8SKyJMSLyJIQLyJLQryILAnxMtSydMUXbd97773Y7Xnz5sUWaPgkJSUlsdv79+8f8NjIkSNjt8/ugftJzjQtBhg1atQFPUcQBEEQBEEQBEEQBEEQBOFiXPFF2zOrx8
HAWbKfJCUlJXb7zMp0Z4wePTp2u7W1dcAs2o9z8ODB8z5fuDBZWVmXewjCICGyJMSLyJIQLyJLQryILAnxIrIkxIv
"text/plain": [
"<Figure size 1400x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"# Read the rows stored in 'filtered_data_plot.csv'.\n",
"filtered_data = pd.read_csv('filtered_data_plot.csv')\n",
"\n",
"# Bag-of-words vectorization; token_pattern=r'\\S+' makes every run of\n",
"# non-whitespace characters a token.\n",
"vectorizer = CountVectorizer(token_pattern=r'\\S+')\n",
"bow_matrix = vectorizer.fit_transform(filtered_data['tag_description'])\n",
"\n",
"# Reduce the dense count matrix to two dimensions with t-SNE.\n",
"tsne = TSNE(n_components=2, random_state=42)\n",
"tsne_results = tsne.fit_transform(bow_matrix.toarray())\n",
"\n",
"# Prepare the plot: give every distinct pattern an index into the colormap.\n",
"unique_patterns = filtered_data['pattern'].unique()\n",
"color_map = plt.get_cmap('tab20', len(unique_patterns))\n",
"pattern_to_color = dict(zip(unique_patterns, range(len(unique_patterns))))\n",
"\n",
"plt.figure(figsize=(14, 7))\n",
"\n",
"# Draw each pattern separately so that it gets its own color.\n",
"marker_style = dict(marker='o', s=100, alpha=0.6, edgecolor='k', linewidth=1.2)\n",
"for pattern, color_idx in pattern_to_color.items():\n",
"    points = tsne_results[(filtered_data['pattern'] == pattern).to_numpy()]\n",
"    plt.scatter(points[:, 0], points[:, 1], color=color_map(color_idx), **marker_style)\n",
"\n",
"# Figure settings: large tick labels and a light dashed grid.\n",
"for set_ticks in (plt.xticks, plt.yticks):\n",
"    set_ticks(fontsize=24)\n",
"plt.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.6)\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABW0AAAKyCAYAAACuWPzHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxU9b3/8deZPZlksu97AiQECBAQNxZBLIorblAUK7ZqLdre6/3V2trb4m17q/aW1qvWtnpFBRHUiruoiOyyhjWEsGSd7AvJJJPZ5/z+SDMSASEqSab5PB8PH51Jzpz5npN3DzOf+c7nq6iqqiKEEEIIIYQQQgghhBBiUNAM9ACEEEIIIYQQQgghhBBCfEGKtkIIIYQQQgghhBBCCDGISNFWCCGEEEIIIYQQQgghBhEp2gohhBBCCCGEEEIIIcQgIkVbIYQQQgghhBBCCCGEGESkaCuEEEIIIYQQQgghhBCDiBRthRBCCCGEEEIIIYQQYhCRoq0QQgghhBBCCCGEEEIMIrqBHsBQ4Pf7qa2tJTw8HEVRBno4QgghhBBCCCGEEEKIAaCqKh0dHSQnJ6PRnHk+rRRt+0FtbS1paWkDPQwhhBBCCCGEEEIIIcQgUF1dTWpq6hl/L0XbfhAeHg50/zEsFssAj0b0qK2tJTk5eaCHIUSfSG5FMJLcimAkuRXBRjIrgpHkVgQjya34pmw2G2lpaYF64ZlI0bYf9LREsFgsUrQdRMxmM1qtdqCHIUSfSG5FMJLcimAkuRXBRjIrgpHkVgQjya34tpythaosRCaGrMrKyoEeghB9JrkVwUhyK4KR5FYEG8msCEaSWxGMJLeiv0jRVgghhBBCCCGEEEIIIQYRKdqKISsqKmqghyBEn0luRTCS3IpgJLkVwUYyK4KR5FYEI8mt6C9StBVDlvSgEcFIciuCkeRWBCPJrQg2klkRjCS3IhhJbkV/kaKtGLKam5sHeghC9JnkVgQjya0IRpJbEWwksyIYSW5FMJLciv4iRVshhBBCCCGEEEIIIYQYRKRoK4as1NTUgR6CEH0muRXBSHIrgpHkVgQbyawIRpJbEYwkt6K/SNFWDFmtra0DPQQh+kxyK4KR5FYEI8mtCDaSWRGMJLciGEluRX+Roq0Ysrq6ugZ6CEL0meRWBCPJrQhGklsRbCSzIhhJbkUwktyK/iJFWzFk6fX6gR6CEH0muRXBSHIrgpHkVgQbyawIRpJbEYwkt6K/KKqqqgM9iH91NpuNiIgI2tvbsVgsAz0c8U9+vx+NRj63EMFFciuCkeRWBCPJrQg2klkRjCS3IhhJbsU3da51QkmZGLIqKioGeghC9JnkVgQjya0IRpJbEWwksyIYSW5FMJLciv4iRVshhBBCCCGEEEIIIYQYRKRoK4asyMjIgR6CEH0muRXBSHIrgpHkVgQbyawIRpJbEYwkt6K/BF3R9vPPP+dHP/oRhYWFREdHo9frsVgsDB8+nFtvvZUVK1bgcrnOeX+qqrJ69WpuvvlmcnJyCAkJIS4ujokTJ/Loo49SVVV1Ho9GDCSDwTDQQxCizyS3IhhJbkUwktyKYCOZFcFIciuCkeRW9JegWYispaWF73//+7z99ttn3TYnJ4eXXnqJSy+99Cu3q62tZcGCBaxbt+6M24SFhfHUU09x55139nXIAbIQ2eBUVlZGdnb2QA9DiD6R3IpgJLkVwUhyK4KNZFYEI8mtCEaSW/FNnWudUNePY/raHA4HM2fOZO/evYGfxcXFMX78eFJTU2lqaqK4uJiysjIAjh8/zne+8x3WrVvHhRdeeNp92mw2Zs2axcGDBwM/mzRpEqNGjaK9vZ1169bR1tZGZ2cnCxcuRKPRcMcdd5zX4xRCCCGEEEIIIYQQQoigmGm7ePFiHn30UQAUReE3v/kNDz74ICEhIYFtVFVl1apV/PCHP6S9vR2AMWPGsH///tPu84477mDZsmUAREdH8/rrrzNjxozA7+
12O/feey+vvPIK0D39vbi4mGHDhvV5/DLTdnByuVwYjcaBHoYQfSK5FcFIciuCkeRWBBvJrAhGklsRjCS34ps61zphUPS0ffHFFwO3f/zjH/PII4/0KthCdzF33rx5PP/884GfHThwgAMHDpyyv4MHDwaKsQArVqzoVbAFMJvNvPzyy1xyySUAuN1ufvWrX30bhyMGiba2toEeghB9JrkVwUhyK86nQ4cO8ZOf/IR58+bxk5/8hEOHDn0r+5XcimAjmRXBSHIrgpHkVvSXQV+0tdlsVFZWBu5/97vf/crtb7jhBkJDQwP3jxw5cso2zz77LH6/H4ArrriCWbNmnXZfGo2GJ554InD/tddeo7m5uU/jF4OX3W4f6CEI0WeSWxGMJLfi23Jygfaiiy4iIiKCiRMn8sYbb7BlyxbeeOMNJk6cSExMDL/73e++0XNJbkWwkcyKYCS5FcFIciv6y6Av2nZ2dva6HxUV9ZXb63S6XlOLe4qzPVRV5Z133gncX7hw4Vfu79JLLw20RPD5fL0eK4KbThcULZ2F6EVyK4KR5FZ8U7/73e+IiYkJFGg3btxIRUUFBoOBYcOGcc011/CDH/yA22+/nRkzZhAfH89jjz1GeHj41559K7kVwUYyK4KR5FYEI8mt6C+DPmlxcXGYTCacTicAxcXFjBgx4ozbNzU10djYGLg/duzYXr8/evQoVqs1cP+yyy476ximT5/OsWPHAFi3bh133XVXXw5BDFLp6ekDPQQh+kxyK4KR5FZ8E7m5udTX15Oamkp2djZpaWn4/X50Oh3Nzc00Nzdz+PBhkpKSuOiii1AUJfChv81mY8WKFTz33HOEhITw4IMP8sgjj5zT80puRbCRzIpgJLkVwUhyK/rLoC/a6vV6rrrqKlavXg3Ab3/7W2bNmtWrBcLJfvaznwVm115++eWnFHhLSkoCtxMTE0lKSjrrGAoLC0/7eBHcysrKyM7OHuhhCNEnklsRjCS34uvKzc2lra2NSy65hFtuuYXExEQOHz5MeHg4eXl5aLVaKisree+993jnnXd45513SE5OJicnhyuvvJKGhgaamppobm6murqaJ554gpdffpnS0tKzPrfkVgQbyawIRpJbEYwkt6K/DPqiLcB///d/88knn9DZ2UlRUREFBQX853/+J5deeimpqak0NTWxf/9+HnvsMTZv3gxAfn4+S5cuPWVfJ79Iz8jIOKfnP/lTlMOHD3/DoxFCCCGEEGfzu9/9jvr6ei655BJ+8pOfEBkZSXFxMTqdjoyMDPR6PQBZWVlce+21tLS0UFlZyezZsxk3bhwajYYTJ05w4MABdDodkZGRvP766xQVFZGbm3tOhVshhBBCCCEGyqDvaQuQl5fHli1bAsXT48ePc+eddzJ8+HBCQkJIT0/nmmuuYfPmzURGRvLAAw+wbds20tLSTtlXS0tL4HZCQsI5PX9iYmLgdldXFy6X6xsekRgMIiIiBnoIQvSZ5FYEI8mt+DqWLFlCamoqt9xyC5GRkbjdbmw2G6GhoYSHhwe26+rq4sSJE1x22WVkZmZitVrRaLpf4kZGRhIeHk5nZydZWVn85Cc/obCwkPr6+rMuVCa5FcFGMiuCkeRWBCPJregvQVG0BSgoKODIkSM8/fTTmM3mM243a9Ysvvvd7/Z6MX+ykxc2CwkJOafn/vJ2X14c7ctcLhc2m63Xf2LwMZlMAz0EIfpMciuCkeRW9NWhQ4dwOBxkZ2eTl5cHQHt7O9C9KK2iKIFtm5ubge52VsnJyRQXFwfWQlAUhejo6ED/28jISG655RZSU1NZsmTJV45BciuCjWRWBCPJrQhGklvRX4KiPQJ0vyB/6KGHWL58OR6Ph8TERC655BJiY2Npa2tj+/btVFZWsmrVKlatWsU999zDX/7yF7Raba/99LyIBzAYDOf03Eajsdd9h8Pxldv//ve/59FHHz3l5+Xl5YSHh5ORkUFdXR1ut5uQkBBiY2Oprq4GICYmBl
VVaW1tBbpbMzQ2NuJ0OjEajSQkJFBVVQVAdHQ0Go0m8GYlNTWVlpYWHA4HBoOB5ORkKioqgO6ZJnq9nqamJgBSUlJ
"text/plain": [
"<Figure size 1400x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"from gensim.models import Word2Vec\n",
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"def compute_sentence_vector(sentence, model, vector_size):\n",
" words = sentence.split()\n",
" word_vectors = [model.wv[word] for word in words if word in model.wv]\n",
" return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(vector_size)\n",
"\n",
"# Read the rows stored in 'filtered_data_plot.csv'.\n",
"filtered_data = pd.read_csv('filtered_data_plot.csv')\n",
"\n",
"# Word2Vec is trained on token lists, so split every description on whitespace.\n",
"sentences = [desc.split() for desc in filtered_data['tag_description'].tolist()]\n",
"\n",
"vector_size = 200\n",
"window_size = 3\n",
"random_seed = 42\n",
"\n",
"# Train with an explicit seed and a single worker thread. With several workers the\n",
"# order of training updates depends on thread scheduling, so the embeddings (and the\n",
"# t-SNE figure built from them) would differ between runs.\n",
"# NOTE(review): the gensim docs also mention PYTHONHASHSEED for reproducibility across\n",
"# interpreter launches; it has to be set before the kernel starts - confirm if needed.\n",
"model = Word2Vec(sentences, vector_size=vector_size, window=window_size, min_count=1,\n",
"                 workers=1, seed=random_seed)\n",
"\n",
"# One averaged word vector per description (zeros when no token has a vector).\n",
"sentence_vectors = np.array([compute_sentence_vector(desc, model, vector_size) for desc in filtered_data['tag_description']])\n",
"\n",
"# Project the sentence vectors to 2-D; random_state fixes the t-SNE layout.\n",
"tsne = TSNE(n_components=2, random_state=random_seed)\n",
"tsne_results = tsne.fit_transform(sentence_vectors)\n",
"\n",
"# Give every distinct pattern its own index into the colormap.\n",
"unique_patterns = filtered_data['pattern'].unique()\n",
"color_map = plt.get_cmap('tab20', len(unique_patterns))\n",
"pattern_to_color = {pattern: idx for idx, pattern in enumerate(unique_patterns)}\n",
"\n",
"plt.figure(figsize=(14, 7))\n",
"\n",
"# One scatter call per pattern so that each pattern is drawn in its own color.\n",
"for pattern, color_idx in pattern_to_color.items():\n",
"    pattern_indices = filtered_data['pattern'] == pattern\n",
"    plt.scatter(tsne_results[pattern_indices, 0], tsne_results[pattern_indices, 1],\n",
"                color=color_map(color_idx), marker='o', s=100, alpha=0.6, edgecolor='k', linewidth=1.2)\n",
"\n",
"# Axis cosmetics: large tick labels and a light dashed grid.\n",
"plt.xticks(fontsize=24)\n",
"plt.yticks(fontsize=24)\n",
"plt.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.6)\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "torch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}