{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      " warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
"Epoch 1 completed. Loss: 5.446770191192627\n",
|
||
|
"Validation Accuracy after Epoch 1: 18.30%\n",
|
||
|
"Epoch 2 completed. Loss: 3.8084073066711426\n",
|
||
|
"Validation Accuracy after Epoch 2: 40.87%\n",
|
||
|
"Epoch 3 completed. Loss: 3.0630860328674316\n",
|
||
|
"Validation Accuracy after Epoch 3: 65.36%\n",
|
||
|
"Epoch 4 completed. Loss: 1.5352345705032349\n",
|
||
|
"Validation Accuracy after Epoch 4: 73.26%\n",
|
||
|
"Epoch 5 completed. Loss: 0.8989766836166382\n",
|
||
|
"Validation Accuracy after Epoch 5: 78.01%\n",
|
||
|
"Epoch 6 completed. Loss: 0.9589817523956299\n",
|
||
|
"Validation Accuracy after Epoch 6: 81.65%\n",
|
||
|
"Epoch 7 completed. Loss: 0.7892795205116272\n",
|
||
|
"Validation Accuracy after Epoch 7: 83.85%\n",
|
||
|
"Epoch 8 completed. Loss: 0.5069147944450378\n",
|
||
|
"Validation Accuracy after Epoch 8: 86.97%\n",
|
||
|
"Epoch 9 completed. Loss: 0.524911642074585\n",
|
||
|
"Validation Accuracy after Epoch 9: 88.12%\n",
|
||
|
"Epoch 10 completed. Loss: 0.2070937305688858\n",
|
||
|
"Validation Accuracy after Epoch 10: 89.94%\n",
|
||
|
"Epoch 11 completed. Loss: 0.19738677144050598\n",
|
||
|
"Validation Accuracy after Epoch 11: 90.75%\n",
|
||
|
"Epoch 12 completed. Loss: 0.13339389860630035\n",
|
||
|
"Validation Accuracy after Epoch 12: 91.90%\n",
|
||
|
"Epoch 13 completed. Loss: 0.21022899448871613\n",
|
||
|
"Validation Accuracy after Epoch 13: 92.86%\n",
|
||
|
"Epoch 14 completed. Loss: 0.26752030849456787\n",
|
||
|
"Validation Accuracy after Epoch 14: 93.24%\n",
|
||
|
"Epoch 15 completed. Loss: 0.14866866171360016\n",
|
||
|
"Validation Accuracy after Epoch 15: 93.68%\n",
|
||
|
"Epoch 16 completed. Loss: 0.08989054709672928\n",
|
||
|
"Validation Accuracy after Epoch 16: 94.06%\n",
|
||
|
"Epoch 17 completed. Loss: 0.037873975932598114\n",
|
||
|
"Validation Accuracy after Epoch 17: 94.59%\n",
|
||
|
"Epoch 18 completed. Loss: 0.07367080450057983\n",
|
||
|
"Validation Accuracy after Epoch 18: 94.68%\n",
|
||
|
"Epoch 19 completed. Loss: 0.04101959988474846\n",
|
||
|
"Validation Accuracy after Epoch 19: 94.83%\n",
|
||
|
"Epoch 20 completed. Loss: 0.21339105069637299\n",
|
||
|
"Validation Accuracy after Epoch 20: 95.02%\n",
|
||
|
"Epoch 21 completed. Loss: 0.06965143978595734\n",
|
||
|
"Validation Accuracy after Epoch 21: 94.97%\n",
|
||
|
"Epoch 22 completed. Loss: 0.06043635308742523\n",
|
||
|
"Validation Accuracy after Epoch 22: 95.02%\n",
|
||
|
"Epoch 23 completed. Loss: 0.021217377856373787\n",
|
||
|
"Validation Accuracy after Epoch 23: 94.92%\n",
|
||
|
"Epoch 24 completed. Loss: 0.037467293441295624\n",
|
||
|
"Validation Accuracy after Epoch 24: 95.02%\n",
|
||
|
"Epoch 25 completed. Loss: 0.016836028546094894\n",
|
||
|
"Validation Accuracy after Epoch 25: 95.02%\n",
|
||
|
"Epoch 26 completed. Loss: 0.028664518147706985\n",
|
||
|
"Validation Accuracy after Epoch 26: 95.11%\n",
|
||
|
"Epoch 27 completed. Loss: 0.011028420180082321\n",
|
||
|
"Validation Accuracy after Epoch 27: 95.16%\n",
|
||
|
"Epoch 28 completed. Loss: 0.04282907024025917\n",
|
||
|
"Validation Accuracy after Epoch 28: 95.16%\n",
|
||
|
"Epoch 29 completed. Loss: 0.00940023921430111\n",
|
||
|
"Validation Accuracy after Epoch 29: 95.35%\n",
|
||
|
"Epoch 30 completed. Loss: 0.13019809126853943\n",
|
||
|
"Validation Accuracy after Epoch 30: 95.35%\n",
|
||
|
"Epoch 31 completed. Loss: 0.01270432397723198\n",
|
||
|
"Validation Accuracy after Epoch 31: 95.11%\n",
|
||
|
"Epoch 32 completed. Loss: 0.012832771986722946\n",
|
||
|
"Validation Accuracy after Epoch 32: 95.16%\n",
|
||
|
"Epoch 33 completed. Loss: 0.012174545787274837\n",
|
||
|
"Validation Accuracy after Epoch 33: 95.16%\n",
|
||
|
"Epoch 34 completed. Loss: 0.02090534381568432\n",
|
||
|
"Validation Accuracy after Epoch 34: 95.02%\n",
|
||
|
"Epoch 35 completed. Loss: 0.017653826624155045\n",
|
||
|
"Validation Accuracy after Epoch 35: 94.49%\n",
|
||
|
"Epoch 36 completed. Loss: 0.02190311811864376\n",
|
||
|
"Validation Accuracy after Epoch 36: 94.59%\n",
|
||
|
"Epoch 37 completed. Loss: 0.048320867121219635\n",
|
||
|
"Validation Accuracy after Epoch 37: 94.68%\n",
|
||
|
"Epoch 38 completed. Loss: 0.015598177909851074\n",
|
||
|
"Validation Accuracy after Epoch 38: 95.30%\n",
|
||
|
"Epoch 39 completed. Loss: 0.009368035942316055\n",
|
||
|
"Validation Accuracy after Epoch 39: 94.83%\n",
|
||
|
"Epoch 40 completed. Loss: 0.009023590944707394\n",
|
||
|
"Validation Accuracy after Epoch 40: 95.02%\n",
|
||
|
"Epoch 41 completed. Loss: 0.040157418698072433\n",
|
||
|
"Validation Accuracy after Epoch 41: 95.11%\n",
|
||
|
"Epoch 42 completed. Loss: 0.11878462135791779\n",
|
||
|
"Validation Accuracy after Epoch 42: 95.06%\n",
|
||
|
"Epoch 43 completed. Loss: 0.021250683814287186\n",
|
||
|
"Validation Accuracy after Epoch 43: 95.16%\n",
|
||
|
"Epoch 44 completed. Loss: 0.0023518940433859825\n",
|
||
|
"Validation Accuracy after Epoch 44: 95.16%\n",
|
||
|
"Epoch 45 completed. Loss: 0.00595875782892108\n",
|
||
|
"Validation Accuracy after Epoch 45: 95.16%\n",
|
||
|
"Epoch 46 completed. Loss: 0.0025296895764768124\n",
|
||
|
"Validation Accuracy after Epoch 46: 94.97%\n",
|
||
|
"Epoch 47 completed. Loss: 0.0753568485379219\n",
|
||
|
"Validation Accuracy after Epoch 47: 95.26%\n",
|
||
|
"Epoch 48 completed. Loss: 0.002112493384629488\n",
|
||
|
"Validation Accuracy after Epoch 48: 95.06%\n",
|
||
|
"Epoch 49 completed. Loss: 0.09600060433149338\n",
|
||
|
"Validation Accuracy after Epoch 49: 95.06%\n",
|
||
|
"Epoch 50 completed. Loss: 0.002454130444675684\n",
|
||
|
"Validation Accuracy after Epoch 50: 95.21%\n",
|
||
|
"Accuracy (MDM=True) for Group 5: 91.98%\n",
|
||
|
"Results saved to 0.class_document/distilbert/5/test_p_c.csv\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW\n",
|
||
|
"from sklearn.preprocessing import LabelEncoder\n",
|
||
|
"import torch\n",
|
||
|
"from torch.utils.data import Dataset, DataLoader\n",
|
||
|
"import numpy as np\n",
|
||
|
"import torch.nn.functional as F\n",
|
||
|
"import os \n",
|
||
|
"\n",
|
||
|
"group_number = 5\n",
|
||
|
"train_path = f'../../data_preprocess/dataset/{group_number}/train.csv'\n",
|
||
|
"valid_path = f'../../data_preprocess/dataset/{group_number}/valid.csv'\n",
|
||
|
"test_path = f'../../translation/0.result/{group_number}/test_p.csv'\n",
|
||
|
"output_path = f'0.class_document/distilbert/{group_number}/test_p_c.csv'\n",
|
||
|
"\n",
|
||
|
"train_data = pd.read_csv(train_path)\n",
|
||
|
"valid_data = pd.read_csv(valid_path)\n",
|
||
|
"test_data = pd.read_csv(test_path)\n",
|
||
|
"\n",
|
||
|
"train_data['thing_property'] = train_data['thing'] + '_' + train_data['property']\n",
|
||
|
"valid_data['thing_property'] = valid_data['thing'] + '_' + valid_data['property']\n",
|
||
|
"test_data['thing_property'] = test_data['thing'] + '_' + test_data['property']\n",
|
||
|
"\n",
|
||
|
"tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
|
||
|
"label_encoder = LabelEncoder()\n",
|
||
|
"label_encoder.fit(train_data['thing_property'])\n",
|
||
|
"\n",
|
||
|
"valid_data['thing_property'] = valid_data['thing_property'].apply(\n",
|
||
|
" lambda x: x if x in label_encoder.classes_ else 'unknown_label')\n",
|
||
|
"test_data['thing_property'] = test_data['thing_property'].apply(\n",
|
||
|
" lambda x: x if x in label_encoder.classes_ else 'unknown_label')\n",
|
||
|
"\n",
|
||
|
"label_encoder.classes_ = np.append(label_encoder.classes_, 'unknown_label')\n",
|
||
|
"\n",
|
||
|
"train_data['label'] = label_encoder.transform(train_data['thing_property'])\n",
|
||
|
"valid_data['label'] = label_encoder.transform(valid_data['thing_property'])\n",
|
||
|
"test_data['label'] = label_encoder.transform(test_data['thing_property'])\n",
|
||
|
"\n",
|
||
|
"train_texts, train_labels = train_data['tag_description'], train_data['label']\n",
|
||
|
"valid_texts, valid_labels = valid_data['tag_description'], valid_data['label']\n",
|
||
|
"\n",
|
||
|
"train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, return_tensors='pt')\n",
|
||
|
"valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True, return_tensors='pt')\n",
|
||
|
"\n",
|
||
|
"train_labels = torch.tensor(train_labels.values)\n",
|
||
|
"valid_labels = torch.tensor(valid_labels.values)\n",
|
||
|
"\n",
|
||
|
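    "# Minimal Dataset wrapper: pairs each tokenized example with its integer label\n",
    "# so DataLoader can batch input_ids, attention_mask, and labels together.\n",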
"class CustomDataset(Dataset):\n",
|
||
|
" def __init__(self, encodings, labels):\n",
|
||
|
" self.encodings = encodings\n",
|
||
|
" self.labels = labels\n",
|
||
|
"\n",
|
||
|
" def __getitem__(self, idx):\n",
|
||
|
" item = {key: val[idx] for key, val in self.encodings.items()}\n",
|
||
|
" item['labels'] = self.labels[idx]\n",
|
||
|
" return item\n",
|
||
|
"\n",
|
||
|
" def __len__(self):\n",
|
||
|
" return len(self.labels)\n",
|
||
|
"\n",
|
||
|
"train_dataset = CustomDataset(train_encodings, train_labels)\n",
|
||
|
"valid_dataset = CustomDataset(valid_encodings, valid_labels)\n",
|
||
|
"\n",
|
||
|
"train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n",
|
||
|
"valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)\n",
|
||
|
"\n",
|
||
|
"model = DistilBertForSequenceClassification.from_pretrained(\n",
|
||
|
" 'distilbert-base-uncased', \n",
|
||
|
" num_labels=len(train_data['thing_property'].unique())\n",
|
||
|
")\n",
|
||
|
"optimizer = AdamW(model.parameters(), lr=5e-5)\n",
|
||
|
"\n",
|
||
|
"device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')\n",
|
||
|
"model.to(device)\n",
|
||
|
"\n",
|
||
|
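    "# Fine-tune end to end for 50 epochs. Note that the loss printed after each\n",
    "# epoch comes from that epoch's final batch only, which is why it fluctuates.\n",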
"epochs = 50\n",
|
||
|
"for epoch in range(epochs):\n",
|
||
|
" model.train()\n",
|
||
|
" for batch in train_loader:\n",
|
||
|
" optimizer.zero_grad()\n",
|
||
|
" input_ids = batch['input_ids'].to(device)\n",
|
||
|
" attention_mask = batch['attention_mask'].to(device)\n",
|
||
|
" labels = batch['labels'].to(device)\n",
|
||
|
" outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
|
||
|
" loss = outputs.loss\n",
|
||
|
" loss.backward()\n",
|
||
|
" optimizer.step()\n",
|
||
|
" print(f\"Epoch {epoch + 1} completed. Loss: {loss.item()}\")\n",
|
||
|
"\n",
|
||
|
" # 검증 루프\n",
|
||
|
" model.eval()\n",
|
||
|
" correct, total = 0, 0\n",
|
||
|
"\n",
|
||
|
" with torch.no_grad():\n",
|
||
|
" for batch in valid_loader:\n",
|
||
|
" input_ids = batch['input_ids'].to(device)\n",
|
||
|
" attention_mask = batch['attention_mask'].to(device)\n",
|
||
|
" labels = batch['labels'].to(device)\n",
|
||
|
" outputs = model(input_ids, attention_mask=attention_mask)\n",
|
||
|
" predictions = torch.argmax(outputs.logits, dim=-1)\n",
|
||
|
" correct += (predictions == labels).sum().item()\n",
|
||
|
" total += labels.size(0)\n",
|
||
|
"\n",
|
||
|
" valid_accuracy = correct / total\n",
|
||
|
" print(f'Validation Accuracy after Epoch {epoch + 1}: {valid_accuracy * 100:.2f}%')\n",
|
||
|
"\n",
|
||
|
"# Test 데이터 예측 및 c_thing, c_property 추가\n",
|
||
|
"test_encodings = tokenizer(list(test_data['tag_description']), truncation=True, padding=True, return_tensors='pt')\n",
|
||
|
"test_dataset = CustomDataset(test_encodings, torch.zeros(len(test_data))) # 레이블은 사용되지 않으므로 임시로 0을 사용\n",
|
||
|
"\n",
|
||
|
"test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)\n",
|
||
|
"\n",
|
||
|
"model.eval()\n",
|
||
|
"predicted_thing_properties = []\n",
|
||
|
"predicted_scores = []\n",
|
||
|
"\n",
|
||
|
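    "# Keep the argmax class per row plus its softmax probability as a confidence score.\n",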
"with torch.no_grad():\n",
|
||
|
" for batch in test_loader:\n",
|
||
|
" input_ids = batch['input_ids'].to(device)\n",
|
||
|
" attention_mask = batch['attention_mask'].to(device)\n",
|
||
|
" outputs = model(input_ids, attention_mask=attention_mask)\n",
|
||
|
" softmax_scores = F.softmax(outputs.logits, dim=-1)\n",
|
||
|
" predictions = torch.argmax(softmax_scores, dim=-1)\n",
|
||
|
" predicted_thing_properties.extend(predictions.cpu().numpy())\n",
|
||
|
" predicted_scores.extend(softmax_scores[range(len(predictions)), predictions].cpu().numpy())\n",
|
||
|
"\n",
|
||
|
"# 예측된 thing_property를 레이블 인코더로 디코딩\n",
|
||
|
"predicted_thing_property_labels = label_encoder.inverse_transform(predicted_thing_properties)\n",
|
||
|
"\n",
|
||
|
"# thing_property를 thing과 property로 나눔\n",
|
||
|
"test_data['c_thing'] = [x.split('_')[0] for x in predicted_thing_property_labels]\n",
|
||
|
"test_data['c_property'] = [x.split('_')[1] for x in predicted_thing_property_labels]\n",
|
||
|
"test_data['c_score'] = predicted_scores\n",
|
||
|
"\n",
|
||
|
"test_data['cthing_correct'] = test_data['thing'] == test_data['c_thing']\n",
|
||
|
"test_data['cproperty_correct'] = test_data['property'] == test_data['c_property']\n",
|
||
|
"test_data['ctp_correct'] = test_data['cthing_correct'] & test_data['cproperty_correct']\n",
|
||
|
"\n",
|
||
|
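    "# Accuracy counts fully correct (thing, property) pairs against the number of MDM == True rows.\n",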
"mdm_true_count = len(test_data[test_data['MDM'] == True])\n",
|
||
|
"accuracy = (test_data['ctp_correct'].sum() / mdm_true_count) * 100\n",
|
||
|
"\n",
|
||
|
"print(f\"Accuracy (MDM=True) for Group {group_number}: {accuracy:.2f}%\")\n",
|
||
|
"\n",
|
||
|
"# 결과를 저장하기 전에 폴더가 존재하는지 확인하고, 없으면 생성\n",
|
||
|
"os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
|
||
|
"\n",
|
||
|
"test_data.to_csv(output_path, index=False)\n",
|
||
|
"print(f'Results saved to {output_path}')"
|
||
|
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Validation Accuracy: 95.21%\n",
      "Accuracy (MDM=True) for Group 5: 91.98%\n",
      "Results saved to 0.class_document/distilbert/5/test_p_c.csv\n"
     ]
    }
   ],
   "source": [
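    "# Standalone evaluation: re-runs validation and test prediction with the model\n",
    "# and loaders already in memory from the training cell above.\n",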
"# 검증 루프\n",
|
||
|
"model.eval()\n",
|
||
|
"correct, total = 0, 0\n",
|
||
|
"\n",
|
||
|
"with torch.no_grad():\n",
|
||
|
" for batch in valid_loader:\n",
|
||
|
" input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)\n",
|
||
|
" outputs = model(input_ids, attention_mask=attention_mask)\n",
|
||
|
" predictions = torch.argmax(outputs.logits, dim=-1)\n",
|
||
|
" correct += (predictions == labels).sum().item()\n",
|
||
|
" total += labels.size(0)\n",
|
||
|
"\n",
|
||
|
"valid_accuracy = correct / total\n",
|
||
|
"print(f'Validation Accuracy: {valid_accuracy * 100:.2f}%')\n",
|
||
|
"\n",
|
||
|
"# Test 데이터 예측 및 c_thing, c_property 추가\n",
|
||
|
"test_encodings = tokenizer(list(test_data['tag_description']), truncation=True, padding=True, return_tensors='pt')\n",
|
||
|
"test_dataset = CustomDataset(test_encodings, torch.zeros(len(test_data))) # 레이블은 사용되지 않으므로 임시로 0을 사용\n",
|
||
|
"\n",
|
||
|
"test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)\n",
|
||
|
"\n",
|
||
|
"model.eval()\n",
|
||
|
"predicted_thing_properties = []\n",
|
||
|
"predicted_scores = []\n",
|
||
|
"\n",
|
||
|
"with torch.no_grad():\n",
|
||
|
" for batch in test_loader:\n",
|
||
|
" input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)\n",
|
||
|
" outputs = model(input_ids, attention_mask=attention_mask)\n",
|
||
|
" softmax_scores = F.softmax(outputs.logits, dim=-1)\n",
|
||
|
" predictions = torch.argmax(softmax_scores, dim=-1)\n",
|
||
|
" predicted_thing_properties.extend(predictions.cpu().numpy())\n",
|
||
|
" predicted_scores.extend(softmax_scores[range(len(predictions)), predictions].cpu().numpy())\n",
|
||
|
"\n",
|
||
|
"# 예측된 thing_property를 레이블 인코더로 디코딩\n",
|
||
|
"predicted_thing_property_labels = label_encoder.inverse_transform(predicted_thing_properties)\n",
|
||
|
"\n",
|
||
|
"# thing_property를 thing과 property로 나눔\n",
|
||
|
"test_data['c_thing'] = [x.split('_')[0] for x in predicted_thing_property_labels]\n",
|
||
|
"test_data['c_property'] = [x.split('_')[1] for x in predicted_thing_property_labels]\n",
|
||
|
"test_data['c_score'] = predicted_scores\n",
|
||
|
"\n",
|
||
|
"test_data['cthing_correct'] = test_data['thing'] == test_data['c_thing']\n",
|
||
|
"test_data['cproperty_correct'] = test_data['property'] == test_data['c_property']\n",
|
||
|
"test_data['ctp_correct'] = test_data['cthing_correct'] & test_data['cproperty_correct']\n",
|
||
|
"\n",
|
||
|
"mdm_true_count = len(test_data[test_data['MDM'] == True])\n",
|
||
|
"accuracy = (test_data['ctp_correct'].sum() / mdm_true_count) * 100\n",
|
||
|
"\n",
|
||
|
"print(f\"Accuracy (MDM=True) for Group {group_number}: {accuracy:.2f}%\")\n",
|
||
|
"\n",
|
||
|
"os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
|
||
|
"\n",
|
||
|
"test_data.to_csv(output_path, index=False)\n",
|
||
|
"print(f'Results saved to {output_path}')\n"
|
||
|
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'pd' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[3], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpyplot\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mplt\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# Read 'filtered_data_plot.csv'\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m filtered_data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfiltered_data_plot.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m# Tokenize the data\u001b[39;00m\n\u001b[1;32m 8\u001b[0m filtered_encodings \u001b[38;5;241m=\u001b[39m tokenizer(\u001b[38;5;28mlist\u001b[39m(filtered_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtag_description\u001b[39m\u001b[38;5;124m'\u001b[39m]), truncation\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, padding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
      "\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
     ]
    }
   ],
   "source": [
"from sklearn.manifold import TSNE\n",
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"\n",
|
||
|
"# 'filtered_data_plot.csv' 읽기\n",
|
||
|
"filtered_data = pd.read_csv('filtered_data_plot.csv')\n",
|
||
|
"\n",
|
||
|
"# 데이터 토큰화\n",
|
||
|
"filtered_encodings = tokenizer(list(filtered_data['tag_description']), truncation=True, padding=True, return_tensors='pt')\n",
|
||
|
"\n",
|
||
|
"# BERT 임베딩 계산 함수\n",
|
||
|
"def get_bert_embeddings(model, encodings, device):\n",
|
||
|
" model.eval()\n",
|
||
|
" with torch.no_grad():\n",
|
||
|
" input_ids = encodings['input_ids'].to(device)\n",
|
||
|
" attention_mask = encodings['attention_mask'].to(device)\n",
|
||
|
" outputs = model.bert(input_ids=input_ids, attention_mask=attention_mask)\n",
|
||
|
" return outputs.last_hidden_state.mean(dim=1).cpu().numpy() # 각 문장의 평균 임베딩 추출\n",
|
||
|
"\n",
|
||
|
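    "# Visualization: mean-pooled DistilBERT embeddings reduced to 2-D with t-SNE,\n",
    "# colored by the 'pattern' column.\n",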
"# BERT 모델로 임베딩 계산\n",
|
||
|
"bert_embeddings = get_bert_embeddings(model, filtered_encodings, device)\n",
|
||
|
"\n",
|
||
|
"# t-SNE 차원 축소\n",
|
||
|
"tsne = TSNE(n_components=2, random_state=42)\n",
|
||
|
"tsne_results = tsne.fit_transform(bert_embeddings)\n",
|
||
|
"\n",
|
||
|
"# 시각화를 위한 준비\n",
|
||
|
"unique_patterns = filtered_data['pattern'].unique()\n",
|
||
|
"color_map = plt.get_cmap('tab20', len(unique_patterns))\n",
|
||
|
"pattern_to_color = {pattern: idx for idx, pattern in enumerate(unique_patterns)}\n",
|
||
|
"\n",
|
||
|
"plt.figure(figsize=(14, 7))\n",
|
||
|
"\n",
|
||
|
"# 각 패턴별로 시각화\n",
|
||
|
"for pattern, color_idx in pattern_to_color.items():\n",
|
||
|
" pattern_indices = filtered_data['pattern'] == pattern\n",
|
||
|
" plt.scatter(tsne_results[pattern_indices, 0], tsne_results[pattern_indices, 1], \n",
|
||
|
" color=color_map(color_idx), marker='o', s=100, alpha=0.6, edgecolor='k', linewidth=1.2)\n",
|
||
|
"\n",
|
||
|
"# 그래프 설정\n",
|
||
|
"plt.xticks(fontsize=24)\n",
|
||
|
"plt.yticks(fontsize=24)\n",
|
||
|
"plt.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.6)\n",
|
||
|
"plt.tight_layout()\n",
|
||
|
"plt.show()\n"
|
||
|
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}