hipom_data_mapping/post_process/tfidf_class/2a.classfiler_albert.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1 completed. Loss: 5.564172267913818\n",
"Epoch 2 completed. Loss: 4.88321590423584\n",
"Epoch 3 completed. Loss: 3.5059947967529297\n",
"Epoch 4 completed. Loss: 3.18548583984375\n",
"Epoch 5 completed. Loss: 2.8037068843841553\n",
"Epoch 6 completed. Loss: 2.2223541736602783\n",
"Epoch 7 completed. Loss: 1.8634291887283325\n",
"Epoch 8 completed. Loss: 1.3251842260360718\n",
"Epoch 9 completed. Loss: 0.6083177328109741\n",
"Epoch 10 completed. Loss: 0.9423710703849792\n",
"Epoch 11 completed. Loss: 0.5799884796142578\n",
"Epoch 12 completed. Loss: 0.6948736310005188\n",
"Epoch 13 completed. Loss: 0.5177479386329651\n",
"Epoch 14 completed. Loss: 0.47343072295188904\n",
"Epoch 15 completed. Loss: 0.26853761076927185\n",
"Epoch 16 completed. Loss: 0.19693760573863983\n",
"Epoch 17 completed. Loss: 0.3199688494205475\n",
"Epoch 18 completed. Loss: 0.23672448098659515\n",
"Epoch 19 completed. Loss: 0.40235987305641174\n",
"Epoch 20 completed. Loss: 0.28102293610572815\n",
"Epoch 21 completed. Loss: 0.17575399577617645\n",
"Epoch 22 completed. Loss: 0.24652625620365143\n",
"Epoch 23 completed. Loss: 0.109055295586586\n",
"Epoch 24 completed. Loss: 0.19015412032604218\n",
"Epoch 25 completed. Loss: 0.10130400210618973\n",
"Epoch 26 completed. Loss: 0.14203257858753204\n",
"Epoch 27 completed. Loss: 0.1248723715543747\n",
"Epoch 28 completed. Loss: 0.05851107835769653\n",
"Epoch 29 completed. Loss: 0.041425254195928574\n",
"Epoch 30 completed. Loss: 0.0353962741792202\n",
"Epoch 31 completed. Loss: 0.04445452615618706\n",
"Epoch 32 completed. Loss: 0.026403019204735756\n",
"Epoch 33 completed. Loss: 0.028079884126782417\n",
"Epoch 34 completed. Loss: 0.059587348252534866\n",
"Epoch 35 completed. Loss: 0.02851276472210884\n",
"Epoch 36 completed. Loss: 0.09271513670682907\n",
"Epoch 37 completed. Loss: 0.06418397277593613\n",
"Epoch 38 completed. Loss: 0.03638231381773949\n",
"Epoch 39 completed. Loss: 0.022959664463996887\n",
"Epoch 40 completed. Loss: 0.044602662324905396\n",
"Epoch 41 completed. Loss: 0.03491249307990074\n",
"Epoch 42 completed. Loss: 0.039797600358724594\n",
"Epoch 43 completed. Loss: 0.04217083007097244\n",
"Epoch 44 completed. Loss: 0.4122176170349121\n",
"Epoch 45 completed. Loss: 0.1664775162935257\n",
"Epoch 46 completed. Loss: 0.04505300521850586\n",
"Epoch 47 completed. Loss: 0.14913827180862427\n",
"Epoch 48 completed. Loss: 0.016096509993076324\n",
"Epoch 49 completed. Loss: 0.05338064581155777\n",
"Epoch 50 completed. Loss: 0.10259533673524857\n",
"Epoch 51 completed. Loss: 0.008849691599607468\n",
"Epoch 52 completed. Loss: 0.028069255873560905\n",
"Epoch 53 completed. Loss: 0.008924427442252636\n",
"Epoch 54 completed. Loss: 0.015527592971920967\n",
"Epoch 55 completed. Loss: 0.009189464151859283\n",
"Epoch 56 completed. Loss: 0.007252057082951069\n",
"Epoch 57 completed. Loss: 0.01684846170246601\n",
"Epoch 58 completed. Loss: 0.010840333066880703\n",
"Epoch 59 completed. Loss: 0.05179211124777794\n",
"Epoch 60 completed. Loss: 0.007003726437687874\n",
"Epoch 61 completed. Loss: 0.00555015355348587\n",
"Epoch 62 completed. Loss: 0.0065276664681732655\n",
"Epoch 63 completed. Loss: 0.007942711934447289\n",
"Epoch 64 completed. Loss: 0.00675524678081274\n",
"Epoch 65 completed. Loss: 0.010359193198382854\n",
"Epoch 66 completed. Loss: 0.00662408908829093\n",
"Epoch 67 completed. Loss: 0.007672889623790979\n",
"Epoch 68 completed. Loss: 0.004661311395466328\n",
"Epoch 69 completed. Loss: 0.014480670914053917\n",
"Epoch 70 completed. Loss: 0.05042335391044617\n",
"Epoch 71 completed. Loss: 0.035947512835264206\n",
"Epoch 72 completed. Loss: 0.01213429868221283\n",
"Epoch 73 completed. Loss: 0.033572785556316376\n",
"Epoch 74 completed. Loss: 0.009208262898027897\n",
"Epoch 75 completed. Loss: 0.08961852639913559\n",
"Epoch 76 completed. Loss: 4.632999897003174\n",
"Epoch 77 completed. Loss: 5.957398891448975\n",
"Epoch 78 completed. Loss: 5.970841407775879\n",
"Epoch 79 completed. Loss: 5.905709266662598\n",
"Epoch 80 completed. Loss: 5.864459037780762\n",
"Validation Accuracy: 0.14%\n",
"Accuracy (MDM=True) for Group 4: 0.48%\n",
"Results saved to 0.class_document/albert/4/test_p_c.csv\n"
]
}
],
"source": [
"import pandas as pd\n",
"from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW\n",
"from sklearn.preprocessing import LabelEncoder\n",
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"import numpy as np\n",
"import torch.nn.functional as F\n",
"import os \n",
"\n",
"\n",
"group_number = 4\n",
"train_path = f'../../data_preprocess/dataset/{group_number}/train.csv'\n",
"valid_path = f'../../data_preprocess/dataset/{group_number}/valid.csv'\n",
"test_path = f'../../translation/0.result/{group_number}/test_p.csv'\n",
"output_path = f'0.class_document/albert/{group_number}/test_p_c.csv'\n",
"\n",
"train_data = pd.read_csv(train_path)\n",
"valid_data = pd.read_csv(valid_path)\n",
"test_data = pd.read_csv(test_path)\n",
"\n",
"train_data['thing_property'] = train_data['thing'] + '_' + train_data['property']\n",
"valid_data['thing_property'] = valid_data['thing'] + '_' + valid_data['property']\n",
"test_data['thing_property'] = test_data['thing'] + '_' + test_data['property']\n",
"\n",
"tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')\n",
"label_encoder = LabelEncoder()\n",
"label_encoder.fit(train_data['thing_property'])\n",
"\n",
"valid_data['thing_property'] = valid_data['thing_property'].apply(\n",
" lambda x: x if x in label_encoder.classes_ else 'unknown_label')\n",
"test_data['thing_property'] = test_data['thing_property'].apply(\n",
" lambda x: x if x in label_encoder.classes_ else 'unknown_label')\n",
"\n",
"label_encoder.classes_ = np.append(label_encoder.classes_, 'unknown_label')\n",
"\n",
"train_data['label'] = label_encoder.transform(train_data['thing_property'])\n",
"valid_data['label'] = label_encoder.transform(valid_data['thing_property'])\n",
"test_data['label'] = label_encoder.transform(test_data['thing_property'])\n",
"\n",
"train_texts, train_labels = train_data['tag_description'], train_data['label']\n",
"valid_texts, valid_labels = valid_data['tag_description'], valid_data['label']\n",
"\n",
"train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, return_tensors='pt')\n",
"valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True, return_tensors='pt')\n",
"\n",
"train_labels = torch.tensor(train_labels.values)\n",
"valid_labels = torch.tensor(valid_labels.values)\n",
"\n",
"class CustomDataset(Dataset):\n",
" def __init__(self, encodings, labels):\n",
" self.encodings = encodings\n",
" self.labels = labels\n",
"\n",
" def __getitem__(self, idx):\n",
" item = {key: val[idx] for key, val in self.encodings.items()}\n",
" item['labels'] = self.labels[idx]\n",
" return item\n",
"\n",
" def __len__(self):\n",
" return len(self.labels)\n",
"\n",
"train_dataset = CustomDataset(train_encodings, train_labels)\n",
"valid_dataset = CustomDataset(valid_encodings, valid_labels)\n",
"\n",
"train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n",
"valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)\n",
"\n",
"model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(train_data['thing_property'].unique()))\n",
"optimizer = AdamW(model.parameters(), lr=5e-5)\n",
"\n",
"device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
"model.to(device)\n",
"\n",
"epochs = 80\n",
"for epoch in range(epochs):\n",
" model.train()\n",
" for batch in train_loader:\n",
" optimizer.zero_grad()\n",
" input_ids = batch['input_ids'].to(device)\n",
" attention_mask = batch['attention_mask'].to(device)\n",
" labels = batch['labels'].to(device)\n",
" outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
" loss = outputs.loss\n",
" loss.backward()\n",
" optimizer.step()\n",
" print(f\"Epoch {epoch + 1} completed. Loss: {loss.item()}\")\n",
"\n",
"model.eval()\n",
"correct, total = 0, 0\n",
"\n",
"with torch.no_grad():\n",
" for batch in valid_loader:\n",
" input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)\n",
" outputs = model(input_ids, attention_mask=attention_mask)\n",
" predictions = torch.argmax(outputs.logits, dim=-1)\n",
" correct += (predictions == labels).sum().item()\n",
" total += labels.size(0)\n",
"\n",
"valid_accuracy = correct / total\n",
"print(f'Validation Accuracy: {valid_accuracy * 100:.2f}%')\n",
"\n",
"# Test 데이터 예측 및 c_thing, c_property 추가\n",
"test_encodings = tokenizer(list(test_data['tag_description']), truncation=True, padding=True, return_tensors='pt')\n",
"test_dataset = CustomDataset(test_encodings, torch.zeros(len(test_data))) # 레이블은 사용되지 않으므로 임시로 0을 사용\n",
"\n",
"test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)\n",
"\n",
"model.eval()\n",
"predicted_thing_properties = []\n",
"predicted_scores = []\n",
"\n",
"with torch.no_grad():\n",
" for batch in test_loader:\n",
" input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)\n",
" outputs = model(input_ids, attention_mask=attention_mask)\n",
" softmax_scores = F.softmax(outputs.logits, dim=-1)\n",
" predictions = torch.argmax(softmax_scores, dim=-1)\n",
" predicted_thing_properties.extend(predictions.cpu().numpy())\n",
" predicted_scores.extend(softmax_scores[range(len(predictions)), predictions].cpu().numpy())\n",
"\n",
"# 예측된 thing_property를 레이블 인코더로 디코딩\n",
"predicted_thing_property_labels = label_encoder.inverse_transform(predicted_thing_properties)\n",
"\n",
"# thing_property를 thing과 property로 나눔\n",
"test_data['c_thing'] = [x.split('_')[0] for x in predicted_thing_property_labels]\n",
"test_data['c_property'] = [x.split('_')[1] for x in predicted_thing_property_labels]\n",
"test_data['c_score'] = predicted_scores\n",
"\n",
"test_data['cthing_correct'] = test_data['thing'] == test_data['c_thing']\n",
"test_data['cproperty_correct'] = test_data['property'] == test_data['c_property']\n",
"test_data['ctp_correct'] = test_data['cthing_correct'] & test_data['cproperty_correct']\n",
"\n",
"mdm_true_count = len(test_data[test_data['MDM'] == True])\n",
"accuracy = (test_data['ctp_correct'].sum() / mdm_true_count) * 100\n",
"\n",
"print(f\"Accuracy (MDM=True) for Group {group_number}: {accuracy:.2f}%\")\n",
"\n",
"os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
"\n",
"test_data.to_csv(output_path, index=False)\n",
"print(f'Results saved to {output_path}')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'AlbertForSequenceClassification' object has no attribute 'bert'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[29], line 20\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\u001b[38;5;241m.\u001b[39mlast_hidden_state\u001b[38;5;241m.\u001b[39mmean(dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\u001b[38;5;241m.\u001b[39mcpu()\u001b[38;5;241m.\u001b[39mnumpy() \u001b[38;5;66;03m# 각 문장의 평균 임베딩 추출\u001b[39;00m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# BERT 모델로 임베딩 계산\u001b[39;00m\n\u001b[0;32m---> 20\u001b[0m bert_embeddings \u001b[38;5;241m=\u001b[39m \u001b[43mget_bert_embeddings\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfiltered_encodings\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;66;03m# t-SNE 차원 축소\u001b[39;00m\n\u001b[1;32m 23\u001b[0m tsne \u001b[38;5;241m=\u001b[39m TSNE(n_components\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m)\n",
"Cell \u001b[0;32mIn[29], line 16\u001b[0m, in \u001b[0;36mget_bert_embeddings\u001b[0;34m(model, encodings, device)\u001b[0m\n\u001b[1;32m 14\u001b[0m input_ids \u001b[38;5;241m=\u001b[39m encodings[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[1;32m 15\u001b[0m attention_mask \u001b[38;5;241m=\u001b[39m encodings[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[0;32m---> 16\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbert\u001b[49m(input_ids\u001b[38;5;241m=\u001b[39minput_ids, attention_mask\u001b[38;5;241m=\u001b[39mattention_mask)\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\u001b[38;5;241m.\u001b[39mlast_hidden_state\u001b[38;5;241m.\u001b[39mmean(dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\u001b[38;5;241m.\u001b[39mcpu()\u001b[38;5;241m.\u001b[39mnumpy()\n",
"File \u001b[0;32m~/anaconda3/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py:1709\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 1707\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m 1708\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1709\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[0;31mAttributeError\u001b[0m: 'AlbertForSequenceClassification' object has no attribute 'bert'"
]
}
],
"source": [
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# 'filtered_data_plot.csv' 읽기\n",
"filtered_data = pd.read_csv('filtered_data_plot.csv')\n",
"\n",
"# 데이터 토큰화\n",
"filtered_encodings = tokenizer(list(filtered_data['tag_description']), truncation=True, padding=True, return_tensors='pt')\n",
"\n",
"# BERT 임베딩 계산 함수\n",
"def get_bert_embeddings(model, encodings, device):\n",
" model.eval()\n",
" with torch.no_grad():\n",
" input_ids = encodings['input_ids'].to(device)\n",
" attention_mask = encodings['attention_mask'].to(device)\n",
" outputs = model.bert(input_ids=input_ids, attention_mask=attention_mask)\n",
" return outputs.last_hidden_state.mean(dim=1).cpu().numpy() # 각 문장의 평균 임베딩 추출\n",
"\n",
"# BERT 모델로 임베딩 계산\n",
"bert_embeddings = get_bert_embeddings(model, filtered_encodings, device)\n",
"\n",
"# t-SNE 차원 축소\n",
"tsne = TSNE(n_components=2, random_state=42)\n",
"tsne_results = tsne.fit_transform(bert_embeddings)\n",
"\n",
"# 시각화를 위한 준비\n",
"unique_patterns = filtered_data['pattern'].unique()\n",
"color_map = plt.get_cmap('tab20', len(unique_patterns))\n",
"pattern_to_color = {pattern: idx for idx, pattern in enumerate(unique_patterns)}\n",
"\n",
"plt.figure(figsize=(14, 7))\n",
"\n",
"# 각 패턴별로 시각화\n",
"for pattern, color_idx in pattern_to_color.items():\n",
" pattern_indices = filtered_data['pattern'] == pattern\n",
" plt.scatter(tsne_results[pattern_indices, 0], tsne_results[pattern_indices, 1], \n",
" color=color_map(color_idx), marker='o', s=100, alpha=0.6, edgecolor='k', linewidth=1.2)\n",
"\n",
"# 그래프 설정\n",
"plt.xticks(fontsize=24)\n",
"plt.yticks(fontsize=24)\n",
"plt.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.6)\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "torch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}