hipom_data_mapping/post_process/tfidf_class/2a.classifier_bertdistil.ipynb

438 lines
21 KiB
Plaintext
Raw Normal View History

2024-09-25 08:52:30 +09:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"/home/hwang/anaconda3/envs/torch/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1 completed. Loss: 5.446770191192627\n",
"Validation Accuracy after Epoch 1: 18.30%\n",
"Epoch 2 completed. Loss: 3.8084073066711426\n",
"Validation Accuracy after Epoch 2: 40.87%\n",
"Epoch 3 completed. Loss: 3.0630860328674316\n",
"Validation Accuracy after Epoch 3: 65.36%\n",
"Epoch 4 completed. Loss: 1.5352345705032349\n",
"Validation Accuracy after Epoch 4: 73.26%\n",
"Epoch 5 completed. Loss: 0.8989766836166382\n",
"Validation Accuracy after Epoch 5: 78.01%\n",
"Epoch 6 completed. Loss: 0.9589817523956299\n",
"Validation Accuracy after Epoch 6: 81.65%\n",
"Epoch 7 completed. Loss: 0.7892795205116272\n",
"Validation Accuracy after Epoch 7: 83.85%\n",
"Epoch 8 completed. Loss: 0.5069147944450378\n",
"Validation Accuracy after Epoch 8: 86.97%\n",
"Epoch 9 completed. Loss: 0.524911642074585\n",
"Validation Accuracy after Epoch 9: 88.12%\n",
"Epoch 10 completed. Loss: 0.2070937305688858\n",
"Validation Accuracy after Epoch 10: 89.94%\n",
"Epoch 11 completed. Loss: 0.19738677144050598\n",
"Validation Accuracy after Epoch 11: 90.75%\n",
"Epoch 12 completed. Loss: 0.13339389860630035\n",
"Validation Accuracy after Epoch 12: 91.90%\n",
"Epoch 13 completed. Loss: 0.21022899448871613\n",
"Validation Accuracy after Epoch 13: 92.86%\n",
"Epoch 14 completed. Loss: 0.26752030849456787\n",
"Validation Accuracy after Epoch 14: 93.24%\n",
"Epoch 15 completed. Loss: 0.14866866171360016\n",
"Validation Accuracy after Epoch 15: 93.68%\n",
"Epoch 16 completed. Loss: 0.08989054709672928\n",
"Validation Accuracy after Epoch 16: 94.06%\n",
"Epoch 17 completed. Loss: 0.037873975932598114\n",
"Validation Accuracy after Epoch 17: 94.59%\n",
"Epoch 18 completed. Loss: 0.07367080450057983\n",
"Validation Accuracy after Epoch 18: 94.68%\n",
"Epoch 19 completed. Loss: 0.04101959988474846\n",
"Validation Accuracy after Epoch 19: 94.83%\n",
"Epoch 20 completed. Loss: 0.21339105069637299\n",
"Validation Accuracy after Epoch 20: 95.02%\n",
"Epoch 21 completed. Loss: 0.06965143978595734\n",
"Validation Accuracy after Epoch 21: 94.97%\n",
"Epoch 22 completed. Loss: 0.06043635308742523\n",
"Validation Accuracy after Epoch 22: 95.02%\n",
"Epoch 23 completed. Loss: 0.021217377856373787\n",
"Validation Accuracy after Epoch 23: 94.92%\n",
"Epoch 24 completed. Loss: 0.037467293441295624\n",
"Validation Accuracy after Epoch 24: 95.02%\n",
"Epoch 25 completed. Loss: 0.016836028546094894\n",
"Validation Accuracy after Epoch 25: 95.02%\n",
"Epoch 26 completed. Loss: 0.028664518147706985\n",
"Validation Accuracy after Epoch 26: 95.11%\n",
"Epoch 27 completed. Loss: 0.011028420180082321\n",
"Validation Accuracy after Epoch 27: 95.16%\n",
"Epoch 28 completed. Loss: 0.04282907024025917\n",
"Validation Accuracy after Epoch 28: 95.16%\n",
"Epoch 29 completed. Loss: 0.00940023921430111\n",
"Validation Accuracy after Epoch 29: 95.35%\n",
"Epoch 30 completed. Loss: 0.13019809126853943\n",
"Validation Accuracy after Epoch 30: 95.35%\n",
"Epoch 31 completed. Loss: 0.01270432397723198\n",
"Validation Accuracy after Epoch 31: 95.11%\n",
"Epoch 32 completed. Loss: 0.012832771986722946\n",
"Validation Accuracy after Epoch 32: 95.16%\n",
"Epoch 33 completed. Loss: 0.012174545787274837\n",
"Validation Accuracy after Epoch 33: 95.16%\n",
"Epoch 34 completed. Loss: 0.02090534381568432\n",
"Validation Accuracy after Epoch 34: 95.02%\n",
"Epoch 35 completed. Loss: 0.017653826624155045\n",
"Validation Accuracy after Epoch 35: 94.49%\n",
"Epoch 36 completed. Loss: 0.02190311811864376\n",
"Validation Accuracy after Epoch 36: 94.59%\n",
"Epoch 37 completed. Loss: 0.048320867121219635\n",
"Validation Accuracy after Epoch 37: 94.68%\n",
"Epoch 38 completed. Loss: 0.015598177909851074\n",
"Validation Accuracy after Epoch 38: 95.30%\n",
"Epoch 39 completed. Loss: 0.009368035942316055\n",
"Validation Accuracy after Epoch 39: 94.83%\n",
"Epoch 40 completed. Loss: 0.009023590944707394\n",
"Validation Accuracy after Epoch 40: 95.02%\n",
"Epoch 41 completed. Loss: 0.040157418698072433\n",
"Validation Accuracy after Epoch 41: 95.11%\n",
"Epoch 42 completed. Loss: 0.11878462135791779\n",
"Validation Accuracy after Epoch 42: 95.06%\n",
"Epoch 43 completed. Loss: 0.021250683814287186\n",
"Validation Accuracy after Epoch 43: 95.16%\n",
"Epoch 44 completed. Loss: 0.0023518940433859825\n",
"Validation Accuracy after Epoch 44: 95.16%\n",
"Epoch 45 completed. Loss: 0.00595875782892108\n",
"Validation Accuracy after Epoch 45: 95.16%\n",
"Epoch 46 completed. Loss: 0.0025296895764768124\n",
"Validation Accuracy after Epoch 46: 94.97%\n",
"Epoch 47 completed. Loss: 0.0753568485379219\n",
"Validation Accuracy after Epoch 47: 95.26%\n",
"Epoch 48 completed. Loss: 0.002112493384629488\n",
"Validation Accuracy after Epoch 48: 95.06%\n",
"Epoch 49 completed. Loss: 0.09600060433149338\n",
"Validation Accuracy after Epoch 49: 95.06%\n",
"Epoch 50 completed. Loss: 0.002454130444675684\n",
"Validation Accuracy after Epoch 50: 95.21%\n",
"Accuracy (MDM=True) for Group 5: 91.98%\n",
"Results saved to 0.class_document/distilbert/5/test_p_c.csv\n"
]
}
],
"source": [
"import pandas as pd\n",
"from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW\n",
"from sklearn.preprocessing import LabelEncoder\n",
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"import numpy as np\n",
"import torch.nn.functional as F\n",
"import os \n",
"\n",
"group_number = 5\n",
"train_path = f'../../data_preprocess/dataset/{group_number}/train.csv'\n",
"valid_path = f'../../data_preprocess/dataset/{group_number}/valid.csv'\n",
"test_path = f'../../translation/0.result/{group_number}/test_p.csv'\n",
"output_path = f'0.class_document/distilbert/{group_number}/test_p_c.csv'\n",
"\n",
"train_data = pd.read_csv(train_path)\n",
"valid_data = pd.read_csv(valid_path)\n",
"test_data = pd.read_csv(test_path)\n",
"\n",
"train_data['thing_property'] = train_data['thing'] + '_' + train_data['property']\n",
"valid_data['thing_property'] = valid_data['thing'] + '_' + valid_data['property']\n",
"test_data['thing_property'] = test_data['thing'] + '_' + test_data['property']\n",
"\n",
"tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
"label_encoder = LabelEncoder()\n",
"label_encoder.fit(train_data['thing_property'])\n",
"\n",
"valid_data['thing_property'] = valid_data['thing_property'].apply(\n",
" lambda x: x if x in label_encoder.classes_ else 'unknown_label')\n",
"test_data['thing_property'] = test_data['thing_property'].apply(\n",
" lambda x: x if x in label_encoder.classes_ else 'unknown_label')\n",
"\n",
"label_encoder.classes_ = np.append(label_encoder.classes_, 'unknown_label')\n",
"\n",
"train_data['label'] = label_encoder.transform(train_data['thing_property'])\n",
"valid_data['label'] = label_encoder.transform(valid_data['thing_property'])\n",
"test_data['label'] = label_encoder.transform(test_data['thing_property'])\n",
"\n",
"train_texts, train_labels = train_data['tag_description'], train_data['label']\n",
"valid_texts, valid_labels = valid_data['tag_description'], valid_data['label']\n",
"\n",
"train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, return_tensors='pt')\n",
"valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True, return_tensors='pt')\n",
"\n",
"train_labels = torch.tensor(train_labels.values)\n",
"valid_labels = torch.tensor(valid_labels.values)\n",
"\n",
"class CustomDataset(Dataset):\n",
" def __init__(self, encodings, labels):\n",
" self.encodings = encodings\n",
" self.labels = labels\n",
"\n",
" def __getitem__(self, idx):\n",
" item = {key: val[idx] for key, val in self.encodings.items()}\n",
" item['labels'] = self.labels[idx]\n",
" return item\n",
"\n",
" def __len__(self):\n",
" return len(self.labels)\n",
"\n",
"train_dataset = CustomDataset(train_encodings, train_labels)\n",
"valid_dataset = CustomDataset(valid_encodings, valid_labels)\n",
"\n",
"train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n",
"valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)\n",
"\n",
"model = DistilBertForSequenceClassification.from_pretrained(\n",
" 'distilbert-base-uncased', \n",
" num_labels=len(train_data['thing_property'].unique())\n",
")\n",
"optimizer = AdamW(model.parameters(), lr=5e-5)\n",
"\n",
"device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')\n",
"model.to(device)\n",
"\n",
"epochs = 50\n",
"for epoch in range(epochs):\n",
" model.train()\n",
" for batch in train_loader:\n",
" optimizer.zero_grad()\n",
" input_ids = batch['input_ids'].to(device)\n",
" attention_mask = batch['attention_mask'].to(device)\n",
" labels = batch['labels'].to(device)\n",
" outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
" loss = outputs.loss\n",
" loss.backward()\n",
" optimizer.step()\n",
" print(f\"Epoch {epoch + 1} completed. Loss: {loss.item()}\")\n",
"\n",
" # 검증 루프\n",
" model.eval()\n",
" correct, total = 0, 0\n",
"\n",
" with torch.no_grad():\n",
" for batch in valid_loader:\n",
" input_ids = batch['input_ids'].to(device)\n",
" attention_mask = batch['attention_mask'].to(device)\n",
" labels = batch['labels'].to(device)\n",
" outputs = model(input_ids, attention_mask=attention_mask)\n",
" predictions = torch.argmax(outputs.logits, dim=-1)\n",
" correct += (predictions == labels).sum().item()\n",
" total += labels.size(0)\n",
"\n",
" valid_accuracy = correct / total\n",
" print(f'Validation Accuracy after Epoch {epoch + 1}: {valid_accuracy * 100:.2f}%')\n",
"\n",
"# Test 데이터 예측 및 c_thing, c_property 추가\n",
"test_encodings = tokenizer(list(test_data['tag_description']), truncation=True, padding=True, return_tensors='pt')\n",
"test_dataset = CustomDataset(test_encodings, torch.zeros(len(test_data))) # 레이블은 사용되지 않으므로 임시로 0을 사용\n",
"\n",
"test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)\n",
"\n",
"model.eval()\n",
"predicted_thing_properties = []\n",
"predicted_scores = []\n",
"\n",
"with torch.no_grad():\n",
" for batch in test_loader:\n",
" input_ids = batch['input_ids'].to(device)\n",
" attention_mask = batch['attention_mask'].to(device)\n",
" outputs = model(input_ids, attention_mask=attention_mask)\n",
" softmax_scores = F.softmax(outputs.logits, dim=-1)\n",
" predictions = torch.argmax(softmax_scores, dim=-1)\n",
" predicted_thing_properties.extend(predictions.cpu().numpy())\n",
" predicted_scores.extend(softmax_scores[range(len(predictions)), predictions].cpu().numpy())\n",
"\n",
"# 예측된 thing_property를 레이블 인코더로 디코딩\n",
"predicted_thing_property_labels = label_encoder.inverse_transform(predicted_thing_properties)\n",
"\n",
"# thing_property를 thing과 property로 나눔\n",
"test_data['c_thing'] = [x.split('_')[0] for x in predicted_thing_property_labels]\n",
"test_data['c_property'] = [x.split('_')[1] for x in predicted_thing_property_labels]\n",
"test_data['c_score'] = predicted_scores\n",
"\n",
"test_data['cthing_correct'] = test_data['thing'] == test_data['c_thing']\n",
"test_data['cproperty_correct'] = test_data['property'] == test_data['c_property']\n",
"test_data['ctp_correct'] = test_data['cthing_correct'] & test_data['cproperty_correct']\n",
"\n",
"mdm_true_count = len(test_data[test_data['MDM'] == True])\n",
"accuracy = (test_data['ctp_correct'].sum() / mdm_true_count) * 100\n",
"\n",
"print(f\"Accuracy (MDM=True) for Group {group_number}: {accuracy:.2f}%\")\n",
"\n",
"# 결과를 저장하기 전에 폴더가 존재하는지 확인하고, 없으면 생성\n",
"os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
"\n",
"test_data.to_csv(output_path, index=False)\n",
"print(f'Results saved to {output_path}')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Validation Accuracy: 95.21%\n",
"Accuracy (MDM=True) for Group 5: 91.98%\n",
"Results saved to 0.class_document/distilbert/5/test_p_c.csv\n"
]
}
],
"source": [
"# 검증 루프\n",
"model.eval()\n",
"correct, total = 0, 0\n",
"\n",
"with torch.no_grad():\n",
" for batch in valid_loader:\n",
" input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)\n",
" outputs = model(input_ids, attention_mask=attention_mask)\n",
" predictions = torch.argmax(outputs.logits, dim=-1)\n",
" correct += (predictions == labels).sum().item()\n",
" total += labels.size(0)\n",
"\n",
"valid_accuracy = correct / total\n",
"print(f'Validation Accuracy: {valid_accuracy * 100:.2f}%')\n",
"\n",
"# Test 데이터 예측 및 c_thing, c_property 추가\n",
"test_encodings = tokenizer(list(test_data['tag_description']), truncation=True, padding=True, return_tensors='pt')\n",
"test_dataset = CustomDataset(test_encodings, torch.zeros(len(test_data))) # 레이블은 사용되지 않으므로 임시로 0을 사용\n",
"\n",
"test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)\n",
"\n",
"model.eval()\n",
"predicted_thing_properties = []\n",
"predicted_scores = []\n",
"\n",
"with torch.no_grad():\n",
" for batch in test_loader:\n",
" input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)\n",
" outputs = model(input_ids, attention_mask=attention_mask)\n",
" softmax_scores = F.softmax(outputs.logits, dim=-1)\n",
" predictions = torch.argmax(softmax_scores, dim=-1)\n",
" predicted_thing_properties.extend(predictions.cpu().numpy())\n",
" predicted_scores.extend(softmax_scores[range(len(predictions)), predictions].cpu().numpy())\n",
"\n",
"# 예측된 thing_property를 레이블 인코더로 디코딩\n",
"predicted_thing_property_labels = label_encoder.inverse_transform(predicted_thing_properties)\n",
"\n",
"# thing_property를 thing과 property로 나눔\n",
"test_data['c_thing'] = [x.split('_')[0] for x in predicted_thing_property_labels]\n",
"test_data['c_property'] = [x.split('_')[1] for x in predicted_thing_property_labels]\n",
"test_data['c_score'] = predicted_scores\n",
"\n",
"test_data['cthing_correct'] = test_data['thing'] == test_data['c_thing']\n",
"test_data['cproperty_correct'] = test_data['property'] == test_data['c_property']\n",
"test_data['ctp_correct'] = test_data['cthing_correct'] & test_data['cproperty_correct']\n",
"\n",
"mdm_true_count = len(test_data[test_data['MDM'] == True])\n",
"accuracy = (test_data['ctp_correct'].sum() / mdm_true_count) * 100\n",
"\n",
"print(f\"Accuracy (MDM=True) for Group {group_number}: {accuracy:.2f}%\")\n",
"\n",
"os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
"\n",
"test_data.to_csv(output_path, index=False)\n",
"print(f'Results saved to {output_path}')\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'pd' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpyplot\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mplt\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# 'filtered_data_plot.csv' 읽기\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m filtered_data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfiltered_data_plot.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m# 데이터 토큰화\u001b[39;00m\n\u001b[1;32m 8\u001b[0m filtered_encodings \u001b[38;5;241m=\u001b[39m tokenizer(\u001b[38;5;28mlist\u001b[39m(filtered_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtag_description\u001b[39m\u001b[38;5;124m'\u001b[39m]), truncation\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, padding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
"\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
]
}
],
"source": [
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# 'filtered_data_plot.csv' 읽기\n",
"filtered_data = pd.read_csv('filtered_data_plot.csv')\n",
"\n",
"# 데이터 토큰화\n",
"filtered_encodings = tokenizer(list(filtered_data['tag_description']), truncation=True, padding=True, return_tensors='pt')\n",
"\n",
"# BERT 임베딩 계산 함수\n",
"def get_bert_embeddings(model, encodings, device):\n",
" model.eval()\n",
" with torch.no_grad():\n",
" input_ids = encodings['input_ids'].to(device)\n",
" attention_mask = encodings['attention_mask'].to(device)\n",
" outputs = model.bert(input_ids=input_ids, attention_mask=attention_mask)\n",
" return outputs.last_hidden_state.mean(dim=1).cpu().numpy() # 각 문장의 평균 임베딩 추출\n",
"\n",
"# BERT 모델로 임베딩 계산\n",
"bert_embeddings = get_bert_embeddings(model, filtered_encodings, device)\n",
"\n",
"# t-SNE 차원 축소\n",
"tsne = TSNE(n_components=2, random_state=42)\n",
"tsne_results = tsne.fit_transform(bert_embeddings)\n",
"\n",
"# 시각화를 위한 준비\n",
"unique_patterns = filtered_data['pattern'].unique()\n",
"color_map = plt.get_cmap('tab20', len(unique_patterns))\n",
"pattern_to_color = {pattern: idx for idx, pattern in enumerate(unique_patterns)}\n",
"\n",
"plt.figure(figsize=(14, 7))\n",
"\n",
"# 각 패턴별로 시각화\n",
"for pattern, color_idx in pattern_to_color.items():\n",
" pattern_indices = filtered_data['pattern'] == pattern\n",
" plt.scatter(tsne_results[pattern_indices, 0], tsne_results[pattern_indices, 1], \n",
" color=color_map(color_idx), marker='o', s=100, alpha=0.6, edgecolor='k', linewidth=1.2)\n",
"\n",
"# 그래프 설정\n",
"plt.xticks(fontsize=24)\n",
"plt.yticks(fontsize=24)\n",
"plt.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.6)\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "torch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}