{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SVM sweep over the regularisation parameter `C`\n",
    "\n",
    "For each data group (1 to 5) two linear-kernel SVMs are trained on TF-IDF features of `tag_description`: one predicts `thing`, the other predicts `property`. A test row counts as correct only when both predictions match. The reported accuracy is the percentage of correct rows among test rows with `MDM == True`, averaged over the groups, for each value of `C`.\n",
    "\n",
    "## Previously recorded results\n",
    "\n",
    "Output saved with the earlier revision of this notebook (before the scoring changes in `process_group`); re-run all cells to refresh.\n",
    "\n",
    "| C | Average accuracy (MDM=True) |\n",
    "|---|---|\n",
    "| 1000 | 89.87% |\n",
    "| 10000 | 89.33% |\n",
    "| 100000 | 89.18% |\n",
    "| 1000000 | 89.18% |"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from joblib import Parallel, delayed\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.svm import SVC"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path templates; {group} is replaced by the group number.\n",
    "TRAIN_PATH_TEMPLATE = '../../data_preprocess/dataset/{group}/train_all.csv'\n",
    "TEST_PATH_TEMPLATE = '../../translation/0.result/{group}/test_p.csv'\n",
    "\n",
    "# Groups to evaluate.\n",
    "GROUP_NUMBERS = range(1, 6)\n",
    "\n",
    "# Values of C to sweep (log scale). An earlier list [0.1, 1, 10, 100] used to be\n",
    "# assigned here and was immediately overwritten, so only this range ever ran.\n",
    "C_VALUES = [1000, 10000, 100000, 1000000]\n",
    "\n",
    "# Seed for the shuffling SVC performs internally when probability=True.\n",
    "SEED = 42"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Per-group training and scoring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_group(C_value, group_number):\n",
    "    \"\"\"Train and score the 'thing' / 'property' SVMs for one data group.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    C_value : float\n",
    "        Regularisation parameter passed to both SVC models.\n",
    "    group_number : int\n",
    "        Group whose train and test CSV files are loaded.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float or None\n",
    "        Percentage of MDM == True test rows for which both 'thing' and\n",
    "        'property' are predicted correctly (0 when there are no such rows),\n",
    "        or None when the group's train or test file does not exist.\n",
    "    \"\"\"\n",
    "    train_all_path = TRAIN_PATH_TEMPLATE.format(group=group_number)\n",
    "    test_path = TEST_PATH_TEMPLATE.format(group=group_number)\n",
    "\n",
    "    if not os.path.exists(test_path):\n",
    "        print(f\"Test file for Group {group_number} does not exist. Skipping...\")\n",
    "        return None\n",
    "    # A missing train file is skipped the same way instead of raising inside the worker.\n",
    "    if not os.path.exists(train_all_path):\n",
    "        print(f\"Train file for Group {group_number} does not exist. Skipping...\")\n",
    "        return None\n",
    "\n",
    "    # Load the train_all and test CSVs\n",
    "    train_all_csv = pd.read_csv(train_all_path, low_memory=False)\n",
    "    test_csv = pd.read_csv(test_path, low_memory=False)\n",
    "\n",
    "    # Missing descriptions become empty strings so the vectorizer accepts them.\n",
    "    train_all_csv['tag_description'] = train_all_csv['tag_description'].fillna('')\n",
    "    test_csv['tag_description'] = test_csv['tag_description'].fillna('')\n",
    "\n",
    "    # Placeholder column; nothing in this notebook reads it.\n",
    "    test_csv['c_duplicate'] = 0\n",
    "\n",
    "    # TF-IDF vectorisation: whitespace-separated tokens, vocabulary fitted on the training text only.\n",
    "    vectorizer = TfidfVectorizer(token_pattern=r'\\S+')\n",
    "    train_all_tfidf_matrix = vectorizer.fit_transform(train_all_csv['tag_description']).toarray()\n",
    "    test_tfidf_matrix = vectorizer.transform(test_csv['tag_description']).toarray()\n",
    "\n",
    "    # One linear SVM per target column.\n",
    "    svm_model_thing = SVC(kernel='linear', probability=True, C=C_value, random_state=SEED)\n",
    "    svm_model_property = SVC(kernel='linear', probability=True, C=C_value, random_state=SEED)\n",
    "    svm_model_thing.fit(train_all_tfidf_matrix, train_all_csv['thing'])\n",
    "    svm_model_property.fit(train_all_tfidf_matrix, train_all_csv['property'])\n",
    "\n",
    "    # Predicted 'thing' and 'property' labels for the test rows.\n",
    "    test_csv['c_thing'] = svm_model_thing.predict(test_tfidf_matrix)\n",
    "    test_csv['c_property'] = svm_model_property.predict(test_tfidf_matrix)\n",
    "\n",
    "    # Confidence is the highest class probability in each row. Column 1 of\n",
    "    # predict_proba is only the probability of the second entry of classes_,\n",
    "    # which is not the predicted class in general.\n",
    "    predicted_scores_thing = svm_model_thing.predict_proba(test_tfidf_matrix).max(axis=1)\n",
    "    predicted_scores_property = svm_model_property.predict_proba(test_tfidf_matrix).max(axis=1)\n",
    "    test_csv['c_score'] = (predicted_scores_thing + predicted_scores_property) / 2  # mean of the two scores\n",
    "\n",
    "    test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']\n",
    "    test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']\n",
    "    test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']\n",
    "\n",
    "    # Numerator and denominator must cover the same rows: only MDM == True.\n",
    "    is_mdm = test_csv['MDM'] == True  # noqa: E712 - explicit comparison also copes with non-bool columns\n",
    "    mdm_true_count = int(is_mdm.sum())\n",
    "    if mdm_true_count == 0:\n",
    "        return 0\n",
    "    return (test_csv.loc[is_mdm, 'ctp_correct'].sum() / mdm_true_count) * 100"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run the sweep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Maps each C value to the accuracy averaged over the evaluated groups.\n",
    "average_accuracies = {}\n",
    "\n",
    "for C_value in C_VALUES:\n",
    "    print(f\"Running SVM with C={C_value}\")\n",
    "\n",
    "    # Groups are independent of each other, so they are evaluated in parallel.\n",
    "    results = Parallel(n_jobs=-1)(\n",
    "        delayed(process_group)(C_value, group_number) for group_number in GROUP_NUMBERS\n",
    "    )\n",
    "\n",
    "    # Groups with missing files return None and are left out of the average.\n",
    "    accuracies = [result for result in results if result is not None]\n",
    "\n",
    "    if accuracies:\n",
    "        average_accuracy = sum(accuracies) / len(accuracies)\n",
    "        average_accuracies[C_value] = average_accuracy\n",
    "        print(f\"Average Accuracy (MDM=True) across all groups with C={C_value}: {average_accuracy:.2f}%\")\n",
    "    else:\n",
    "        # Record an explicit NaN instead of averaging an empty list later.\n",
    "        average_accuracies[C_value] = np.nan\n",
    "        print(f\"No groups could be evaluated with C={C_value}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print overall results for all C values\n",
    "print(\"\\nFinal Results for each C value:\")\n",
    "for C_value, avg_acc in average_accuracies.items():\n",
    "    print(f\"C={C_value}, Average Accuracy: {avg_acc:.2f}%\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}