148 lines
6.1 KiB
Plaintext
148 lines
6.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Running SVM with C=1000\n",
|
|
"Average Accuracy (MDM=True) across all groups with C=1000: 89.87%\n",
|
|
"Running SVM with C=10000\n",
|
|
"Average Accuracy (MDM=True) across all groups with C=10000: 89.33%\n",
|
|
"Running SVM with C=100000\n",
|
|
"Average Accuracy (MDM=True) across all groups with C=100000: 89.18%\n",
|
|
"Running SVM with C=1000000\n",
|
|
"Average Accuracy (MDM=True) across all groups with C=1000000: 89.18%\n",
|
|
"\n",
|
|
"Final Results for each C value:\n",
|
|
"C=1000, Average Accuracy: 89.87%\n",
|
|
"C=10000, Average Accuracy: 89.33%\n",
|
|
"C=100000, Average Accuracy: 89.18%\n",
|
|
"C=1000000, Average Accuracy: 89.18%\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
|
"from sklearn.svm import SVC\n",
|
|
"import os\n",
|
|
"import numpy as np\n",
|
|
"from joblib import Parallel, delayed\n",
|
|
"\n",
|
|
"# Initialize variables to store overall accuracy results\n",
|
|
"average_accuracies = {}\n",
|
|
"\n",
|
|
"# Function to process each group (parallelized later)\n",
|
|
"def process_group(C_value, group_number):\n",
|
|
" train_all_path = f'../../data_preprocess/dataset/{group_number}/train_all.csv'\n",
|
|
" test_path = f'../../translation/0.result/{group_number}/test_p.csv'\n",
|
|
"\n",
|
|
" if not os.path.exists(test_path):\n",
|
|
" print(f\"Test file for Group {group_number} does not exist. Skipping...\")\n",
|
|
" return None\n",
|
|
"\n",
|
|
" # Load the train_all and test CSVs\n",
|
|
" train_all_csv = pd.read_csv(train_all_path, low_memory=False)\n",
|
|
" test_csv = pd.read_csv(test_path, low_memory=False)\n",
|
|
"\n",
|
|
" train_all_csv['tag_description'] = train_all_csv['tag_description'].fillna('')\n",
|
|
" test_csv['tag_description'] = test_csv['tag_description'].fillna('')\n",
|
|
"\n",
|
|
" test_csv['c_thing'] = ''\n",
|
|
" test_csv['c_property'] = ''\n",
|
|
" test_csv['c_score'] = ''\n",
|
|
" test_csv['c_duplicate'] = 0\n",
|
|
"\n",
|
|
" combined_tag_descriptions = train_all_csv['tag_description'].tolist()\n",
|
|
"\n",
|
|
" # TF-IDF 벡터화\n",
|
|
" vectorizer = TfidfVectorizer(token_pattern=r'\\S+')\n",
|
|
" vectorizer.fit(combined_tag_descriptions)\n",
|
|
"\n",
|
|
" train_all_tfidf_matrix = vectorizer.transform(train_all_csv['tag_description']).toarray() # TF-IDF로 변환\n",
|
|
" test_tfidf_matrix = vectorizer.transform(test_csv['tag_description']).toarray()\n",
|
|
"\n",
|
|
" # SVM 모델 학습 및 예측\n",
|
|
" svm_model_thing = SVC(kernel='linear', probability=True, C=C_value)\n",
|
|
" svm_model_property = SVC(kernel='linear', probability=True, C=C_value)\n",
|
|
"\n",
|
|
" # SVM을 이용하여 'thing' 및 'property' 예측 모델 학습\n",
|
|
" svm_model_thing.fit(train_all_tfidf_matrix, train_all_csv['thing'])\n",
|
|
" svm_model_property.fit(train_all_tfidf_matrix, train_all_csv['property'])\n",
|
|
"\n",
|
|
" # 'thing' 및 'property' 예측\n",
|
|
" predicted_things = svm_model_thing.predict(test_tfidf_matrix)\n",
|
|
" predicted_properties = svm_model_property.predict(test_tfidf_matrix)\n",
|
|
" \n",
|
|
" predicted_scores_thing = svm_model_thing.predict_proba(test_tfidf_matrix)[:, 1] # 'thing'의 예측 확률 점수\n",
|
|
" predicted_scores_property = svm_model_property.predict_proba(test_tfidf_matrix)[:, 1] # 'property'의 예측 확률 점수\n",
|
|
"\n",
|
|
" predicted_scores = (predicted_scores_thing + predicted_scores_property) / 2 # 평균 점수로 결합\n",
|
|
"\n",
|
|
" test_csv['c_thing'] = predicted_things\n",
|
|
" test_csv['c_property'] = predicted_properties\n",
|
|
" test_csv['c_score'] = predicted_scores\n",
|
|
"\n",
|
|
" test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']\n",
|
|
" test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']\n",
|
|
" test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']\n",
|
|
"\n",
|
|
" mdm_true_count = len(test_csv[test_csv['MDM'] == True])\n",
|
|
" accuracy = (test_csv['ctp_correct'].sum() / mdm_true_count) * 100 if mdm_true_count > 0 else 0\n",
|
|
" return accuracy\n",
|
|
"\n",
|
|
"# C 값들에 대해 실험할 값 설정 (log 스케일)\n",
|
|
"C_values = [0.1, 1, 10, 100]\n",
|
|
"C_values = [1000, 10000, 100000, 1000000]\n",
|
|
"# 각 C 값에 대해 실험\n",
|
|
"for C_value in C_values:\n",
|
|
" print(f\"Running SVM with C={C_value}\")\n",
|
|
" average_accuracies[C_value] = []\n",
|
|
"\n",
|
|
" # Parallel processing for groups\n",
|
|
" results = Parallel(n_jobs=-1)(delayed(process_group)(C_value, group_number) for group_number in range(1, 6))\n",
|
|
"\n",
|
|
" # Filter out None results (in case of missing files)\n",
|
|
" accuracies = [result for result in results if result is not None]\n",
|
|
"\n",
|
|
" if accuracies:\n",
|
|
" average_accuracy = sum(accuracies) / len(accuracies)\n",
|
|
" average_accuracies[C_value].append(average_accuracy)\n",
|
|
" print(f\"Average Accuracy (MDM=True) across all groups with C={C_value}: {average_accuracy:.2f}%\")\n",
|
|
"\n",
|
|
"# Print overall results for all C values\n",
|
|
"print(\"\\nFinal Results for each C value:\")\n",
|
|
"for C_value, accuracies in average_accuracies.items():\n",
|
|
" avg_acc = np.mean(accuracies)\n",
|
|
" print(f\"C={C_value}, Average Accuracy: {avg_acc:.2f}%\")\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "torch",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.14"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|