hipom_data_mapping/post_process/tfidf_class/2a.classifier_svm_tfidf.ipynb

148 lines
6.1 KiB
Plaintext
Raw Normal View History

2024-09-25 08:52:30 +09:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running SVM with C=1000\n",
"Average Accuracy (MDM=True) across all groups with C=1000: 89.87%\n",
"Running SVM with C=10000\n",
"Average Accuracy (MDM=True) across all groups with C=10000: 89.33%\n",
"Running SVM with C=100000\n",
"Average Accuracy (MDM=True) across all groups with C=100000: 89.18%\n",
"Running SVM with C=1000000\n",
"Average Accuracy (MDM=True) across all groups with C=1000000: 89.18%\n",
"\n",
"Final Results for each C value:\n",
"C=1000, Average Accuracy: 89.87%\n",
"C=10000, Average Accuracy: 89.33%\n",
"C=100000, Average Accuracy: 89.18%\n",
"C=1000000, Average Accuracy: 89.18%\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.svm import SVC\n",
"import os\n",
"import numpy as np\n",
"from joblib import Parallel, delayed\n",
"\n",
"# Initialize variables to store overall accuracy results\n",
"average_accuracies = {}\n",
"\n",
"# Function to process each group (parallelized later)\n",
"def process_group(C_value, group_number):\n",
"    \"\"\"Fit TF-IDF + linear SVM classifiers for one group and return its accuracy.\n",
"\n",
"    Two SVC models are trained on the group's `train_all.csv`, one for the\n",
"    `thing` label and one for the `property` label, using TF-IDF features of\n",
"    `tag_description`. Both are then applied to the group's `test_p.csv`.\n",
"\n",
"    Args:\n",
"        C_value: Regularisation parameter `C` passed to both SVC models.\n",
"        group_number: Group identifier substituted into the train/test paths.\n",
"\n",
"    Returns:\n",
"        Percentage of `MDM == True` test rows for which both `thing` and\n",
"        `property` were predicted correctly, 0 if there are no such rows, or\n",
"        None if the group's test file does not exist.\n",
"    \"\"\"\n",
"    train_all_path = f'../../data_preprocess/dataset/{group_number}/train_all.csv'\n",
"    test_path = f'../../translation/0.result/{group_number}/test_p.csv'\n",
"\n",
"    if not os.path.exists(test_path):\n",
"        print(f\"Test file for Group {group_number} does not exist. Skipping...\")\n",
"        return None\n",
"\n",
"    # Load the train_all and test CSVs\n",
"    train_all_csv = pd.read_csv(train_all_path, low_memory=False)\n",
"    test_csv = pd.read_csv(test_path, low_memory=False)\n",
"\n",
"    # Missing descriptions become empty strings so the vectorizer accepts them\n",
"    train_all_csv['tag_description'] = train_all_csv['tag_description'].fillna('')\n",
"    test_csv['tag_description'] = test_csv['tag_description'].fillna('')\n",
"\n",
"    # TF-IDF vectorisation: every whitespace-delimited run is a token and the\n",
"    # vocabulary is learned from the training descriptions only\n",
"    vectorizer = TfidfVectorizer(token_pattern=r'\\S+')\n",
"    train_all_tfidf_matrix = vectorizer.fit_transform(train_all_csv['tag_description']).toarray()\n",
"    test_tfidf_matrix = vectorizer.transform(test_csv['tag_description']).toarray()\n",
"\n",
"    # Train one SVM per target column ('thing' and 'property')\n",
"    svm_model_thing = SVC(kernel='linear', probability=True, C=C_value)\n",
"    svm_model_property = SVC(kernel='linear', probability=True, C=C_value)\n",
"    svm_model_thing.fit(train_all_tfidf_matrix, train_all_csv['thing'])\n",
"    svm_model_property.fit(train_all_tfidf_matrix, train_all_csv['property'])\n",
"\n",
"    # Predict 'thing' and 'property' for the test rows\n",
"    predicted_things = svm_model_thing.predict(test_tfidf_matrix)\n",
"    predicted_properties = svm_model_property.predict(test_tfidf_matrix)\n",
"\n",
"    # predict_proba returns one column per class, so the row-wise maximum is\n",
"    # used as the confidence. Indexing column 1 would instead report the\n",
"    # probability of one fixed class regardless of what was predicted.\n",
"    predicted_scores_thing = svm_model_thing.predict_proba(test_tfidf_matrix).max(axis=1)\n",
"    predicted_scores_property = svm_model_property.predict_proba(test_tfidf_matrix).max(axis=1)\n",
"\n",
"    # Combine the two confidences by averaging\n",
"    predicted_scores = (predicted_scores_thing + predicted_scores_property) / 2\n",
"\n",
"    test_csv['c_thing'] = predicted_things\n",
"    test_csv['c_property'] = predicted_properties\n",
"    test_csv['c_score'] = predicted_scores\n",
"    test_csv['c_duplicate'] = 0\n",
"\n",
"    test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']\n",
"    test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']\n",
"    test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']\n",
"\n",
"    # Numerator and denominator are both restricted to MDM == True rows so the\n",
"    # ratio is a real accuracy over that subset\n",
"    mdm_mask = test_csv['MDM'] == True\n",
"    mdm_true_count = int(mdm_mask.sum())\n",
"    if mdm_true_count == 0:\n",
"        return 0\n",
"\n",
"    mdm_correct_count = test_csv.loc[mdm_mask, 'ctp_correct'].sum()\n",
"    accuracy = (mdm_correct_count / mdm_true_count) * 100\n",
"    return accuracy\n",
"\n",
"# C values to sweep (log scale). Use [0.1, 1, 10, 100] to explore the lower range.\n",
"C_values = [1000, 10000, 100000, 1000000]\n",
"\n",
"# Run the experiment once per C value\n",
"for C_value in C_values:\n",
"    print(f\"Running SVM with C={C_value}\")\n",
"    average_accuracies[C_value] = []\n",
"\n",
"    # Evaluate groups 1-5 in parallel\n",
"    results = Parallel(n_jobs=-1)(\n",
"        delayed(process_group)(C_value, group_number) for group_number in range(1, 6)\n",
"    )\n",
"\n",
"    # Filter out None results (in case of missing files)\n",
"    accuracies = [result for result in results if result is not None]\n",
"\n",
"    if accuracies:\n",
"        average_accuracy = sum(accuracies) / len(accuracies)\n",
"        average_accuracies[C_value].append(average_accuracy)\n",
"        print(f\"Average Accuracy (MDM=True) across all groups with C={C_value}: {average_accuracy:.2f}%\")\n",
"\n",
"# Print overall results for all C values\n",
"print(\"\\nFinal Results for each C value:\")\n",
"for C_value, accuracies in average_accuracies.items():\n",
"    if not accuracies:\n",
"        # No group produced a result for this C; np.mean([]) would print nan with a warning\n",
"        print(f\"C={C_value}, Average Accuracy: n/a (no groups evaluated)\")\n",
"        continue\n",
"    avg_acc = np.mean(accuracies)\n",
"    print(f\"C={C_value}, Average Accuracy: {avg_acc:.2f}%\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "torch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}