{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running SVM with C=1000\n",
      "Average Accuracy (MDM=True) across all groups with C=1000: 89.87%\n",
      "Running SVM with C=10000\n",
      "Average Accuracy (MDM=True) across all groups with C=10000: 89.33%\n",
      "Running SVM with C=100000\n",
      "Average Accuracy (MDM=True) across all groups with C=100000: 89.18%\n",
      "Running SVM with C=1000000\n",
      "Average Accuracy (MDM=True) across all groups with C=1000000: 89.18%\n",
      "\n",
      "Final Results for each C value:\n",
      "C=1000, Average Accuracy: 89.87%\n",
      "C=10000, Average Accuracy: 89.33%\n",
      "C=100000, Average Accuracy: 89.18%\n",
      "C=1000000, Average Accuracy: 89.18%\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.svm import SVC\n",
    "import os\n",
    "import numpy as np\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "# Initialize variables to store overall accuracy results\n",
    "average_accuracies = {}\n",
    "\n",
    "# Function to process each group (parallelized later)\n",
    "def process_group(C_value, group_number):\n",
    "    train_all_path = f'../../data_preprocess/dataset/{group_number}/train_all.csv'\n",
    "    test_path = f'../../translation/0.result/{group_number}/test_p.csv'\n",
    "\n",
    "    if not os.path.exists(test_path):\n",
    "        print(f\"Test file for Group {group_number} does not exist. Skipping...\")\n",
    "        return None\n",
    "\n",
    "    # Load the train_all and test CSVs\n",
    "    train_all_csv = pd.read_csv(train_all_path, low_memory=False)\n",
    "    test_csv = pd.read_csv(test_path, low_memory=False)\n",
    "\n",
    "    train_all_csv['tag_description'] = train_all_csv['tag_description'].fillna('')\n",
    "    test_csv['tag_description'] = test_csv['tag_description'].fillna('')\n",
    "\n",
    "    test_csv['c_thing'] = ''\n",
    "    test_csv['c_property'] = ''\n",
    "    test_csv['c_score'] = ''\n",
    "    test_csv['c_duplicate'] = 0\n",
    "\n",
    "    combined_tag_descriptions = train_all_csv['tag_description'].tolist()\n",
    "\n",
    "    # TF-IDF 벡터화\n",
    "    vectorizer = TfidfVectorizer(token_pattern=r'\\S+')\n",
    "    vectorizer.fit(combined_tag_descriptions)\n",
    "\n",
    "    train_all_tfidf_matrix = vectorizer.transform(train_all_csv['tag_description']).toarray()  # TF-IDF로 변환\n",
    "    test_tfidf_matrix = vectorizer.transform(test_csv['tag_description']).toarray()\n",
    "\n",
    "    # SVM 모델 학습 및 예측\n",
    "    svm_model_thing = SVC(kernel='linear', probability=True, C=C_value)\n",
    "    svm_model_property = SVC(kernel='linear', probability=True, C=C_value)\n",
    "\n",
    "    # SVM을 이용하여 'thing' 및 'property' 예측 모델 학습\n",
    "    svm_model_thing.fit(train_all_tfidf_matrix, train_all_csv['thing'])\n",
    "    svm_model_property.fit(train_all_tfidf_matrix, train_all_csv['property'])\n",
    "\n",
    "    # 'thing' 및 'property' 예측\n",
    "    predicted_things = svm_model_thing.predict(test_tfidf_matrix)\n",
    "    predicted_properties = svm_model_property.predict(test_tfidf_matrix)\n",
    "    \n",
    "    predicted_scores_thing = svm_model_thing.predict_proba(test_tfidf_matrix)[:, 1]  # 'thing'의 예측 확률 점수\n",
    "    predicted_scores_property = svm_model_property.predict_proba(test_tfidf_matrix)[:, 1]  # 'property'의 예측 확률 점수\n",
    "\n",
    "    predicted_scores = (predicted_scores_thing + predicted_scores_property) / 2  # 평균 점수로 결합\n",
    "\n",
    "    test_csv['c_thing'] = predicted_things\n",
    "    test_csv['c_property'] = predicted_properties\n",
    "    test_csv['c_score'] = predicted_scores\n",
    "\n",
    "    test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']\n",
    "    test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']\n",
    "    test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']\n",
    "\n",
    "    mdm_true_count = len(test_csv[test_csv['MDM'] == True])\n",
    "    accuracy = (test_csv['ctp_correct'].sum() / mdm_true_count) * 100 if mdm_true_count > 0 else 0\n",
    "    return accuracy\n",
    "\n",
    "# C 값들에 대해 실험할 값 설정 (log 스케일)\n",
    "C_values = [0.1, 1, 10, 100]\n",
    "C_values = [1000, 10000, 100000, 1000000]\n",
    "# 각 C 값에 대해 실험\n",
    "for C_value in C_values:\n",
    "    print(f\"Running SVM with C={C_value}\")\n",
    "    average_accuracies[C_value] = []\n",
    "\n",
    "    # Parallel processing for groups\n",
    "    results = Parallel(n_jobs=-1)(delayed(process_group)(C_value, group_number) for group_number in range(1, 6))\n",
    "\n",
    "    # Filter out None results (in case of missing files)\n",
    "    accuracies = [result for result in results if result is not None]\n",
    "\n",
    "    if accuracies:\n",
    "        average_accuracy = sum(accuracies) / len(accuracies)\n",
    "        average_accuracies[C_value].append(average_accuracy)\n",
    "        print(f\"Average Accuracy (MDM=True) across all groups with C={C_value}: {average_accuracy:.2f}%\")\n",
    "\n",
    "# Print overall results for all C values\n",
    "print(\"\\nFinal Results for each C value:\")\n",
    "for C_value, accuracies in average_accuracies.items():\n",
    "    avg_acc = np.mean(accuracies)\n",
    "    print(f\"C={C_value}, Average Accuracy: {avg_acc:.2f}%\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}