hipom_data_mapping/post_process/tfidf_class/2a.classifier_svm_tfidf.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running SVM with C=1000\n",
      "Average Accuracy (MDM=True) across all groups with C=1000: 89.87%\n",
      "Running SVM with C=10000\n",
      "Average Accuracy (MDM=True) across all groups with C=10000: 89.33%\n",
      "Running SVM with C=100000\n",
      "Average Accuracy (MDM=True) across all groups with C=100000: 89.18%\n",
      "Running SVM with C=1000000\n",
      "Average Accuracy (MDM=True) across all groups with C=1000000: 89.18%\n",
      "\n",
      "Final Results for each C value:\n",
      "C=1000, Average Accuracy: 89.87%\n",
      "C=10000, Average Accuracy: 89.33%\n",
      "C=100000, Average Accuracy: 89.18%\n",
      "C=1000000, Average Accuracy: 89.18%\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.svm import SVC\n",
    "import os\n",
    "import numpy as np\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "# Maps each C value to a list holding the mean accuracy (in percent) computed across groups for that C\n",
    "average_accuracies = {}\n",
    "\n",
    "# Function to process each group (parallelized later)\n",
    "def process_group(C_value, group_number):\n",
    "    \"\"\"Train TF-IDF + linear SVM classifiers for one group and score them.\n",
    "\n",
    "    Two independent classifiers (one for 'thing', one for 'property') are\n",
    "    fitted on the group's train_all.csv and evaluated on its test_p.csv.\n",
    "\n",
    "    Args:\n",
    "        C_value: Regularization parameter C passed to both SVC models.\n",
    "        group_number: Group identifier used to build the dataset paths.\n",
    "\n",
    "    Returns:\n",
    "        Percentage of MDM=True test rows for which both 'thing' and\n",
    "        'property' are predicted correctly, 0 if the test set has no\n",
    "        MDM=True rows, or None if the test file does not exist.\n",
    "    \"\"\"\n",
    "    train_all_path = f'../../data_preprocess/dataset/{group_number}/train_all.csv'\n",
    "    test_path = f'../../translation/0.result/{group_number}/test_p.csv'\n",
    "\n",
    "    if not os.path.exists(test_path):\n",
    "        print(f\"Test file for Group {group_number} does not exist. Skipping...\")\n",
    "        return None\n",
    "\n",
    "    # Load the train_all and test CSVs\n",
    "    train_all_csv = pd.read_csv(train_all_path, low_memory=False)\n",
    "    test_csv = pd.read_csv(test_path, low_memory=False)\n",
    "\n",
    "    # Missing descriptions become empty strings so the vectorizer can consume them\n",
    "    train_all_csv['tag_description'] = train_all_csv['tag_description'].fillna('')\n",
    "    test_csv['tag_description'] = test_csv['tag_description'].fillna('')\n",
    "\n",
    "    # Initialised here but not otherwise used inside this function\n",
    "    test_csv['c_duplicate'] = 0\n",
    "\n",
    "    # TF-IDF vectorization: every whitespace-delimited token is a term,\n",
    "    # and the vocabulary is learned from the training descriptions only\n",
    "    vectorizer = TfidfVectorizer(token_pattern=r'\\S+')\n",
    "    vectorizer.fit(train_all_csv['tag_description'])\n",
    "\n",
    "    train_all_tfidf_matrix = vectorizer.transform(train_all_csv['tag_description']).toarray()\n",
    "    test_tfidf_matrix = vectorizer.transform(test_csv['tag_description']).toarray()\n",
    "\n",
    "    # One linear SVM per target column, both using the same C\n",
    "    svm_model_thing = SVC(kernel='linear', probability=True, C=C_value)\n",
    "    svm_model_property = SVC(kernel='linear', probability=True, C=C_value)\n",
    "\n",
    "    svm_model_thing.fit(train_all_tfidf_matrix, train_all_csv['thing'])\n",
    "    svm_model_property.fit(train_all_tfidf_matrix, train_all_csv['property'])\n",
    "\n",
    "    # Predict 'thing' and 'property' labels for the test rows\n",
    "    predicted_things = svm_model_thing.predict(test_tfidf_matrix)\n",
    "    predicted_properties = svm_model_property.predict(test_tfidf_matrix)\n",
    "\n",
    "    # predict_proba returns one column per class, so column 1 is just the\n",
    "    # second class in classes_. Use the highest class probability per row\n",
    "    # as the confidence score instead.\n",
    "    predicted_scores_thing = svm_model_thing.predict_proba(test_tfidf_matrix).max(axis=1)\n",
    "    predicted_scores_property = svm_model_property.predict_proba(test_tfidf_matrix).max(axis=1)\n",
    "\n",
    "    # Combine the two confidences by averaging\n",
    "    predicted_scores = (predicted_scores_thing + predicted_scores_property) / 2\n",
    "\n",
    "    test_csv['c_thing'] = predicted_things\n",
    "    test_csv['c_property'] = predicted_properties\n",
    "    test_csv['c_score'] = predicted_scores\n",
    "\n",
    "    test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']\n",
    "    test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']\n",
    "    test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']\n",
    "\n",
    "    # Count hits and totals over the same MDM=True subset so the ratio\n",
    "    # cannot be inflated by correct predictions on non-MDM rows\n",
    "    mdm_mask = test_csv['MDM'] == True\n",
    "    mdm_true_count = int(mdm_mask.sum())\n",
    "    correct_count = test_csv.loc[mdm_mask, 'ctp_correct'].sum()\n",
    "    accuracy = (correct_count / mdm_true_count) * 100 if mdm_true_count > 0 else 0\n",
    "    return accuracy\n",
    "\n",
    "# C values to evaluate (log scale)\n",
    "C_values = [1000, 10000, 100000, 1000000]\n",
    "# Run the experiment for each C value\n",
    "for C_value in C_values:\n",
    "    print(f\"Running SVM with C={C_value}\")\n",
    "    average_accuracies[C_value] = []\n",
    "\n",
    "    # One joblib task per group (group numbers 1 through 5)\n",
    "    results = Parallel(n_jobs=-1)(delayed(process_group)(C_value, group_number) for group_number in range(1, 6))\n",
    "\n",
    "    # Filter out None results (in case of missing files)\n",
    "    accuracies = [result for result in results if result is not None]\n",
    "\n",
    "    if accuracies:\n",
    "        average_accuracy = sum(accuracies) / len(accuracies)\n",
    "        average_accuracies[C_value].append(average_accuracy)\n",
    "        print(f\"Average Accuracy (MDM=True) across all groups with C={C_value}: {average_accuracy:.2f}%\")\n",
    "    else:\n",
    "        print(f\"No group produced a result for C={C_value}\")\n",
    "\n",
    "# Print overall results for all C values\n",
    "print(\"\\nFinal Results for each C value:\")\n",
    "for C_value, accuracies in average_accuracies.items():\n",
    "    # An empty list means every group was skipped; np.mean([]) would yield nan with a warning\n",
    "    if not accuracies:\n",
    "        print(f\"C={C_value}, Average Accuracy: n/a (no groups evaluated)\")\n",
    "        continue\n",
    "    avg_acc = np.mean(accuracies)\n",
    "    print(f\"C={C_value}, Average Accuracy: {avg_acc:.2f}%\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
[TASK] the entire paper work 2024-09-25 08:52:30 +09:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Running SVM with C=1000\n",`
			`"Average Accuracy (MDM=True) across all groups with C=1000: 89.87%\n",`
			`"Running SVM with C=10000\n",`
			`"Average Accuracy (MDM=True) across all groups with C=10000: 89.33%\n",`
			`"Running SVM with C=100000\n",`
			`"Average Accuracy (MDM=True) across all groups with C=100000: 89.18%\n",`
			`"Running SVM with C=1000000\n",`
			`"Average Accuracy (MDM=True) across all groups with C=1000000: 89.18%\n",`
			`"\n",`
			`"Final Results for each C value:\n",`
			`"C=1000, Average Accuracy: 89.87%\n",`
			`"C=10000, Average Accuracy: 89.33%\n",`
			`"C=100000, Average Accuracy: 89.18%\n",`
			`"C=1000000, Average Accuracy: 89.18%\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"import pandas as pd\n",`
			`"from sklearn.feature_extraction.text import TfidfVectorizer\n",`
			`"from sklearn.svm import SVC\n",`
			`"import os\n",`
			`"import numpy as np\n",`
			`"from joblib import Parallel, delayed\n",`
			`"\n",`
			`"# Initialize variables to store overall accuracy results\n",`
			`"average_accuracies = {}\n",`
			`"\n",`
			`"# Function to process each group (parallelized later)\n",`
			`"def process_group(C_value, group_number):\n",`
			`" train_all_path = f'../../data_preprocess/dataset/{group_number}/train_all.csv'\n",`
			`" test_path = f'../../translation/0.result/{group_number}/test_p.csv'\n",`
			`"\n",`
			`" if not os.path.exists(test_path):\n",`
			`" print(f\"Test file for Group {group_number} does not exist. Skipping...\")\n",`
			`" return None\n",`
			`"\n",`
			`" # Load the train_all and test CSVs\n",`
			`" train_all_csv = pd.read_csv(train_all_path, low_memory=False)\n",`
			`" test_csv = pd.read_csv(test_path, low_memory=False)\n",`
			`"\n",`
			`" train_all_csv['tag_description'] = train_all_csv['tag_description'].fillna('')\n",`
			`" test_csv['tag_description'] = test_csv['tag_description'].fillna('')\n",`
			`"\n",`
			`" test_csv['c_thing'] = ''\n",`
			`" test_csv['c_property'] = ''\n",`
			`" test_csv['c_score'] = ''\n",`
			`" test_csv['c_duplicate'] = 0\n",`
			`"\n",`
			`" combined_tag_descriptions = train_all_csv['tag_description'].tolist()\n",`
			`"\n",`
			`" # TF-IDF 벡터화\n",`
			`" vectorizer = TfidfVectorizer(token_pattern=r'\\S+')\n",`
			`" vectorizer.fit(combined_tag_descriptions)\n",`
			`"\n",`
			`" train_all_tfidf_matrix = vectorizer.transform(train_all_csv['tag_description']).toarray() # TF-IDF로 변환\n",`
			`" test_tfidf_matrix = vectorizer.transform(test_csv['tag_description']).toarray()\n",`
			`"\n",`
			`" # SVM 모델 학습 및 예측\n",`
			`" svm_model_thing = SVC(kernel='linear', probability=True, C=C_value)\n",`
			`" svm_model_property = SVC(kernel='linear', probability=True, C=C_value)\n",`
			`"\n",`
			`" # SVM을 이용하여 'thing' 및 'property' 예측 모델 학습\n",`
			`" svm_model_thing.fit(train_all_tfidf_matrix, train_all_csv['thing'])\n",`
			`" svm_model_property.fit(train_all_tfidf_matrix, train_all_csv['property'])\n",`
			`"\n",`
			`" # 'thing' 및 'property' 예측\n",`
			`" predicted_things = svm_model_thing.predict(test_tfidf_matrix)\n",`
			`" predicted_properties = svm_model_property.predict(test_tfidf_matrix)\n",`
			`" \n",`
			`" predicted_scores_thing = svm_model_thing.predict_proba(test_tfidf_matrix)[:, 1] # 'thing'의 예측 확률 점수\n",`
			`" predicted_scores_property = svm_model_property.predict_proba(test_tfidf_matrix)[:, 1] # 'property'의 예측 확률 점수\n",`
			`"\n",`
			`" predicted_scores = (predicted_scores_thing + predicted_scores_property) / 2 # 평균 점수로 결합\n",`
			`"\n",`
			`" test_csv['c_thing'] = predicted_things\n",`
			`" test_csv['c_property'] = predicted_properties\n",`
			`" test_csv['c_score'] = predicted_scores\n",`
			`"\n",`
			`" test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']\n",`
			`" test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']\n",`
			`" test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']\n",`
			`"\n",`
			`" mdm_true_count = len(test_csv[test_csv['MDM'] == True])\n",`
			`" accuracy = (test_csv['ctp_correct'].sum() / mdm_true_count) * 100 if mdm_true_count > 0 else 0\n",`
			`" return accuracy\n",`
			`"\n",`
			`"# C 값들에 대해 실험할 값 설정 (log 스케일)\n",`
			`"C_values = [0.1, 1, 10, 100]\n",`
			`"C_values = [1000, 10000, 100000, 1000000]\n",`
			`"# 각 C 값에 대해 실험\n",`
			`"for C_value in C_values:\n",`
			`" print(f\"Running SVM with C={C_value}\")\n",`
			`" average_accuracies[C_value] = []\n",`
			`"\n",`
			`" # Parallel processing for groups\n",`
			`" results = Parallel(n_jobs=-1)(delayed(process_group)(C_value, group_number) for group_number in range(1, 6))\n",`
			`"\n",`
			`" # Filter out None results (in case of missing files)\n",`
			`" accuracies = [result for result in results if result is not None]\n",`
			`"\n",`
			`" if accuracies:\n",`
			`" average_accuracy = sum(accuracies) / len(accuracies)\n",`
			`" average_accuracies[C_value].append(average_accuracy)\n",`
			`" print(f\"Average Accuracy (MDM=True) across all groups with C={C_value}: {average_accuracy:.2f}%\")\n",`
			`"\n",`
			`"# Print overall results for all C values\n",`
			`"print(\"\\nFinal Results for each C value:\")\n",`
			`"for C_value, accuracies in average_accuracies.items():\n",`
			`" avg_acc = np.mean(accuracies)\n",`
			`" print(f\"C={C_value}, Average Accuracy: {avg_acc:.2f}%\")\n"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "torch",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.10.14"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`