149 lines
6.0 KiB
Plaintext
149 lines
6.0 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"test_p_c.csv saved for Group 1 at 0.class_document/knn_tfidf/1/test_p_c.csv\n",
|
|
"test_p_c.csv saved for Group 2 at 0.class_document/knn_tfidf/2/test_p_c.csv\n",
|
|
"test_p_c.csv saved for Group 3 at 0.class_document/knn_tfidf/3/test_p_c.csv\n",
|
|
"test_p_c.csv saved for Group 4 at 0.class_document/knn_tfidf/4/test_p_c.csv\n",
|
|
"test_p_c.csv saved for Group 5 at 0.class_document/knn_tfidf/5/test_p_c.csv\n",
|
|
"Average Accuracy (MDM=True) across all groups with n_neighbors=5: 84.37%\n",
|
|
"\n",
|
|
"Final Results:\n",
|
|
"n_neighbors=1, Average Accuracy: 84.37%\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.neighbors import NearestNeighbors\n",
"import os\n",
"\n",
"# Nearest-neighbor classification over TF-IDF vectors of `tag_description`.\n",
"# For every group, each test row copies `thing` / `property` from its closest\n",
"# training row (cosine distance); accuracy is then reported on MDM rows.\n",
"\n",
"# n_neighbors values to evaluate. Only the closest neighbor is used for the\n",
"# prediction below.\n",
"N_NEIGHBORS_VALUES = range(5, 6)\n",
"# Group folders to evaluate.\n",
"GROUP_NUMBERS = range(1, 6)\n",
"# Predictions are written to disk only for this n_neighbors value.\n",
"SAVE_N_NEIGHBORS = 5\n",
"\n",
"# Average accuracy per evaluated n_neighbors value, and the matching n values\n",
"# (kept in parallel so the final report shows the n that was actually used).\n",
"average_accuracies = []\n",
"evaluated_n_values = []\n",
"\n",
"for n in N_NEIGHBORS_VALUES:\n",
"    accuracies = []  # accuracy (%) of each group that could be evaluated\n",
"\n",
"    for group_number in GROUP_NUMBERS:\n",
"        train_all_path = f'../../data_preprocess/dataset/{group_number}/train_all.csv'\n",
"        test_path = f'../../translation/0.result/{group_number}/test_p.csv'\n",
"\n",
"        if not os.path.exists(test_path):\n",
"            print(f\"Test file for Group {group_number} does not exist. Skipping...\")\n",
"            continue\n",
"\n",
"        # Load the train_all and test CSVs\n",
"        train_all_csv = pd.read_csv(train_all_path, low_memory=False)\n",
"        test_csv = pd.read_csv(test_path, low_memory=False)\n",
"\n",
"        # Missing descriptions become empty strings so the vectorizer accepts them\n",
"        train_all_csv['tag_description'] = train_all_csv['tag_description'].fillna('')\n",
"        test_csv['tag_description'] = test_csv['tag_description'].fillna('')\n",
"\n",
"        # TF-IDF over whitespace-separated tokens; the vocabulary comes from the\n",
"        # training descriptions only\n",
"        vectorizer = TfidfVectorizer(token_pattern=r'\\S+', ngram_range=(1, 1), use_idf=True)\n",
"        train_all_tfidf_matrix = vectorizer.fit_transform(train_all_csv['tag_description'])\n",
"        test_tfidf_matrix = vectorizer.transform(test_csv['tag_description'])\n",
"\n",
"        # Nearest-neighbor search using cosine distance\n",
"        knn = NearestNeighbors(n_neighbors=n, metric='cosine', n_jobs=-1)\n",
"        knn.fit(train_all_tfidf_matrix)\n",
"        distances, indices = knn.kneighbors(test_tfidf_matrix)\n",
"\n",
"        # Copy the labels of the closest training row (column 0) to each test row\n",
"        nearest_index = indices[:, 0]\n",
"        test_csv['c_thing'] = train_all_csv['thing'].to_numpy()[nearest_index]\n",
"        test_csv['c_property'] = train_all_csv['property'].to_numpy()[nearest_index]\n",
"        # Similarity score derived from the distance: 1 - cosine distance\n",
"        test_csv['c_score'] = 1 - distances[:, 0]\n",
"        test_csv['c_duplicate'] = 0\n",
"\n",
"        test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']\n",
"        test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']\n",
"        test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']\n",
"\n",
"        # Accuracy is reported for MDM rows, so the correct count and the total\n",
"        # are both restricted to the same MDM == True rows\n",
"        mdm_mask = test_csv['MDM'] == True\n",
"        mdm_true_count = int(mdm_mask.sum())\n",
"        if mdm_true_count == 0:\n",
"            print(f\"Group {group_number} has no MDM=True rows. Skipping accuracy...\")\n",
"        else:\n",
"            accuracy = (test_csv.loc[mdm_mask, 'ctp_correct'].sum() / mdm_true_count) * 100\n",
"            accuracies.append(accuracy)\n",
"\n",
"        # Save test_csv to the output path only for the selected n_neighbors value\n",
"        if n == SAVE_N_NEIGHBORS:\n",
"            output_path = f'0.class_document/knn_tfidf/{group_number}/test_p_c.csv'\n",
"            os.makedirs(os.path.dirname(output_path), exist_ok=True)  # create the folder if missing\n",
"            test_csv.to_csv(output_path, index=False)\n",
"            print(f\"test_p_c.csv saved for Group {group_number} at {output_path}\")\n",
"\n",
"    # Nothing to average when every group was skipped\n",
"    if not accuracies:\n",
"        print(f\"No group could be evaluated with n_neighbors={n}. Skipping...\")\n",
"        continue\n",
"\n",
"    # Calculate the average accuracy for the current n_neighbors value\n",
"    average_accuracy = sum(accuracies) / len(accuracies)\n",
"    average_accuracies.append(average_accuracy)\n",
"    evaluated_n_values.append(n)\n",
"    print(f\"Average Accuracy (MDM=True) across all groups with n_neighbors={n}: {average_accuracy:.2f}%\")\n",
"\n",
"# Print overall results for every n_neighbors value that was evaluated\n",
"print(\"\\nFinal Results:\")\n",
"for n, avg_accuracy in zip(evaluated_n_values, average_accuracies):\n",
"    print(f\"n_neighbors={n}, Average Accuracy: {avg_accuracy:.2f}%\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "torch",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.14"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|