{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The file with the updated p_dup and p_map columns has been saved: 0.class_document/knn_tfidf/1/test_p_c_r.csv\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "group_number = 1\n",
    "method_name = 'knn_tfidf'\n",
    "\n",
    "\n",
    "def add_dup_and_map_counts(test_df, train_tp):\n",
    "    \"\"\"Return a copy of test_df with p_tp, p_map and p_dup columns filled in.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    test_df : pandas.DataFrame\n",
    "        Must contain the columns ships_idx, p_thing and p_property.\n",
    "    train_tp : pandas.Series\n",
    "        The training-side 'thing property' strings to match p_tp against.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        A copy of test_df (the argument itself is not modified) where\n",
    "        p_tp  = p_thing + ' ' + p_property,\n",
    "        p_map = number of entries in train_tp equal to the row's p_tp\n",
    "                (0 when nothing matches or p_tp is missing),\n",
    "        p_dup = number of rows with the same ships_idx and the same p_tp\n",
    "                (missing when ships_idx or p_tp is missing).\n",
    "    \"\"\"\n",
    "    result = test_df.copy()\n",
    "    result['p_tp'] = result['p_thing'] + \" \" + result['p_property']\n",
    "\n",
    "    # p_map depends only on p_tp, so count every training value once and look\n",
    "    # each test row up in that table instead of rescanning train_tp for every\n",
    "    # (ships_idx, p_tp) pair.\n",
    "    train_counts = train_tp.value_counts()\n",
    "    result['p_map'] = result['p_tp'].map(train_counts).fillna(0).astype(int)\n",
    "\n",
    "    # A single grouped pass replaces the per-pair boolean masking of the whole\n",
    "    # frame. groupby skips rows whose key is missing; reindex keeps those rows\n",
    "    # as missing, and the nullable Int64 dtype stores the counts as integers\n",
    "    # rather than as floats such as 3.0.\n",
    "    dup_counts = result.groupby(['ships_idx', 'p_tp'])['p_tp'].transform('size')\n",
    "    result['p_dup'] = dup_counts.reindex(result.index).astype('Int64')\n",
    "    return result\n",
    "\n",
    "\n",
    "# Read the test file\n",
    "test_path = f'0.class_document/{method_name}/{group_number}/test_p_c_r.csv'\n",
    "df = pd.read_csv(test_path)\n",
    "\n",
    "# Read the train_all file\n",
    "train_all_path = f'../../data_preprocess/dataset/{group_number}/train_all.csv'\n",
    "train_all_df = pd.read_csv(train_all_path)\n",
    "\n",
    "# Concatenate thing and property into tp in the train_all data\n",
    "train_all_df['tp'] = train_all_df['thing'] + \" \" + train_all_df['property']\n",
    "\n",
    "# Add p_tp, p_map and p_dup to the test data\n",
    "df = add_dup_and_map_counts(df, train_all_df['tp'])\n",
    "\n",
    "# Save the modified DataFrame (same path as the input, so the file is overwritten in place)\n",
    "output_path = f'0.class_document/{method_name}/{group_number}/test_p_c_r.csv'\n",
    "df.to_csv(output_path, index=False, encoding='utf-8-sig')\n",
    "\n",
    "print(\"The file with the updated p_dup and p_map columns has been saved:\", output_path)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}