In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
import os

average_accuracies = []

for n in range(1, 53):
    accuracies = []
    for group_number in range(1, 6):
        train_all_path = f'../../data_preprocess/dataset/{group_number}/train_all.csv'
        test_path = f'../../translation/0.result/{group_number}/test_p.csv'

        if not os.path.exists(test_path):
            print(f"Test file for Group {group_number} does not exist. Skipping...")
            continue

        train_all_csv = pd.read_csv(train_all_path, low_memory=False)
        test_csv = pd.read_csv(test_path, low_memory=False)

        train_all_csv['tag_description'] = train_all_csv['tag_description'].fillna('')
        test_csv['tag_description'] = test_csv['tag_description'].fillna('')

        test_csv['c_thing'], test_csv['c_property'], test_csv['c_score'], test_csv['c_duplicate'] = '', '', '', 0

        vectorizer = CountVectorizer(token_pattern=r'\S+', ngram_range=(1, 1))
        train_all_bow_matrix = vectorizer.fit_transform(train_all_csv['tag_description'])
        test_bow_matrix = vectorizer.transform(test_csv['tag_description'])

        knn = NearestNeighbors(n_neighbors=n, metric='euclidean', n_jobs=-1)
        knn.fit(train_all_bow_matrix)

        distances, indices = knn.kneighbors(test_bow_matrix)

        predicted_things = [train_all_csv.iloc[indices[i][0]]['thing'] for i in range(len(test_csv))]
        predicted_properties = [train_all_csv.iloc[indices[i][0]]['property'] for i in range(len(test_csv))]
        predicted_scores = [1 - distances[i][0] for i in range(len(test_csv))]

        test_csv['c_thing'], test_csv['c_property'], test_csv['c_score'] = predicted_things, predicted_properties, predicted_scores

        test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']
        test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']
        test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']

        mdm_true_count = len(test_csv[test_csv['MDM'] == True])
        accuracies.append((test_csv['ctp_correct'].sum() / mdm_true_count) * 100)

    average_accuracy = sum(accuracies) / len(accuracies)
    average_accuracies.append(average_accuracy)
    print(f"Average Accuracy (MDM=True) across all groups with n_neighbors={n}: {average_accuracy:.2f}%")

print("\nFinal Results:")
for n, avg_accuracy in zip(range(1, 53), average_accuracies):
    print(f"n_neighbors={n}, Average Accuracy: {avg_accuracy:.2f}%")


Average Accuracy (MDM=True) across all groups with n_neighbors=1: 84.43%


KeyboardInterrupt: 