In [6]:
import pandas as pd

def evaluate_performance(test_csv):
    """Count confusion-matrix cells over ``test_csv`` and derive summary metrics.

    Each row is classified from three columns, using plain truthiness and
    comparison with the empty string. Missing values should be replaced with
    ``''`` before calling, because NaN is truthy and is not equal to ``''``.

    * TP -- ``s_correct`` and ``MDM`` are both truthy.
    * TN -- ``s_thing`` is ``''`` and ``MDM`` is falsy.
    * FP -- ``s_thing`` is not ``''`` and ``MDM`` is falsy.
    * FN -- ``MDM`` is truthy but ``s_correct`` is not, whether ``s_thing``
      is empty or not.

    Args:
        test_csv: DataFrame providing the columns ``s_correct``, ``s_thing``
            and ``MDM``.

    Returns:
        The tuple ``(TP, TN, FP, FN, precision, recall, accuracy, f1_score)``.
        Any ratio whose denominator would be zero is reported as ``0``.
    """
    # Walk only the three columns that are read, in lockstep, instead of
    # iterrows(), which materialises a full Series for every row. An empty
    # frame is skipped without touching its columns, so it still produces an
    # all-zero result even when the columns are absent.
    if len(test_csv) > 0:
        rows = zip(test_csv['s_correct'], test_csv['s_thing'], test_csv['MDM'])
    else:
        rows = ()

    TP = TN = FP = FN = 0
    for s_correct, s_thing, mdm in rows:
        if s_correct and mdm:
            TP += 1
        elif s_thing == '' and not mdm:
            TN += 1
        elif s_thing != '' and not mdm:
            FP += 1
        elif (s_thing == '' and mdm) or (s_thing != '' and not s_correct and mdm):
            # MDM is set but the prediction is missing or incorrect.
            FN += 1

    total = TP + TN + FP + FN

    # Guard every division so degenerate inputs yield 0 rather than raising.
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    accuracy = (TP + TN) / total if total > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return TP, TN, FP, FN, precision, recall, accuracy, f1_score

# Per-fold metric accumulators; one entry is appended for each evaluated group.
all_precisions = []
all_recalls = []
all_accuracies = []
all_f1_scores = []

# Evaluate groups 1 through 5, printing each group's counts and metrics.
for group_number in range(1, 6):
    test_s_path = f'../post_process/0.result/{group_number}/test_s.csv'
    test_s_csv = pd.read_csv(test_s_path, low_memory=False)
    # Missing cells become '' so the evaluator's empty-string checks apply.
    test_s_csv.fillna('', inplace=True)

    tp_s_results = evaluate_performance(test_s_csv)
    tp, tn, fp, fn, precision, recall, accuracy, f1_score = tp_s_results

    print(f"Performance for group {group_number} (test_s.csv):")
    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, F1-Score: {f1_score:.4f}")
    print("-" * 50)

    # Record this group's metrics in the matching accumulator.
    for bucket, value in zip(
        (all_precisions, all_recalls, all_accuracies, all_f1_scores),
        (precision, recall, accuracy, f1_score),
    ):
        bucket.append(value)

# Unweighted mean of each metric across the evaluated groups.
average_precision = sum(all_precisions) / len(all_precisions)
average_recall = sum(all_recalls) / len(all_recalls)
average_accuracy = sum(all_accuracies) / len(all_accuracies)
average_f1_score = sum(all_f1_scores) / len(all_f1_scores)

print(
    "Average performance across all groups:",
    f"Average Precision: {average_precision:.4f}",
    f"Average Recall: {average_recall:.4f}",
    f"Average Accuracy: {average_accuracy:.4f}",
    f"Average F1-Score: {average_f1_score:.4f}",
    sep="\n",
)


Performance for group 1 (test_s.csv):
TP: 1794, TN: 9954, FP: 1005, FN: 319
Precision: 0.6409, Recall: 0.8490, Accuracy: 0.8987, F1-Score: 0.7305
--------------------------------------------------
Performance for group 2 (test_s.csv):
TP: 1824, TN: 7716, FP: 866, FN: 316
Precision: 0.6781, Recall: 0.8523, Accuracy: 0.8898, F1-Score: 0.7553
--------------------------------------------------
Performance for group 3 (test_s.csv):
TP: 1804, TN: 6866, FP: 996, FN: 188
Precision: 0.6443, Recall: 0.9056, Accuracy: 0.8798, F1-Score: 0.7529
--------------------------------------------------
Performance for group 4 (test_s.csv):
TP: 1916, TN: 12360, FP: 989, FN: 186
Precision: 0.6596, Recall: 0.9115, Accuracy: 0.9240, F1-Score: 0.7653
--------------------------------------------------


  test_s_csv.fillna('', inplace=True)


Performance for group 5 (test_s.csv):
TP: 1910, TN: 9800, FP: 955, FN: 273
Precision: 0.6667, Recall: 0.8749, Accuracy: 0.9051, F1-Score: 0.7567
--------------------------------------------------
Average performance across all groups:
Average Precision: 0.6579
Average Recall: 0.8787
Average Accuracy: 0.8995
Average F1-Score: 0.7521
