In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import re

model = "distilbert"

# Minimum c_score a candidate (c_thing, c_property) needs before it replaces
# the current prediction (p_thing, p_property).
SCORE_THRESHOLD = 0.91

# Compiled once; masks every digit with '#' when rebuilding p_pattern.
DIGIT_PATTERN = re.compile(r'\d')


def count_recoverable_rows(test_csv):
    """Count rows whose prediction is wrong but whose candidate is right.

    A row qualifies when ``p_correct`` is falsy and ``ctp_correct`` is truthy.
    A qualifying row is a "duplicate" when some row of ``test_csv`` (possibly
    the row itself) has the same ``ships_idx`` and already carries the
    candidate pair as its ``p_thing`` / ``p_property``.

    Args:
        test_csv: DataFrame with columns ``ships_idx``, ``p_thing``,
            ``p_property``, ``c_thing``, ``c_property``, ``p_correct`` and
            ``ctp_correct``. It is not modified.

    Returns:
        Tuple ``(update_count, duplicate_count, non_duplicate_count)``.
    """
    # Build the lookup once instead of re-filtering the whole frame for every
    # qualifying row. Rows with a missing key part are dropped because a
    # missing value never compares equal to anything, so they can never match.
    predicted_keys = set(
        test_csv[['ships_idx', 'p_thing', 'p_property']]
        .dropna()
        .itertuples(index=False, name=None)
    )

    update_count = 0
    duplicate_count = 0
    candidate_keys = zip(
        test_csv['ships_idx'], test_csv['c_thing'], test_csv['c_property']
    )
    for p_correct, ctp_correct, key in zip(
        test_csv['p_correct'], test_csv['ctp_correct'], candidate_keys
    ):
        if not p_correct and ctp_correct:
            update_count += 1
            if key in predicted_keys:
                duplicate_count += 1

    return update_count, duplicate_count, update_count - duplicate_count


def refine_predictions(test_csv, score_threshold=SCORE_THRESHOLD):
    """Overwrite low-confidence predictions with high-scoring candidates.

    For every row where ``p_MDM`` is falsy, ``c_score`` is at least
    ``score_threshold`` and the candidate pair differs from the predicted
    pair, copy ``c_thing`` / ``c_property`` into ``p_thing`` / ``p_property``,
    set ``p_MDM`` to True and rebuild ``p_pattern`` (both values with every
    digit replaced by '#', joined by a single space).

    Args:
        test_csv: DataFrame to refine. It is modified in place.
        score_threshold: Minimum ``c_score`` required for a replacement.

    Returns:
        Tuple ``(refine_df, update_count)``. ``refine_df`` holds one row per
        replacement, as the row looked *before* it was overwritten.
    """
    refine_rows = []

    for index, row in test_csv.iterrows():
        if row['p_MDM'] or not row['c_score'] >= score_threshold:
            continue
        if row['p_thing'] == row['c_thing'] and row['p_property'] == row['c_property']:
            continue
        # A missing candidate always "differs" from the prediction (NaN != x),
        # but copying it would erase the prediction and the digit masking
        # below would fail on a non-string value, so such rows are left alone.
        if pd.isna(row['c_thing']) or pd.isna(row['c_property']):
            continue

        test_csv.at[index, 'p_thing'] = row['c_thing']
        test_csv.at[index, 'p_property'] = row['c_property']
        test_csv.at[index, 'p_MDM'] = True
        test_csv.at[index, 'p_pattern'] = (
            DIGIT_PATTERN.sub('#', row['c_thing'])
            + " "
            + DIGIT_PATTERN.sub('#', row['c_property'])
        )
        # `row` is the snapshot yielded by iterrows(), i.e. the pre-update values.
        refine_rows.append(row)

    if refine_rows:
        refine_df = pd.DataFrame(refine_rows)
    else:
        # Keep the column header even when nothing was refined.
        refine_df = pd.DataFrame(columns=test_csv.columns)

    return refine_df, len(refine_rows)


# The guard is true when this cell/script is executed directly, so the loop
# still runs there; it only keeps the file I/O from firing on a plain import.
if __name__ == "__main__":
    for group_number in range(1, 6):  # Group 1 to 5
        print(f"Processing group {group_number}...")

        # Load test CSV for the current group
        # NOTE(review): the input path has a 't5-tiny' component that the
        # output paths below do not -- confirm this asymmetry is intended.
        test_path = f'0.class_document/{model}/t5-tiny/{group_number}/test_p_c.csv'
        test_csv = pd.read_csv(test_path, low_memory=False)

        # Report how many wrong predictions the candidates could fix, and how
        # many of those candidates already appear as a prediction in the same
        # ships_idx (nothing is modified in this step).
        update_count, duplicate_count, non_duplicate_count = count_recoverable_rows(test_csv)

        # Print the results for the current group
        print(f"Total updates where p_correct is False and ctp_correct is True (group {group_number}): {update_count}")
        print(f"Number of rows with duplicates in the same ships_idx (group {group_number}): {duplicate_count}")
        print(f"Number of rows without duplicates in the same ships_idx (group {group_number}): {non_duplicate_count}")

        # Assign c_thing, c_property to p_thing, p_property and set p_MDM to
        # True where the conditions are met; collect the replaced rows.
        refine_df, update_count = refine_predictions(test_csv)

        # Save the refine DataFrame to a CSV file for the current group
        refine_output_path = f'0.class_document/{model}/{group_number}/refine.csv'
        refine_df.to_csv(refine_output_path, index=False, encoding='utf-8-sig')

        # Print the number of updates made
        print(f"Number of updates made (group {group_number}): {update_count}")

        # Save the updated test CSV for the current group
        output_file_path = f'0.class_document/{model}/{group_number}/test_p_c_r.csv'
        test_csv.to_csv(output_file_path, index=False, encoding='utf-8-sig')

        print(f"Updated test CSV saved to {output_file_path}")
        print(f"Refine CSV saved to {refine_output_path}")


Processing group 1...
Total updates where p_correct is False and ctp_correct is True (group 1): 55
Number of rows with duplicates in the same ships_idx (group 1): 34
Number of rows without duplicates in the same ships_idx (group 1): 21
Number of updates made (group 1): 427
Updated test CSV saved to 0.class_document/distilbert/1/test_p_c_r.csv
Refine CSV saved to 0.class_document/distilbert/1/refine.csv
Processing group 2...
Total updates where p_correct is False and ctp_correct is True (group 2): 63
Number of rows with duplicates in the same ships_idx (group 2): 21
Number of rows without duplicates in the same ships_idx (group 2): 42
Number of updates made (group 2): 225
Updated test CSV saved to 0.class_document/distilbert/2/test_p_c_r.csv
Refine CSV saved to 0.class_document/distilbert/2/refine.csv
Processing group 3...
Total updates where p_correct is False and ctp_correct is True (group 3): 32
Number of rows with duplicates in the same ships_idx (group 3): 10
Number of rows without