# hipom_data_mapping/post_process/selection/predict.py

import pandas as pd
import os
import glob

# Root directory that holds one `checkpoint_fold_<fold>` sub-directory per fold
checkpoint_directory = "../../train/baseline"

def _digits_to_hash(column):
    """Return the string column with every digit replaced by '#'."""
    return column.str.replace(r'\d', '#', regex=True)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def select(fold):
    """Run selection for one fold and append its statistics to output.txt.

    Reads the mapping predictions for ``fold``, the fold's training data and
    the data-model master export, flags which ground-truth and predicted
    thing/property patterns occur in the master export (``MDM`` / ``p_MDM``
    columns), then hands the frame to ``selection.Selector`` and runs it
    against the fold's checkpoint.

    Args:
        fold: Fold identifier used to build the input paths and the
            checkpoint directory name.

    Returns:
        None. The tp/tn/fp/fn counts and derived metrics are appended to
        ``output.txt`` in the current working directory.

    Raises:
        FileNotFoundError: If no ``checkpoint-*`` entry exists in the fold's
            checkpoint directory.
    """
    # import test data
    data_path = f"../../train/mapping/exports/result_group_{fold}.csv"
    df = pd.read_csv(data_path, skipinitialspace=True)

    # get target data
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)
    # processing to help with selection later
    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']

    ##########################################
    # Process the dataframe for selection

    # we start to cull predictions from here
    data_master_path = "../../data_import/exports/data_model_master_export.csv"
    df_master = pd.read_csv(data_master_path, skipinitialspace=True)

    # Generate patterns for the ground truth: digits are masked with '#'
    df['thing_pattern'] = _digits_to_hash(df['thing'])
    df['property_pattern'] = _digits_to_hash(df['property'])
    df['pattern'] = df['thing_pattern'] + " " + df['property_pattern']

    # Master entries are used as-is (no digit masking is applied to them here)
    df_master['master_pattern'] = df_master['thing'] + " " + df_master['property']
    # Create a set of unique patterns from master for fast lookup
    master_patterns = set(df_master['master_pattern'])

    # Flag rows whose ground-truth pattern exists in the master export
    df['MDM'] = df['pattern'].apply(lambda x: x in master_patterns)

    # Same check for the predicted thing/property
    df['p_thing_pattern'] = _digits_to_hash(df['p_thing'])
    df['p_property_pattern'] = _digits_to_hash(df['p_property'])
    df['p_pattern'] = df['p_thing_pattern'] + " " + df['p_property_pattern']
    df['p_MDM'] = df['p_pattern'].apply(lambda x: x in master_patterns)

    ###########################################
    # selection
    # we restrict to predictions of a class of a ship
    # then perform similarity selection with in-distribution data
    # the magic is in performing per-class selection, not global
    import selection
    selector = selection.Selector(input_df=df, reference_df=train_df, fold=fold)

    ##########################################
    # run inference
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # training is expected to save only 1 checkpoint; sorting keeps the choice
    # deterministic if more than one is ever present
    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
    candidates = sorted(glob.glob(os.path.join(directory, 'checkpoint-*')))
    if not candidates:
        raise FileNotFoundError(
            f"no 'checkpoint-*' entry found in {directory!r} for fold {fold}"
        )
    checkpoint_path = candidates[0]
    tp, tn, fp, fn = selector.run_selection(checkpoint_path=checkpoint_path)

    # Compute every metric before touching the file so that a failure cannot
    # leave a half-written report; empty denominators yield 0.0 instead of
    # raising ZeroDivisionError.
    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    f1_score = _safe_ratio(2 * tp, (2 * tp) + fp + fn)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)

    # write output to file output.txt
    with open("output.txt", "a") as f:
        print(80 * '*', file=f)
        print(f'Statistics for fold {fold}', file=f)
        print(f"tp: {tp}", file=f)
        print(f"tn: {tn}", file=f)
        print(f"fp: {fp}", file=f)
        print(f"fn: {fn}", file=f)
        print(f"fold: {fold}", file=f)
        print("accuracy: ", accuracy, file=f)
        print("f1_score: ", f1_score, file=f)
        print("precision: ", precision, file=f)
        print("recall: ", recall, file=f)

###########################################  
# Execute for all folds

# Truncate the report to a single blank line so every run starts fresh;
# select() appends its statistics to this file afterwards.
with open("output.txt", "w") as f:
    f.write("\n")

# Folds are numbered 1 through 5.
for fold in range(1, 6):
    select(fold)
Feat: added train and test directories 2024-10-31 15:58:20 +09:00			`import pandas as pd`
			`import os`
			`import glob`

			`# directory for checkpoints`
			`checkpoint_directory = '../../train/baseline'`

Chore: moved selection to post_process, mapping to test 2024-10-31 16:35:28 +09:00			`def select(fold):`
Feat: added train and test directories 2024-10-31 15:58:20 +09:00			`# import test data`
Chore: moved selection to post_process, mapping to test 2024-10-31 16:35:28 +09:00			`data_path = f"../../train/mapping/exports/result_group_{fold}.csv"`
Feat: added train and test directories 2024-10-31 15:58:20 +09:00			`df = pd.read_csv(data_path, skipinitialspace=True)`

			`# get target data`
			`data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"`
			`train_df = pd.read_csv(data_path, skipinitialspace=True)`
			`# processing to help with selection later`
			`train_df['thing_property'] = train_df['thing'] + " " + train_df['property']`


			`##########################################`
			`# Process the dataframe for selection`

			`# we start to cull predictions from here`
Chore: moved selection to post_process, mapping to test 2024-10-31 16:35:28 +09:00			`data_master_path = "../../data_import/exports/data_model_master_export.csv"`
Feat: added train and test directories 2024-10-31 15:58:20 +09:00			`df_master = pd.read_csv(data_master_path, skipinitialspace=True)`
			`data_mapping = df`
			`# Generate patterns`
			`data_mapping['thing_pattern'] = data_mapping['thing'].str.replace(r'\d', '#', regex=True)`
			`data_mapping['property_pattern'] = data_mapping['property'].str.replace(r'\d', '#', regex=True)`
			`data_mapping['pattern'] = data_mapping['thing_pattern'] + " " + data_mapping['property_pattern']`
			`df_master['master_pattern'] = df_master['thing'] + " " + df_master['property']`
			`# Create a set of unique patterns from master for fast lookup`
			`master_patterns = set(df_master['master_pattern'])`
			`# thing_patterns = set(df_master['thing'])`
			`# Check each pattern in data_mapping if it exists in df_master and assign the "MDM" field`
			`data_mapping['MDM'] = data_mapping['pattern'].apply(lambda x: x in master_patterns)`

			`# check if prediction is in MDM`
			`data_mapping['p_thing_pattern'] = data_mapping['p_thing'].str.replace(r'\d', '#', regex=True)`
			`data_mapping['p_property_pattern'] = data_mapping['p_property'].str.replace(r'\d', '#', regex=True)`
			`data_mapping['p_pattern'] = data_mapping['p_thing_pattern'] + " " + data_mapping['p_property_pattern']`
			`data_mapping['p_MDM'] = data_mapping['p_pattern'].apply(lambda x: x in master_patterns)`

			`df = data_mapping`

			`# we can save the t5 generation output here`
			`# df.to_parquet(f"exports/fold_{fold}/t5_output.parquet")`



Chore: moved selection to post_process, mapping to test 2024-10-31 16:35:28 +09:00			`# condition1 = df['MDM']`
			`# condition2 = df['p_MDM']`
Feat: added train and test directories 2024-10-31 15:58:20 +09:00
Chore: moved selection to post_process, mapping to test 2024-10-31 16:35:28 +09:00			`# condition_correct_thing = df['p_thing'] == df['thing']`
			`# condition_correct_property = df['p_property'] == df['property']`
			`# match = sum(condition1 & condition2)`
			`# fn = sum(condition1 & ~condition2)`
			`# prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & condition1)`
Feat: added train and test directories 2024-10-31 15:58:20 +09:00
			`# print("mdm match predicted mdm: ", match) # 56 - false negative`
			`# print("mdm but not predicted mdm: ", fn) # 56 - false negative`
			`# print("total mdm: ", sum(condition1)) # 2113`
			`# print("total predicted mdm: ", sum(condition2)) # 6896 - a lot of false positives`
			`# print("correct mdm predicted", prediction_mdm_correct)`


			`# selection`
			`###########################################`
			`# we now have to perform selection`
			`# we restrict to predictions of a class of a ship`
			`# then perform similarity selection with in-distribution data`
			`# the magic is in performing per-class selection, not global`
			`# import importlib`
			`import selection`
			`# importlib.reload(selection)`
			`selector = selection.Selector(input_df=df, reference_df=train_df, fold=fold)`
Chore: moved selection to post_process, mapping to test 2024-10-31 16:35:28 +09:00
			`##########################################`
			`# run inference`
			`# checkpoint`
			`# Use glob to find matching paths`
			`directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')`
			`# Use glob to find matching paths`
			`# path is usually checkpoint_fold_1/checkpoint-<step number>`
			`# we are guaranteed to save only 1 checkpoint from training`
			`pattern = 'checkpoint-*'`
			`checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]`
Feat: added train and test directories 2024-10-31 15:58:20 +09:00			`tp, tn, fp, fn = selector.run_selection(checkpoint_path=checkpoint_path)`


			`# write output to file output.txt`
			`with open("output.txt", "a") as f:`
			`print(80 * '*', file=f)`
			`print(f'Statistics for fold {fold}', file=f)`
			`print(f"tp: {tp}", file=f)`
			`print(f"tn: {tn}", file=f)`
			`print(f"fp: {fp}", file=f)`
			`print(f"fn: {fn}", file=f)`
			`print(f"fold: {fold}", file=f)`
			`print("accuracy: ", (tp+tn)/(tp+tn+fp+fn), file=f)`
			`print("f1_score: ", (2*tp)/((2*tp) + fp + fn), file=f)`
			`print("precision: ", (tp)/(tp+fp), file=f)`
			`print("recall: ", (tp)/(tp+fn), file=f)`

			`###########################################`
			`# Execute for all folds`

			`# reset file before writing to it`
			`with open("output.txt", "w") as f:`
			`print('', file=f)`

			`for fold in [1,2,3,4,5]:`
Chore: moved selection to post_process, mapping to test 2024-10-31 16:35:28 +09:00			`select(fold)`