# %%
import pandas as pd
import os
import glob

from mapper import Mapper
from preprocess import Abbreviator
from deduplication import run_deduplication

# Global configuration shared by the mapping and de-duplication stages.
BATCH_SIZE = 512
SHIPS_LIST = [1000, 1001, 1003, 1004]

# %%
# START: load the raw CSV and keep only a handful of ships, standing in for
# the JSON payload that would normally arrive.
data_path = 'raw_data.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)

# Keep only the rows whose ship index is listed in SHIPS_LIST, then renumber
# the surviving rows from 0.
ship_mask = full_df['ships_idx'].isin(SHIPS_LIST)
df = full_df[ship_mask].reset_index(drop=True)

# Test parameters: process every selected row except the last one.
num_rows = len(df) - 1
df = df.iloc[:num_rows]
print(len(df))

# Pre-process the data.
abbreviator = Abbreviator(df)
df = abbreviator.run()

# %%
##########################################
# Mapping stage: load the checkpoint and predict a thing/property per row.
checkpoint_path = 'models/mapping_model'
mapper = Mapper(checkpoint_path)
mapper.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
thing_prediction_list, property_prediction_list = mapper.generate()

# Not enabled yet: decode the ground-truth labels as well.
# thing_actual_list, property_actual_list = decode_preds(pred_labels)

# Collect the predictions into their own frame, one column per output.
prediction_columns = {
    'p_thing': thing_prediction_list,
    'p_property': property_prediction_list,
}
df_out = pd.DataFrame(prediction_columns)

# Not enabled yet: per-row correctness flags against the labelled columns.
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']

# Attach the predictions column-wise. pd.concat(axis=1) pairs rows by index
# label, not by position.
# NOTE(review): this assumes Abbreviator.run() returns df with the same
# 0..n-1 index that the predictions frame gets -- confirm.
df = pd.concat([df, df_out], axis=1)

# %%
####################################
# De-duplication stage with thresholding, using the training set as
# reference.
data_path = "train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)

# The reference 'mapping' string is "<thing> <property>".
thing_column = train_df['thing']
property_column = train_df['property']
train_df['mapping'] = thing_column + " " + property_column

df = run_deduplication(
    test_df=df,
    train_df=train_df,
    batch_size=BATCH_SIZE,
    threshold=0.85,
    diagnostic=True,
)

# %%