Feat: added more classification and mapping variations

Feat: added grid-search for threshold in similarity-classifier
Feat: added more abbreviation rules
2024-11-25 18:15:28 +09:00 · 2024-11-25 18:15:28 +09:00 · ff6e11a3c0
parent 1f3970459f
commit ff6e11a3c0
43 changed files with 2905 additions and 50558 deletions
--- a/analysis/bert/find_closest_mapping_complete.py
+++ b/analysis/bert/find_closest_mapping_complete.py
@ -0,0 +1,293 @@
 # %%
 import pandas as pd
 from utils import Retriever, cosine_similarity_chunked
 import os
 import glob
 import numpy as np
 # %%
 # fold whose prediction export is analysed in this notebook
 fold = 5
 data_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
 df = pd.read_csv(data_path, skipinitialspace=True)
 # %%
 # subset to mdm
 # take an explicit copy: new columns are assigned below, and assigning onto a
 # filtered slice triggers pandas' SettingWithCopyWarning
 df = df[df['MDM']].copy()
 # create new fields 'mapping' and 'p_mapping'
 # these are analogous to 'pattern', where we combine 'thing' and 'property' without replacing the numbers
 df['mapping'] = df['thing'] + ' ' + df['property']
 df['p_mapping'] = df['p_thing'] + ' ' + df['p_property']
 # rows where the predicted 'thing' differs from the ground truth
 thing_condition = df['p_thing'] == df['thing']
 error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]
 # rows where the predicted 'property' differs from the ground truth
 property_condition = df['p_property'] == df['property']
 error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]
 # rows where both predictions are right
 correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]
 # the filtered prediction table is the "test" side of every later lookup
 test_df = df
 # %%
 # number of rows with a wrong 'thing' / a wrong 'property'
 print(len(error_thing_df))
 print(len(error_property_df))
 # %%
 # thing_df.to_html('thing_errors.html')
 # property_df.to_html('property_errors.html')
 ##########################################
 # what we need now is understand why the model is making these mispredictions
 # import train data and test data
 # %%
 class Embedder():
    """Build model embeddings from the description and unit columns of a dataframe."""
    # dataframe whose rows get embedded
    input_df: pd.DataFrame
    # declared here but never assigned inside this class
    fold: int
    def __init__(self, input_df):
        self.input_df = input_df
    def make_embedding(self, checkpoint_path):
        """Embed every row of ``self.input_df`` and return the embeddings on the CPU.

        Args:
            checkpoint_path: checkpoint location handed to ``Retriever``.

        Returns:
            ``Retriever.embeddings`` after being moved with ``.to('cpu')``.
        """
        def generate_input_list(df):
            # one tagged string per row: description first, then unit
            return [
                f"<DESC>{row['tag_description']}<DESC><UNIT>{row['unit']}<UNIT>"
                for _, row in df.iterrows()
            ]
        texts = generate_input_list(self.input_df)
        retriever = Retriever(texts, checkpoint_path)
        retriever.make_embedding(batch_size=64)
        return retriever.embeddings.to('cpu')
 # %%
 # load the training split of the same fold; its rows are what test rows are compared against
 data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 # ground-truth 'thing' and 'property' joined by one space, same construction as on the test side
 train_df['mapping'] = train_df['thing'] + ' ' + train_df['property']
 checkpoint_directory = "../../train/classification_bert_complete_desc_unit"
 directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
 # Use glob to find matching paths
 # path is usually checkpoint_fold_1/checkpoint-<step number>
 # we are guaranteed to save only 1 checkpoint from training
 # NOTE(review): the [0] below raises IndexError when nothing matches the pattern
 pattern = 'checkpoint-*'
 checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
 # embed train and test rows with the same checkpoint
 train_embedder = Embedder(input_df=train_df)
 train_embeds = train_embedder.make_embedding(checkpoint_path)
 test_embedder = Embedder(input_df=test_df)
 test_embeds = test_embedder.make_embedding(checkpoint_path)
 # %%
 # test embeds are inputs since we are looking back at train data
 # the matrix is later indexed as [test row selector, train row selector]
 cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy()
 # %%
 # the following function takes in a full cos_sim_matrix
 # condition_source: boolean selectors of the source embedding
 # condition_target: boolean selectors of the target embedding
 def find_closest(cos_sim_matrix, condition_source, condition_target, top_k=3):
    """Return the ``top_k`` most similar targets for every selected source row.

    Args:
        cos_sim_matrix: 2D similarity matrix indexed as [source, target].
        condition_source: boolean selector over the rows of the matrix.
        condition_target: boolean selector over the columns of the matrix.
        top_k: number of neighbours to keep per source row (default 3).

    Returns:
        Tuple ``(top_k_indices, top_k_values)``, both 2D with one row per
        selected source. Each row is ordered from most to least similar. The
        indices are column positions inside the selected target subset.
    """
    # subset both axes at once: rows by source selector, columns by target selector
    subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
    # argsort is ascending, so the best scores are the last top_k columns;
    # reverse every row (not only the first one) to get best-first order
    top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:][:, ::-1]
    # gather the scores that belong to those indices, row by row
    top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
    return top_k_indices, top_k_values
 ####################################################
 # special find-back code
 # %%
 def find_back_element_with_print(select_idx):
    """Print the nearest training rows of one test row and report a label match.

    Args:
        select_idx: index label of the row of ``test_df`` to inspect.

    Returns:
        True if the row's ground-truth 'mapping' string is contained in the
        'mapping' of at least one retrieved training row, otherwise False.

    Raises:
        IndexError: if ``select_idx`` is not a label of ``test_df.index``.
    """
    # Select exactly the requested row. Selecting by 'tag_description' could
    # pick up a different row that shares the description but has another unit
    # (and therefore another embedding), because only row 0 of the subset is
    # used below.
    condition_source = test_df.index == select_idx
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
    top_k_indices, top_k_values = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)
    # all training rows are candidates, so the returned column positions are
    # directly usable as positional indices into train_df
    nearest_train_rows = train_df.iloc[top_k_indices[0]]
    training_data_pattern_list = nearest_train_rows['mapping'].to_list()
    training_desc_list = nearest_train_rows['tag_description'].to_list()
    selected_row = test_df[condition_source]
    test_data_pattern_list = selected_row['mapping'].to_list()
    test_desc_list = selected_row['tag_description'].to_list()
    test_ship_id = selected_row['ships_idx'].to_list()[0]
    predicted_test_data = selected_row['p_mapping'].to_list()[0]
    print("*" * 80)
    print("idx:", select_idx)
    print("train desc", training_desc_list)
    print("train thing+property", training_data_pattern_list)
    print("test desc", test_desc_list)
    print("test thing+property", test_data_pattern_list)
    print("predicted thing+property", predicted_test_data)
    print("ships idx", test_ship_id)
    print("score:", top_k_values[0])
    test_pattern = test_data_pattern_list[0]
    # NOTE(review): `in` is substring containment between two strings, not equality
    return any(test_pattern in pattern for pattern in training_data_pattern_list)
 # %%
 def find_back_element(select_idx):
    """Report whether a test row's label shows up among its nearest training rows.

    Args:
        select_idx: index label of the row of ``test_df`` to inspect.

    Returns:
        True if the row's ground-truth 'mapping' string is contained in the
        'mapping' of at least one retrieved training row, otherwise False.

    Raises:
        IndexError: if ``select_idx`` is not a label of ``test_df.index``.
    """
    # Select exactly the requested row. Selecting by 'tag_description' could
    # pick up a different row that shares the description but has another unit
    # (and therefore another embedding), because only row 0 of the subset is
    # used below.
    condition_source = test_df.index == select_idx
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
    top_k_indices, _ = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)
    # all training rows are candidates, so column positions index train_df directly
    training_data_pattern_list = train_df.iloc[top_k_indices[0]]['mapping'].to_list()
    test_pattern = test_df[condition_source]['mapping'].to_list()[0]
    # NOTE(review): `in` is substring containment between two strings, not equality
    return any(test_pattern in pattern for pattern in training_data_pattern_list)
 # %%
 # for error thing
 # one boolean per wrong-'thing' row: was its true mapping found among the nearest training rows?
 pattern_in_train = []
 for select_idx in error_thing_df.index:
    result = find_back_element_with_print(select_idx)
    print("status:", result)
    pattern_in_train.append(result)
 # fraction of wrong-'thing' rows whose mapping was found back
 # NOTE(review): divides by zero when error_thing_df is empty
 sum(pattern_in_train)/len(pattern_in_train)
 ###
 # for error property
 # %%
 # same check for the rows with a wrong 'property'; the list is reused
 pattern_in_train = []
 for select_idx in error_property_df.index:
    result = find_back_element_with_print(select_idx)
    print("status:", result)
    pattern_in_train.append(result)
 # NOTE(review): divides by zero when error_property_df is empty
 sum(pattern_in_train)/len(pattern_in_train)
 ####################################################
 # %%
 # make function to compute similarity of closest retrieved result
 def compute_similarity(select_idx):
    """Return the mean cosine similarity between a test row and its nearest training rows.

    Args:
        select_idx: index label of the row of ``test_df`` to score.

    Returns:
        Mean of the similarity scores returned by ``find_closest`` for that row.

    Raises:
        IndexError: if ``select_idx`` is not a label of ``test_df.index``.
    """
    # Select exactly the requested row. Selecting by 'tag_description' could
    # score a different row that shares the description but has another unit.
    condition_source = test_df.index == select_idx
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
    _, top_k_values = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)
    return np.mean(top_k_values[0])
 # %%
 def print_summary(similarity_scores):
    """Print the mean and the 25th/50th/75th percentiles of the given scores.

    Args:
        similarity_scores: sequence of numeric similarity scores.
    """
    scores = np.array(similarity_scores)
    average = scores.mean()
    quartiles = np.percentile(scores, [25, 50, 75])
    print("Mean:", average)
    print("25th, 50th, 75th Percentiles:", quartiles)
 # %%
 ##########################################
 # Analyze the degree of similarity differences between correct and incorrect results
 # %%
 # compute similarity scores for all values in error_thing_df
 # mean nearest-neighbour similarity for every row with a wrong 'thing'
 similarity_thing_scores = []
 for idx in error_thing_df.index:
    similarity_thing_scores.append(compute_similarity(idx))
 print_summary(similarity_thing_scores)
 # %%
 # same for every row with a wrong 'property'
 similarity_property_scores = []
 for idx in error_property_df.index:
    similarity_property_scores.append(compute_similarity(idx))
 print_summary(similarity_property_scores)
 # %%
 # same for the rows where both predictions are right
 similarity_correct_scores = []
 for idx in correct_df.index:
    similarity_correct_scores.append(compute_similarity(idx))
 print_summary(similarity_correct_scores)
 # %%
 import matplotlib.pyplot as plt
 # List 1 = wrong 'thing', List 2 = wrong 'property', List 3 = fully correct
 list1 = similarity_thing_scores
 list2 = similarity_property_scores
 list3 = similarity_correct_scores
 # Plot histograms
 # density=True is passed, so the y axis is a normalised density rather than a raw count
 bins = 50
 plt.hist(list1, bins=bins, alpha=0.5, label='List 1', density=True)
 plt.hist(list2, bins=bins, alpha=0.5, label='List 2', density=True)
 plt.hist(list3, bins=bins, alpha=0.5, label='List 3', density=True)
 # Labels and legend
 plt.xlabel('Value')
 plt.ylabel('Frequency')
 plt.legend(loc='upper right')
 plt.title('Histograms of Three Lists')
 # Show plot
 plt.show()
 # %%
--- a/analysis/bert/realistic_labels.py
+++ b/analysis/bert/realistic_labels.py
@ -0,0 +1,320 @@
 # %%
 import pandas as pd
 from utils import Retriever, cosine_similarity_chunked
 import os
 import glob
 import numpy as np
 # %%
 # fold whose prediction export is analysed in this notebook
 fold = 5
 data_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
 df = pd.read_csv(data_path, skipinitialspace=True)
 # %%
 # subset to mdm
 # take an explicit copy: new columns are assigned below, and assigning onto a
 # filtered slice triggers pandas' SettingWithCopyWarning
 df = df[df['MDM']].copy()
 # create new fields 'mapping' and 'p_mapping'
 # these are analogous to 'pattern', where we combine 'thing' and 'property' without replacing the numbers
 df['mapping'] = df['thing'] + ' ' + df['property']
 df['p_mapping'] = df['p_thing'] + ' ' + df['p_property']
 # rows where the predicted 'thing' differs from the ground truth
 thing_condition = df['p_thing'] == df['thing']
 error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]
 # rows where the predicted 'property' differs from the ground truth
 property_condition = df['p_property'] == df['property']
 error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]
 # rows where both predictions are right
 correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]
 # the filtered prediction table is the "test" side of every later lookup
 test_df = df
 # %%
 # number of rows with a wrong 'thing' / a wrong 'property'
 print(len(error_thing_df))
 print(len(error_property_df))
 # %%
 # thing_df.to_html('thing_errors.html')
 # property_df.to_html('property_errors.html')
 ##########################################
 # what we need now is understand why the model is making these mispredictions
 # import train data and test data
 # %%
 class Embedder():
    """Build model embeddings from the description and unit columns of a dataframe."""
    # dataframe whose rows get embedded
    input_df: pd.DataFrame
    # declared here but never assigned inside this class
    fold: int
    def __init__(self, input_df):
        self.input_df = input_df
    def make_embedding(self, checkpoint_path):
        """Embed every row of ``self.input_df`` and return the embeddings on the CPU.

        Args:
            checkpoint_path: checkpoint location handed to ``Retriever``.

        Returns:
            ``Retriever.embeddings`` after being moved with ``.to('cpu')``.
        """
        def generate_input_list(df):
            # one tagged string per row: description first, then unit
            return [
                f"<DESC>{row['tag_description']}<DESC><UNIT>{row['unit']}<UNIT>"
                for _, row in df.iterrows()
            ]
        texts = generate_input_list(self.input_df)
        retriever = Retriever(texts, checkpoint_path)
        retriever.make_embedding(batch_size=64)
        return retriever.embeddings.to('cpu')
 # %%
 # load the training split of the same fold; its rows are what test rows are compared against
 data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 # ground-truth 'thing' and 'property' joined by one space, same construction as on the test side
 train_df['mapping'] = train_df['thing'] + ' ' + train_df['property']
 checkpoint_directory = "../../train/classification_bert_complete_desc_unit"
 directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
 # Use glob to find matching paths
 # path is usually checkpoint_fold_1/checkpoint-<step number>
 # we are guaranteed to save only 1 checkpoint from training
 # NOTE(review): the [0] below raises IndexError when nothing matches the pattern
 pattern = 'checkpoint-*'
 checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
 # embed train and test rows with the same checkpoint
 train_embedder = Embedder(input_df=train_df)
 train_embeds = train_embedder.make_embedding(checkpoint_path)
 test_embedder = Embedder(input_df=test_df)
 test_embeds = test_embedder.make_embedding(checkpoint_path)
 # %%
 # test embeds are inputs since we are looking back at train data
 # the matrix is later indexed as [test row selector, train row selector]
 cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy()
 # %%
 # the following function takes in a full cos_sim_matrix
 # condition_source: boolean selectors of the source embedding
 # condition_target: boolean selectors of the target embedding
 def find_closest(cos_sim_matrix, condition_source, condition_target, top_k=10):
    """Return the ``top_k`` most similar targets for every selected source row.

    Args:
        cos_sim_matrix: 2D similarity matrix indexed as [source, target].
        condition_source: boolean selector over the rows of the matrix.
        condition_target: boolean selector over the columns of the matrix.
        top_k: number of neighbours to keep per source row (default 10).

    Returns:
        Tuple ``(top_k_indices, top_k_values)``, both 2D with one row per
        selected source. Each row is ordered from most to least similar. The
        indices are column positions inside the selected target subset.
    """
    # subset both axes at once: rows by source selector, columns by target selector
    subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
    # argsort is ascending, so the best scores are the last top_k columns;
    # reverse every row (not only the first one) to get best-first order
    top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:][:, ::-1]
    # gather the scores that belong to those indices, row by row
    top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
    return top_k_indices, top_k_values
 ####################################################
 # special find-back code
 # %%
 def find_back_element_with_print(select_idx):
    """Print the nearest training rows of one test row and report a label match.

    Args:
        select_idx: index label of the row of ``test_df`` to inspect.

    Returns:
        True if the row's ground-truth 'mapping' string is contained in the
        'mapping' of at least one retrieved training row, otherwise False.

    Raises:
        IndexError: if ``select_idx`` is not a label of ``test_df.index``.
    """
    # Select exactly the requested row. Selecting by 'tag_description' could
    # pick up a different row that shares the description but has another unit
    # (and therefore another embedding), because only row 0 of the subset is
    # used below.
    condition_source = test_df.index == select_idx
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
    top_k_indices, top_k_values = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)
    # all training rows are candidates, so the returned column positions are
    # directly usable as positional indices into train_df
    nearest_train_rows = train_df.iloc[top_k_indices[0]]
    training_data_pattern_list = nearest_train_rows['mapping'].to_list()
    training_desc_list = nearest_train_rows['tag_description'].to_list()
    selected_row = test_df[condition_source]
    test_data_pattern_list = selected_row['mapping'].to_list()
    test_desc_list = selected_row['tag_description'].to_list()
    test_ship_id = selected_row['ships_idx'].to_list()[0]
    predicted_test_data = selected_row['p_mapping'].to_list()[0]
    print("*" * 80)
    print("idx:", select_idx)
    print("train desc", training_desc_list)
    print("train thing+property", training_data_pattern_list)
    print("test desc", test_desc_list)
    print("test thing+property", test_data_pattern_list)
    print("predicted thing+property", predicted_test_data)
    print("ships idx", test_ship_id)
    print("score:", top_k_values[0])
    test_pattern = test_data_pattern_list[0]
    # NOTE(review): `in` is substring containment between two strings, not equality
    return any(test_pattern in pattern for pattern in training_data_pattern_list)
 # %%
 def find_back_element(select_idx):
    """Report whether a test row's label shows up among its nearest training rows.

    Args:
        select_idx: index label of the row of ``test_df`` to inspect.

    Returns:
        True if the row's ground-truth 'mapping' string is contained in the
        'mapping' of at least one retrieved training row, otherwise False.

    Raises:
        IndexError: if ``select_idx`` is not a label of ``test_df.index``.
    """
    # Select exactly the requested row. Selecting by 'tag_description' could
    # pick up a different row that shares the description but has another unit
    # (and therefore another embedding), because only row 0 of the subset is
    # used below.
    condition_source = test_df.index == select_idx
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
    top_k_indices, _ = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)
    # all training rows are candidates, so column positions index train_df directly
    training_data_pattern_list = train_df.iloc[top_k_indices[0]]['mapping'].to_list()
    test_pattern = test_df[condition_source]['mapping'].to_list()[0]
    # NOTE(review): `in` is substring containment between two strings, not equality
    return any(test_pattern in pattern for pattern in training_data_pattern_list)
 # %%
 # for entire test df
 # one boolean per test row: was its true mapping found among the nearest training rows?
 pattern_in_train = []
 for select_idx in test_df.index:
    result = find_back_element(select_idx)
    # print("status:", result)
    pattern_in_train.append(result)
 # fraction of test rows whose mapping was found back
 sum(pattern_in_train)/len(pattern_in_train)
 # %%
 # within pattern in train, what is the "correct" rate?
 # the list of booleans is used as a row mask over test_df
 sub_df = test_df[pattern_in_train]
 result = sub_df['mapping'] == sub_df['p_mapping']
 # this is the realistic label result
 print(sum(result)/len(result)) # this is the more realistic result
 # %%
 # for pattern not in training data, what is the "correct" rate?
 # the mask is inverted here, so these are the rows that were NOT found back
 sub_df = test_df[~np.array(pattern_in_train)]
 result = sub_df['mapping'] == sub_df['p_mapping']
 # NOTE(review): divides by zero when every test row was found back
 print(sum(result)/len(result))
 # %%
 # for error thing
 # pattern_in_train is reused from here on and no longer lines up with test_df
 pattern_in_train = []
 for select_idx in error_thing_df.index:
    result = find_back_element_with_print(select_idx)
    print("status:", result)
    pattern_in_train.append(result)
 # fraction of wrong-'thing' rows whose mapping was found back
 sum(pattern_in_train)/len(pattern_in_train)
 ###
 # for error property
 # %%
 pattern_in_train = []
 for select_idx in error_property_df.index:
    result = find_back_element_with_print(select_idx)
    print("status:", result)
    pattern_in_train.append(result)
 # fraction of wrong-'property' rows whose mapping was found back
 sum(pattern_in_train)/len(pattern_in_train)
 ####################################################
 # %%
 # make function to compute similarity of closest retrieved result
 def compute_similarity(select_idx):
    """Return the mean cosine similarity between a test row and its nearest training rows.

    Args:
        select_idx: index label of the row of ``test_df`` to score.

    Returns:
        Mean of the similarity scores returned by ``find_closest`` for that row.

    Raises:
        IndexError: if ``select_idx`` is not a label of ``test_df.index``.
    """
    # Select exactly the requested row. Selecting by 'tag_description' could
    # score a different row that shares the description but has another unit.
    condition_source = test_df.index == select_idx
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
    _, top_k_values = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)
    return np.mean(top_k_values[0])
 # %%
 def print_summary(similarity_scores):
    """Print the mean and the 25th/50th/75th percentiles of the given scores.

    Args:
        similarity_scores: sequence of numeric similarity scores.
    """
    scores = np.array(similarity_scores)
    average = scores.mean()
    quartiles = np.percentile(scores, [25, 50, 75])
    print("Mean:", average)
    print("25th, 50th, 75th Percentiles:", quartiles)
 # %%
 ##########################################
 # Analyze the degree of similarity differences between correct and incorrect results
 # %%
 # compute similarity scores for all values in error_thing_df
 # mean nearest-neighbour similarity for every row with a wrong 'thing'
 similarity_thing_scores = []
 for idx in error_thing_df.index:
    similarity_thing_scores.append(compute_similarity(idx))
 print_summary(similarity_thing_scores)
 # %%
 # same for every row with a wrong 'property'
 similarity_property_scores = []
 for idx in error_property_df.index:
    similarity_property_scores.append(compute_similarity(idx))
 print_summary(similarity_property_scores)
 # %%
 # same for the rows where both predictions are right
 similarity_correct_scores = []
 for idx in correct_df.index:
    similarity_correct_scores.append(compute_similarity(idx))
 print_summary(similarity_correct_scores)
 # %%
 import matplotlib.pyplot as plt
 # List 1 = wrong 'thing', List 2 = wrong 'property', List 3 = fully correct
 list1 = similarity_thing_scores
 list2 = similarity_property_scores
 list3 = similarity_correct_scores
 # Plot histograms
 # density=True is passed, so the y axis is a normalised density rather than a raw count
 bins = 50
 plt.hist(list1, bins=bins, alpha=0.5, label='List 1', density=True)
 plt.hist(list2, bins=bins, alpha=0.5, label='List 2', density=True)
 plt.hist(list3, bins=bins, alpha=0.5, label='List 3', density=True)
 # Labels and legend
 plt.xlabel('Value')
 plt.ylabel('Frequency')
 plt.legend(loc='upper right')
 plt.title('Histograms of Three Lists')
 # Show plot
 plt.show()
 # %%
--- a/analysis/t5/find_closest.py
+++ b/analysis/t5/find_closest.py
@ -7,7 +7,7 @@ import glob
 import numpy as np
 # %%
-data_path = f'../data_preprocess/exports/preprocessed_data.csv'
+data_path = f'../../data_preprocess/exports/preprocessed_data.csv'
 df_pre = pd.read_csv(data_path, skipinitialspace=True)
 # %%
@ -18,8 +18,8 @@ desc_list = df_pre['tag_description'].to_list()
 [ elem for elem in desc_list if isinstance(elem, float)]
 ##########################################
 # %%
-fold = 1
+fold = 5
-data_path = f'../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
+data_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
 df = pd.read_csv(data_path, skipinitialspace=True)
 # %%
@ -74,10 +74,10 @@ class Embedder():
 # %%
-data_path = f"../data_preprocess/exports/dataset/group_{fold}/train.csv"
+data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
 train_df = pd.read_csv(data_path, skipinitialspace=True)
-checkpoint_directory = "../train/mapping_pattern"
+checkpoint_directory = "../../train/mapping_t5_complete_desc_unit"
 directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
 # Use glob to find matching paths
 # path is usually checkpoint_fold_1/checkpoint-<step number>
@ -199,12 +199,15 @@ for select_idx in error_thing_df.index:
    print("status:", result)
    pattern_in_train.append(result)
 # %%
 sum(pattern_in_train)/len(pattern_in_train)
 ###
 # for error property
 # %%
 pattern_in_train = []
 for select_idx in error_property_df.index:
-    result = find_back_element_with_print(select_idx)
+    result = find_back_element(select_idx)
    print("status:", result)
    pattern_in_train.append(result)
--- a/analysis/t5/find_closest_mapping_complete.py
+++ b/analysis/t5/find_closest_mapping_complete.py
@ -0,0 +1,334 @@
 # %%
 import pandas as pd
 from utils import Retriever, cosine_similarity_chunked
 import os
 import glob
 import numpy as np
 # %%
 # load the preprocessed export
 data_path = f'../../data_preprocess/exports/preprocessed_data.csv'
 df_pre = pd.read_csv(data_path, skipinitialspace=True)
 # %%
 # remove nulls or NAs
 # missing values and empty / whitespace-only strings both become the literal "NOVALUE"
 df_pre['tag_description'] = df_pre['tag_description'].fillna("NOVALUE")
 df_pre['tag_description'] = df_pre['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True)
 df_pre['unit'] = df_pre['unit'].fillna("NOVALUE")
 df_pre['unit'] = df_pre['unit'].replace(r'^\s*$', 'NOVALUE', regex=True)
 # %%
 # sanity check on the cleaned descriptions
 desc_list = df_pre['tag_description'].to_list()
 # check for floats
 # we have to eliminate presence of floats
 # notebook cell output: any description that is still a float after the cleaning above
 [ elem for elem in desc_list if isinstance(elem, float)]
 ##########################################
 # %%
 # fold whose prediction export is analysed in this notebook
 fold = 5
 data_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
 df = pd.read_csv(data_path, skipinitialspace=True)
 # %%
 # subset to mdm
 # take an explicit copy: new columns are assigned below, and assigning onto a
 # filtered slice triggers pandas' SettingWithCopyWarning
 df = df[df['MDM']].copy()
 # create new fields 'mapping' and 'p_mapping'
 # these are analogous to 'pattern', where we combine 'thing' and 'property' without replacing the numbers
 df['mapping'] = df['thing'] + ' ' + df['property']
 df['p_mapping'] = df['p_thing'] + ' ' + df['p_property']
 # rows where the predicted 'thing' differs from the ground truth
 thing_condition = df['p_thing'] == df['thing']
 error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]
 # rows where the predicted 'property' differs from the ground truth
 property_condition = df['p_property'] == df['property']
 error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]
 # rows where both predictions are right
 correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]
 # the filtered prediction table is the "test" side of every later lookup
 test_df = df
 # %%
 # thing_df.to_html('thing_errors.html')
 # property_df.to_html('property_errors.html')
 # number of rows with a wrong 'thing' / a wrong 'property'
 print(len(error_thing_df))
 print(len(error_property_df))
 ##########################################
 # what we need now is understand why the model is making these mispredictions
 # import train data and test data
 # %%
 class Embedder():
    """Build mean embeddings from the description and unit columns of a dataframe."""
    # dataframe whose rows get embedded
    input_df: pd.DataFrame
    # declared here but never assigned inside this class
    fold: int
    def __init__(self, input_df):
        self.input_df = input_df
    def make_embedding(self, checkpoint_path):
        """Embed every row of ``self.input_df`` and return the embeddings on the CPU.

        Args:
            checkpoint_path: checkpoint location handed to ``Retriever``.

        Returns:
            ``Retriever.embeddings`` after ``make_mean_embedding`` has run,
            moved with ``.to('cpu')``.
        """
        def generate_input_list(df):
            # one tagged string per row: description first, then unit
            # (the tag name is not part of the input)
            return [
                f"<DESC>{row['tag_description']}<DESC><UNIT>{row['unit']}<UNIT>"
                for _, row in df.iterrows()
            ]
        texts = generate_input_list(self.input_df)
        retriever = Retriever(texts, checkpoint_path)
        retriever.make_mean_embedding(batch_size=64)
        return retriever.embeddings.to('cpu')
 # %%
 # load the training split of the same fold; its rows are what test rows are compared against
 data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
 train_df = pd.read_csv(data_path, skipinitialspace=True)
 # ground-truth 'thing' and 'property' joined by one space, same construction as on the test side
 train_df['mapping'] = train_df['thing'] + ' ' + train_df['property']
 # remove NAs from train_df
 # missing values and empty / whitespace-only strings both become the literal "NOVALUE"
 train_df['tag_description'] = train_df['tag_description'].fillna("NOVALUE")
 train_df['tag_description'] = train_df['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True)
 train_df['unit'] = train_df['unit'].fillna("NOVALUE")
 train_df['unit'] = train_df['unit'].replace(r'^\s*$', 'NOVALUE', regex=True)
 checkpoint_directory = "../../train/mapping_t5_complete_desc_unit"
 directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
 # Use glob to find matching paths
 # path is usually checkpoint_fold_1/checkpoint-<step number>
 # we are guaranteed to save only 1 checkpoint from training
 # NOTE(review): the [0] below raises IndexError when nothing matches the pattern
 pattern = 'checkpoint-*'
 checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
 # embed train and test rows with the same checkpoint
 train_embedder = Embedder(input_df=train_df)
 train_embeds = train_embedder.make_embedding(checkpoint_path)
 test_embedder = Embedder(input_df=test_df)
 test_embeds = test_embedder.make_embedding(checkpoint_path)
 # %%
 # test embeds are inputs since we are looking back at train data
 # the matrix is later indexed as [test row selector, train row selector]
 cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy()
 # %%
 # the following function takes in a full cos_sim_matrix
 # condition_source: boolean selectors of the source embedding
 # condition_target: boolean selectors of the target embedding
 def find_closest(cos_sim_matrix, condition_source, condition_target, top_k=3):
    """Return the ``top_k`` most similar targets for every selected source row.

    Args:
        cos_sim_matrix: 2D similarity matrix indexed as [source, target].
        condition_source: boolean selector over the rows of the matrix.
        condition_target: boolean selector over the columns of the matrix.
        top_k: number of neighbours to keep per source row (default 3).

    Returns:
        Tuple ``(top_k_indices, top_k_values)``, both 2D with one row per
        selected source. Each row is ordered from most to least similar. The
        indices are column positions inside the selected target subset.
    """
    # subset both axes at once: rows by source selector, columns by target selector
    subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)]
    # argsort is ascending, so the best scores are the last top_k columns;
    # reverse every row (not only the first one) to get best-first order
    top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:][:, ::-1]
    # gather the scores that belong to those indices, row by row
    top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1)
    return top_k_indices, top_k_values
 # %%
 # notebook cell output: index labels of the rows whose predicted 'thing' is wrong
 error_thing_df.index
 ####################################################
 # special find-back code
 # %%
 def find_back_element_with_print(select_idx):
    """Print the nearest training rows of one test row and report a label match.

    Args:
        select_idx: index label of the row of ``test_df`` to inspect.

    Returns:
        True if the row's ground-truth 'mapping' string is contained in the
        'mapping' of at least one retrieved training row, otherwise False.

    Raises:
        IndexError: if ``select_idx`` is not a label of ``test_df.index``.
    """
    # Select exactly the requested row. Selecting by 'tag_description' could
    # pick up a different row that shares the description but has another unit
    # (and therefore another embedding), because only row 0 of the subset is
    # used below.
    condition_source = test_df.index == select_idx
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
    top_k_indices, top_k_values = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)
    # all training rows are candidates, so the returned column positions are
    # directly usable as positional indices into train_df
    nearest_train_rows = train_df.iloc[top_k_indices[0]]
    training_data_pattern_list = nearest_train_rows['mapping'].to_list()
    training_desc_list = nearest_train_rows['tag_description'].to_list()
    selected_row = test_df[condition_source]
    test_data_pattern_list = selected_row['mapping'].to_list()
    test_desc_list = selected_row['tag_description'].to_list()
    predicted_test_data = selected_row['p_mapping'].to_list()[0]
    print("*" * 80)
    print("idx:", select_idx)
    print("train desc", training_desc_list)
    print("train thing+property", training_data_pattern_list)
    print("test desc", test_desc_list)
    print("test thing+property", test_data_pattern_list)
    print("predicted thing+property", predicted_test_data)
    test_pattern = test_data_pattern_list[0]
    # NOTE(review): `in` is substring containment between two strings, not equality
    return any(test_pattern in pattern for pattern in training_data_pattern_list)
 # ad-hoc spot check on index label 0
 # NOTE(review): raises IndexError if label 0 is not in test_df (e.g. dropped by the MDM filter)
 find_back_element_with_print(0)
 # %%
 def find_back_element(select_idx):
    """Report whether a test row's label shows up among its nearest training rows.

    Args:
        select_idx: index label of the row of ``test_df`` to inspect.

    Returns:
        True if the row's ground-truth 'mapping' string is contained in the
        'mapping' of at least one retrieved training row, otherwise False.

    Raises:
        IndexError: if ``select_idx`` is not a label of ``test_df.index``.
    """
    # Select exactly the requested row. Selecting by 'tag_description' could
    # pick up a different row that shares the description but has another unit
    # (and therefore another embedding), because only row 0 of the subset is
    # used below.
    condition_source = test_df.index == select_idx
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
    top_k_indices, _ = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)
    # all training rows are candidates, so column positions index train_df directly
    training_data_pattern_list = train_df.iloc[top_k_indices[0]]['mapping'].to_list()
    test_pattern = test_df[condition_source]['mapping'].to_list()[0]
    # NOTE(review): `in` is substring containment between two strings, not equality
    return any(test_pattern in pattern for pattern in training_data_pattern_list)
 # ad-hoc spot check on index label 2884
 # NOTE(review): raises IndexError if label 2884 is not in test_df
 find_back_element(2884)
 # %%
 # for error thing
 # one boolean per wrong-'thing' row: was its true mapping found among the nearest training rows?
 pattern_in_train = []
 for select_idx in error_thing_df.index:
    result = find_back_element_with_print(select_idx)
    print("status:", result)
    pattern_in_train.append(result)
 # %%
 # fraction of wrong-'thing' rows whose mapping was found back
 # NOTE(review): divides by zero when error_thing_df is empty
 sum(pattern_in_train)/len(pattern_in_train)
 ###
 # for error property
 # %%
 # same check for the rows with a wrong 'property', using the non-printing variant
 pattern_in_train = []
 for select_idx in error_property_df.index:
    result = find_back_element(select_idx)
    print("status:", result)
    pattern_in_train.append(result)
 # %%
 # NOTE(review): divides by zero when error_property_df is empty
 sum(pattern_in_train)/len(pattern_in_train)
 ####################################################
 # %%
 # make function to compute similarity of closest retrieved result
 def compute_similarity(select_idx):
    """Return the mean cosine similarity between a test row and its nearest training rows.

    Args:
        select_idx: index label of the row of ``test_df`` to score.

    Returns:
        Mean of the similarity scores returned by ``find_closest`` for that row.

    Raises:
        IndexError: if ``select_idx`` is not a label of ``test_df.index``.
    """
    # Select exactly the requested row. Selecting by 'tag_description' could
    # score a different row that shares the description but has another unit.
    condition_source = test_df.index == select_idx
    condition_target = np.ones(train_embeds.shape[0], dtype=bool)
    _, top_k_values = find_closest(
        cos_sim_matrix=cos_sim_matrix,
        condition_source=condition_source,
        condition_target=condition_target)
    return np.mean(top_k_values[0])
 # %%
 def print_summary(similarity_scores):
    """Print the mean and the 25th/50th/75th percentiles of the given scores.

    Args:
        similarity_scores: sequence of numeric similarity scores.
    """
    scores = np.array(similarity_scores)
    average = scores.mean()
    quartiles = np.percentile(scores, [25, 50, 75])
    print("Mean:", average)
    print("25th, 50th, 75th Percentiles:", quartiles)
 # %%
 ##########################################
 # Analyze the degree of similarity differences between correct and incorrect results
 # %%
 # compute similarity scores for all values in error_thing_df
 # mean nearest-neighbour similarity for every row with a wrong 'thing'
 similarity_thing_scores = []
 for idx in error_thing_df.index:
    similarity_thing_scores.append(compute_similarity(idx))
 print_summary(similarity_thing_scores)
 # %%
 # same for every row with a wrong 'property'
 similarity_property_scores = []
 for idx in error_property_df.index:
    similarity_property_scores.append(compute_similarity(idx))
 print_summary(similarity_property_scores)
 # %%
 # same for the rows where both predictions are right
 similarity_correct_scores = []
 for idx in correct_df.index:
    similarity_correct_scores.append(compute_similarity(idx))
 print_summary(similarity_correct_scores)
 # %%
 import matplotlib.pyplot as plt
 # List 1 = wrong 'thing', List 2 = wrong 'property', List 3 = fully correct
 list1 = similarity_thing_scores
 list2 = similarity_property_scores
 list3 = similarity_correct_scores
 # Plot histograms
 # density=True is passed, so the y axis is a normalised density rather than a raw count
 bins = 50
 plt.hist(list1, bins=bins, alpha=0.5, label='List 1', density=True)
 plt.hist(list2, bins=bins, alpha=0.5, label='List 2', density=True)
 plt.hist(list3, bins=bins, alpha=0.5, label='List 3', density=True)
 # Labels and legend
 plt.xlabel('Value')
 plt.ylabel('Frequency')
 plt.legend(loc='upper right')
 plt.title('Histograms of Three Lists')
 # Show plot
 plt.show()
 ###########################################
 # %%
 # why do similarities of 97% still map correctly?
 # scores of the correctly mapped rows, in the same order as correct_df
 score_array = np.array(similarity_correct_scores)
 # %%
 # how many correctly mapped rows have a mean neighbour similarity below 0.95
 sum(score_array < 0.95)
 # %%
 # index labels of those rows; the boolean array is applied positionally to correct_df
 correct_df[score_array < 0.95]['tag_description'].index.to_list()
 # %%
--- a/data_preprocess/abbreviations/abbreviations_replacer.py
+++ b/data_preprocess/abbreviations/abbreviations_replacer.py
@ -5,7 +5,7 @@ Modified by: Richard Wong
 # %%
 import re
 import pandas as pd
-from replacement_dict import desc_replacement_dict, unit_replacement_dict
+from replacement_dict_new import desc_replacement_dict, unit_replacement_dict
 # %%
 def count_abbreviation_occurrences(tag_descriptions, abbreviation):
@ -48,20 +48,23 @@ df = pd.read_csv(file_path)
 # %%
 # Replace abbreviations
 print("running substitution for descriptions")
-df['tag_description']= df['tag_description'].fillna("NOVALUE")
+# normalize to uppercase
 # strip leading and trailing whitespace
 df['tag_description'] = df['tag_description'].str.strip()
 df['tag_description'] = df['tag_description'].str.upper()
 # Replace whitespace-only entries with "NOVALUE"
 # note that "N/A" can be read as nan
 # replace whitespace only values as NOVALUE
 df['tag_description']= df['tag_description'].fillna("NOVALUE")
 df['tag_description'] = df['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True)
 # perform actual substitution
 tag_descriptions = df['tag_description']
 replaced_descriptions = replace_abbreviations(tag_descriptions, desc_replacement_dict)
 replaced_descriptions = cleanup_spaces(replaced_descriptions)
 replaced_descriptions = cleanup_dots(replaced_descriptions)
 df["tag_description"] = replaced_descriptions
 # print("Descriptions after replacement:", replaced_descriptions)
 # strip trailing whitespace
 df['tag_description'] = df['tag_description'].str.rstrip()
 df['tag_description'] = df['tag_description'].str.upper()
 # %%
 print("running substitutions for units")
--- a/data_preprocess/abbreviations/replacement_dict.py
+++ b/data_preprocess/abbreviations/replacement_dict.py
@ -70,7 +70,8 @@ desc_replacement_dict = {
    r'\bD/G\b': 'GENERATOR_ENGINE',
    r'\bGEN\.\b': 'GENERATOR_ENGINE',
    r'\bGENERATOR ENGINE\b': 'GENERATOR_ENGINE',
-    r'\b(\d+)MGE\b': r'NO\1 GENERATOR_ENGINE',
+    # MGE?
    r'\b(\d+)MGE\b': r'NO\1 MAIN_GENERATOR_ENGINE',
    r'\bGEN\.WIND\.TEMP\b': 'GENERATOR WINDING TEMPERATURE',
    r'\bENGINE ROOM\b': 'ENGINE ROOM',
    r'\bE/R\b': 'ENGINE ROOM',
--- a/data_preprocess/abbreviations/replacement_dict_new.py
+++ b/data_preprocess/abbreviations/replacement_dict_new.py
@ -0,0 +1,291 @@
 # substitution mapping for descriptions
 # Abbreviations and their replacements
 #
 # Keys are regular-expression patterns; values are the replacement text
 # (values written as raw strings may use back-references such as \1).
 # Entries that map a term to itself (e.g. LIST -> LIST) leave the text unchanged.
 #
 # Regex reminders for maintaining this table:
 # - `\b` only matches between a word character and a non-word character, so a
 #   pattern ending in `\.\b` matches only when the dot is directly followed by
 #   a word character (e.g. "TEMP.IN" but not "TEMP. IN" or a trailing "TEMP.").
 # - an unescaped `.` matches ANY character, not just a literal dot.
 # NOTE(review): presumably the patterns are applied one after another in
 # insertion order, which would make the ordering of entries significant - confirm
 # against replace_abbreviations.
 desc_replacement_dict = {
    r'\bLIST\b': 'LIST',
    # exhaust gas
    r'\bE\. GAS\b': 'EXHAUST GAS',
    r'\bEXH\.\b': 'EXHAUST',
    r'\bEXH\b': 'EXHAUST',
    r'\bEXHAUST\.\b': 'EXHAUST',
    r'\bEXHAUST\b': 'EXHAUST',
    r'\bBLR\.EXH\.\b': 'BOILER EXHAUST',
    # temperature
    r'\bTEMP\.\b': 'TEMPERATURE',
    r'\bTEMP\b': 'TEMPERATURE',
    r'\bTEMPERATURE\.\b': 'TEMPERATURE',
    r'\bTEMPERATURE\b': 'TEMPERATURE',
    # cylinder
    r'\bCYL(\d+)\b': r'CYLINDER\1',
    r'\bCYL\.(\d+)\b': r'CYLINDER\1',
    r'\bCYL(?=\d|\W|$)\b': 'CYLINDER',
    r'\bCYL\.\b': 'CYLINDER',
    r'\bCYL\b': 'CYLINDER',
    # cooling
    r'\bCOOL\.\b': 'COOLING',
    r'\bCOOLING\b': 'COOLING',
    r'\bCOOLER\b': 'COOLER',
    r'\bCW\b': 'COOLING WATER',
    r'\bC\.W\b': 'COOLING WATER',
    r'\bJ\.C\.F\.W\b': 'JACKET COOLING FEED WATER',
    r'\bJ\.C F\.W\b': 'JACKET COOLING FEED WATER',
    r'\bJACKET C\.F\.W\b': 'JACKET COOLING FEED WATER',
    r'\bCOOL\. F\.W\b': 'COOLING FEED WATER',
    r'\bC\.F\.W\b': 'COOLING FEED WATER',
    # sea water
    r'\bC\.S\.W\b': 'COOLING SEA WATER',
    r'\bCSW\b': 'COOLING SEA WATER',
    # NOTE(review): the dots below are unescaped, so this matches C<any>S<any>W
    # (a superset of the escaped C\.S\.W entry above) - confirm this is intended
    r'\bC.S.W\b': 'COOLING SEA WATER',
    # water
    r'\bFEED W\.\b': 'FEED WATER',
    r'\bFEED W\b': 'FEED WATER',
    r'\bF\.W\b': 'FEED WATER',
    r'\bF\.W\.\b': 'FEED WATER',
    r'\bFW\b': 'FEED WATER',
    # r'\bWATER\b': 'WATER',
    r'\bSCAV\.\b': 'SCAVENGE',
    r'\bSCAV\b': 'SCAVENGE',
    r'\bINL\.\b': 'INLET',
    r'\bINLET\b': 'INLET',
    r'\bOUT\.\b': 'OUTLET',
    r'\bOUTL\.\b': 'OUTLET',
    r'\bOUTLET\b': 'OUTLET',
    # tank
    r'\bSTOR\.TK\b': 'STORAGE TANK',
    r'\bSTOR\. TK\b': 'STORAGE TANK',
    r'\bSERV\. TK\b': 'SERVICE TANK',
    r'\bSETT\. TK\b': 'SETTLING TANK',
    r'\bBK\b': 'BUNKER',
    r'\bTK\b': 'TANK',
    # PRESSURE
    r'\bPRESS\b': 'PRESSURE',
    r'\bPRESS\.\b': 'PRESSURE',
    r'\bPRESSURE\b': 'PRESSURE',
    r'PRS\b': 'PRESSURE',  # this is a special replacement - it is safe to replace PRS w/o checks
    # ENGINE
    r'\bENG\.\b': 'ENGINE',
    r'\bENG\b': 'ENGINE',
    r'\bENGINE\b': 'ENGINE',
    r'\bENGINE SPEED\b': 'ENGINE SPEED',
    r'\bENGINE RUNNING\b': 'ENGINE RUNNING',
    r'\bENGINE RPM PICKUP\b': 'ENGINE RPM PICKUP',
    r'\bENGINE ROOM\b': 'ENGINE ROOM',
    r'\bE/R\b': 'ENGINE ROOM',
    # MAIN ENGINE
    # NOTE(review): the dot in "NO." is unescaped in the patterns below, so any
    # single character is accepted between NO and the number
    r'\bM/E NO.(\d+)\b': r'NO\1 MAIN_ENGINE',
    r'\bM/E NO(\d+)\b': r'NO\1 MAIN_ENGINE',
    r'\bM/E  NO.(\d+)\b': r'NO\1 MAIN_ENGINE',
    r'\bME NO.(\d+)\b': r'NO\1 MAIN_ENGINE',
    r'\bM/E\b': 'MAIN_ENGINE',
    # NOTE(review): in the next two patterns `(.)` is a capture group for ANY single
    # character, not literal parentheses; e.g. the ME variant also matches a
    # three-letter word starting with ME - confirm against the intended M/E(S/P) form
    r'\bM/E(.)\b': r'MAIN_ENGINE \1', # M/E(S/P)
    r'\bME(.)\b': r'MAIN_ENGINE \1', # ME(S/P)
    r'\bM_E\b': 'MAIN_ENGINE',
    r'\bME(?=\d|\W|$)\b': 'MAIN_ENGINE',
    r'\bMAIN ENGINE\b': 'MAIN_ENGINE',
    # ENGINE variants
    r'\bM_E_RPM\b': 'MAIN ENGINE RPM',
    r'\bM/E_M\.G\.O\.\b': 'MAIN ENGINE MARINE GAS OIL',
    r'\bM/E_H\.F\.O\.\b': 'MAIN ENGINE HEAVY FUEL OIL',
    # GENERATOR ENGINE
    r'\bGEN(\d+)\b': r'NO\1 GENERATOR_ENGINE',
    r'\bGE(\d+)\b': r'NO\1 GENERATOR_ENGINE',
    # ensure that we substitute only for terms where following GE is num or special
    r'\bGE(?=\d|\W|$)\b': 'GENERATOR_ENGINE',
    r'\bG/E(\d+)\b': r'NO\1 GENERATOR_ENGINE',
    r'\bG/E\b': r'GENERATOR_ENGINE',
    r'\bG_E(\d+)\b': r'NO\1 GENERATOR_ENGINE',
    r'\bG_E\b': 'GENERATOR_ENGINE',
    r'\bGENERATOR ENGINE\b': 'GENERATOR_ENGINE',
    r'\bG/E_M\.G\.O\b': 'GENERATOR_ENGINE MARINE GAS OIL',
    # DG
    r'\bDG(\d+)\b': r'NO\1 GENERATOR_ENGINE',
    r'\bDG\b': 'GENERATOR_ENGINE',
    r'\bD/G\b': 'GENERATOR_ENGINE',
    r'\bDG(\d+)\((.)\)\b': r'NO\1\2 GENERATOR_ENGINE', # handle DG2(A)
    r'\bDG(\d+[A-Za-z])\b': r'NO\1 GENERATOR_ENGINE', # handle DG2A
    # DG variants
    r'\bDG_CURRENT\b': 'GENERATOR_ENGINE CURRENT',
    r'\bDG_LOAD\b': 'GENERATOR_ENGINE LOAD',
    r'\bDG_FREQUENCY\b': 'GENERATOR_ENGINE FREQUENCY',
    r'\bDG_VOLTAGE\b': 'GENERATOR_ENGINE VOLTAGE',
    r'\bDG_CLOSED\b': 'GENERATOR_ENGINE CLOSED',
    r'\bD/G_CURRENT\b': 'GENERATOR_ENGINE CURRENT',
    r'\bD/G_LOAD\b': 'GENERATOR_ENGINE LOAD',
    r'\bD/G_FREQUENCY\b': 'GENERATOR_ENGINE FREQUENCY',
    r'\bD/G_VOLTAGE\b': 'GENERATOR_ENGINE VOLTAGE',
    r'\bD/G_CLOSED\b': 'GENERATOR_ENGINE CLOSED',
    # MGE
    r'\b(\d+)MGE\b': r'NO\1 MAIN_GENERATOR_ENGINE',
    # generator engine and mgo
    r'\bG/E_M\.G\.O\.\b': r'GENERATOR_ENGINE MARINE GAS OIL',
    r'\bG/E_H\.F\.O\.\b': r'GENERATOR_ENGINE HEAVY FUEL OIL',
    # ultra low sulfur fuel oil
    r'\bU\.L\.S\.F\.O\b': 'ULTRA LOW SULFUR FUEL OIL',
    r'\bULSFO\b': 'ULTRA LOW SULFUR FUEL OIL',
    # marine gas oil
    r'\bM\.G\.O\b': 'MARINE GAS OIL',
    r'\bMGO\b': 'MARINE GAS OIL',
    r'\bMDO\b': 'MARINE DIESEL OIL',
    # light fuel oil
    r'\bL\.F\.O\b': 'LIGHT FUEL OIL',
    r'\bLFO\b': 'LIGHT FUEL OIL',
    # heavy fuel oil
    r'\bHFO\b': 'HEAVY FUEL OIL',
    r'\bH\.F\.O\b': 'HEAVY FUEL OIL',
    # piston cooling oil
    r'\bPCO\b': 'PISTON COOLING OIL',
    r'\bP\.C\.O\.\b': 'PISTON COOLING OIL',
    r'\bP\.C\.O\b': 'PISTON COOLING OIL',
    # matches the misspelling "PISTION" as it appears in the pattern; the dot is unescaped
    r'PISTION C.O': 'PISTON COOLING OIL',
    # diesel oil
    # NOTE(review): unescaped dot - matches D<any single character>O, not only "D.O"
    r'\bD.O\b': 'DIESEL OIL',
    # for remaining fuel oil that couldn't be substituted
    r'\bF\.O\b': 'FUEL OIL',
    r'\bFO\b': 'FUEL OIL',
    # lubricant
    r'\bLUB\.\b': 'LUBRICANT',
    r'\bLUBE\b': 'LUBRICANT',
    r'\bLUBR\.\b': 'LUBRICANT',
    r'\bLUBRICATING\.\b': 'LUBRICANT',
    r'\bLUBRICATION\.\b': 'LUBRICANT',
    # lubricating oil
    r'\bL\.O\b': 'LUBRICATING OIL',
    r'\bLO\b': 'LUBRICATING OIL',
    # lubricating oil pressure
    r'\bLO_PRESS\b': 'LUBRICATING OIL PRESSURE',
    r'\bLO_PRESSURE\b': 'LUBRICATING OIL PRESSURE',
    # temperature
    r'\bL\.T\b': 'LOW TEMPERATURE',
    r'\bLT\b': 'LOW TEMPERATURE',
    r'\bH\.T\b': 'HIGH TEMPERATURE',
    r'\bHT\b': 'HIGH TEMPERATURE',
    # BOILER
    # auxiliary boiler
    # replace these first before replacing AUXILIARY only
    r'\bAUX\.BOILER\b': 'AUXILIARY BOILER',
    r'\bAUX\. BOILER\b': 'AUXILIARY BOILER',
    r'\bAUX BLR\b': 'AUXILIARY BOILER',
    r'\bAUX\.\b': 'AUXILIARY',
    r'\bAUX\b': 'AUXILIARY',
    # composite boiler
    r'\bCOMP\. BOILER\b': 'COMPOSITE BOILER',
    r'\bCOMP\.BOILER\b': 'COMPOSITE BOILER',
    r'\bCOMP BOILER\b': 'COMPOSITE BOILER',
    r'\bCOMP\b': 'COMPOSITE',
    r'\bCMPS\b': 'COMPOSITE',
    # any other boiler
    r'\bBLR\.\b': 'BOILER',
    r'\bBLR\b': 'BOILER',
    r'\bBOILER W.CIRC.P/P\b': 'BOILER WATER CIRC P/P',
    # winding
    r'\bWIND\.\b': 'WINDING',
    r'\bWINDING\b': 'WINDING',
    # VOLTAGE/FREQ/CURRENT
    r'\bVLOT\.': 'VOLTAGE', # correct spelling
    r'\bVOLT\.': 'VOLTAGE',
    r'\bVOLTAGE\b': 'VOLTAGE',
    r'\bFREQ\.': 'FREQUENCY',
    r'\bFREQUENCY\b': 'FREQUENCY',
    r'\bCURR\.': 'CURRENT',
    r'\bCURRENT\b': 'CURRENT',
    # TURBOCHARGER
    r'\bTCA\b': 'TURBOCHARGER',
    r'\bTCB\b': 'TURBOCHARGER',
    r'\bT/C\b': 'TURBOCHARGER',
    r'\bT_C\b': 'TURBOCHARGER',
    r'\bT/C_RPM\b': 'TURBOCHARGER RPM',
    r'\bTC(\d+)\b': r'TURBOCHARGER\1',
    r'\bT/C(\d+)\b': r'TURBOCHARGER\1',
    r'\bTC(?=\d|\W|$)\b': 'TURBOCHARGER',
    r'\bTURBOCHAGER\b': 'TURBOCHARGER',
    r'\bTURBOCHARGER\b': 'TURBOCHARGER',
    r'\bTURBOCHG\b': 'TURBOCHARGER',
    # misc spelling errors
    r'\bOPERATOIN\b': 'OPERATION',
    # wrongly attached terms
    r'\bBOILERMGO\b': 'BOILER MGO',
    # additional standardizing replacement
    # replace # followed by a number with NO
    r'#(?=\d)\b': 'NO',
    r'\bNO\.(?=\d)\b': 'NO',
    r'\bNO\.\.(?=\d)\b': 'NO',
    # others:
    # generator
    r'\bGEN\.\b': 'GENERATOR',
    # others
    # NOTE(review): if patterns are applied in insertion order, the GEN\., WIND\.
    # and TEMP\. entries above will already have rewritten "GEN.WIND.TEMP" before
    # this entry is reached - confirm whether it can still match
    r'\bGEN\.WIND\.TEMP\b': 'GENERATOR WINDING TEMPERATURE',
    r'\bFLTR\b': 'FILTER',
    r'\bCLR\b': 'CLEAR',
 }
 # substitution mapping for units
 # Abbreviations and their replacements
 #
 # Keys are regular-expression patterns; values are the replacement text.
 # `\b` only matches between a word character and a non-word character, so it
 # must not be placed on the outer side of a symbol such as %, -, ° , ℃ or ㎠:
 # a unit consisting of just that symbol would then never match.
 unit_replacement_dict = {
    # no \b around the symbol, so that a bare "%" is recognised
    r'%': 'PERCENT',
    # a unit that is nothing but a dash (presumably a "no unit" placeholder) is
    # blanked; the pattern is anchored to the whole string so that dashes inside
    # other units (e.g. min-¹) stay intact for their own patterns further down
    r'^\s*-\s*$': '',
    r'\b-  \b': '',
    # ensure no character after A
    r'\bA(?!\w|/)': 'CURRENT',
    r'\bAmp(?!\w|/)': 'CURRENT',
    r'\bHz\b': 'HERTZ',
    r'\bKG/CM2\b': 'PRESSURE',
    r'\bKG/H\b': 'KILOGRAM PER HOUR',
    # NOTE(review): upper-case KNm maps to RPM while kNm below maps to TORQUE - confirm
    r'\bKNm\b': 'RPM',
    r'\bKW\b': 'POWER',
    r'\bKg(?!\w|/)': 'MASS',
    r'\bKw\b': 'POWER',
    r'\bL(?!\w|/)': 'VOLUME',
    r'\bMT/h\b': 'METRIC TONNES PER HOUR',
    r'\bMpa\b': 'PRESSURE',
    r'\bPF\b': 'POWER FACTOR',
    r'\bRPM\b': 'RPM',
    r'\bV(?!\w|/)': 'VOLTAGE',
    r'\bbar(?!\w|/)': 'PRESSURE',
    r'\bbarA\b': 'SCAVENGE PRESSURE',
    r'\bcST\b': 'VISCOSITY',
    r'\bcSt\b': 'VISCOSITY',
    r'\bcst\b': 'VISCOSITY',
    r'\bdeg(?!\w|/|\.)': 'DEGREE',
    r'\bdeg.C\b': 'TEMPERATURE',
    r'\bdegC\b': 'TEMPERATURE',
    r'\bdegree\b': 'DEGREE',
    r'\bdegreeC\b': 'TEMPERATURE',
    r'\bhPa\b': 'PRESSURE',
    r'\bhours\b': 'HOURS',
    r'\bkN\b': 'THRUST',
    r'\bkNm\b': 'TORQUE',
    r'\bkW\b': 'POWER',
    # ensure that kg is not followed by anything
    r'\bkg(?!\w|/)': 'FLOW', # somehow in the data its flow
    r'\bkg/P\b': 'MASS FLOW',
    r'\bkg/cm2\b': 'PRESSURE',
    r'\bkg/cm²\b': 'PRESSURE',
    r'\bkg/h\b': 'MASS FLOW',
    r'\bkg/hr\b': 'MASS FLOW',
    r'\bkg/pulse\b': '',
    r'\bkgf/cm2\b': 'PRESSURE',
    r'\bkgf/cm²\b': 'PRESSURE',
    # ㎠ is a symbol, not a word character: no trailing \b so it matches at end of string
    r'\bkgf/㎠': 'PRESSURE',
    r'\bknots\b': 'SPEED',
    r'\bkw\b': 'POWER',
    r'\bl/Hr\b': 'VOLUME FLOW',
    r'\bl/h\b': 'VOLUME FLOW',
    r'\bl_Hr\b': 'VOLUME FLOW',
    r'\bl_hr\b': 'VOLUME FLOW',
    r'\bM\b': 'DRAFT', # for wind draft
    r'm': 'm', # wind draft and trim - not useful
    r'\bm/s\b': 'SPEED',
    r'\bm3\b': 'VOLUME',
    r'\bmH2O\b': 'DRAFT',
    r'\bmWC\b': 'DRAFT',
    r'\bmbar\b': 'PRESSURE',
    r'\bmg\b': 'ACCELERATION',
    r'\bmin-¹\b': '', # data too varied
    r'\bmm\b': '', # data too varied
    r'\bmmH2O\b': 'WATER DRUM LEVEL',
    r'\brev\b': 'RPM',
    r'\brpm\b': 'RPM',
    r'\bx1000min-¹\b': '',
    # ° and ℃ are symbols, not word characters: no leading \b in front of them
    r'°C\b': 'TEMPERATURE',
    r'\bºC\b': 'TEMPERATURE',
    r'℃': 'TEMPERATURE'
 }
--- a/data_preprocess/check_data/.gitignore
+++ b/data_preprocess/check_data/.gitignore
@ -0,0 +1 @@
 *.csv
--- a/data_preprocess/check_data/check.py
+++ b/data_preprocess/check_data/check.py
@ -53,6 +53,17 @@ with open('output.txt', 'w') as file:
 # %%
-test = 'kg/cm3'
+test = 'M/E(S) something'
-print(re.sub(r'kg(?!\w|/)', 'flow', test))
+print(re.sub(r'\bM/E(.)', r'MAINE ENGINE \1', test))
 # %%
 test = 'NO.345A ENGINE'
 print(re.sub(r'\bNO\.(?=\d)\b', r'NO', test))
 # %%
 test = 'S/G VLOT.'
 print(re.sub(r'VLOT\.', 'VOLT', test))
 # %%
 description = 'NO3 GENERATOR WINDING TEMPERATURE(T)'
 re.sub(r'\s+', ' ', description)
--- a/data_preprocess/check_data/desc.csv
+++ b/data_preprocess/check_data/desc.csv
--- a/post_process/binary_classifier/classification_prediction/output.txt
+++ b/post_process/binary_classifier/classification_prediction/output.txt
@ -1,31 +1,31 @@
 ********************************************************************************
 Fold: 1
-Accuracy: 0.95342
+Accuracy: 0.95174
-F1 Score: 0.91344
+F1 Score: 0.90912
-Precision: 0.91643
+Precision: 0.91788
-Recall: 0.91052
+Recall: 0.90092
 ********************************************************************************
 Fold: 2
-Accuracy: 0.95402
+Accuracy: 0.95159
-F1 Score: 0.92950
+F1 Score: 0.92593
-Precision: 0.92122
+Precision: 0.91697
-Recall: 0.93848
+Recall: 0.93574
 ********************************************************************************
 Fold: 3
-Accuracy: 0.95200
+Accuracy: 0.95373
-F1 Score: 0.92726
+F1 Score: 0.93021
-Precision: 0.91825
+Precision: 0.91935
-Recall: 0.93712
+Recall: 0.94233
 ********************************************************************************
 Fold: 4
-Accuracy: 0.96473
+Accuracy: 0.96524
-F1 Score: 0.92708
+F1 Score: 0.92902
-Precision: 0.91566
+Precision: 0.91306
-Recall: 0.93950
+Recall: 0.94702
 ********************************************************************************
 Fold: 5
-Accuracy: 0.95605
+Accuracy: 0.95643
-F1 Score: 0.92244
+F1 Score: 0.92319
-Precision: 0.91755
+Precision: 0.91793
-Recall: 0.92754
+Recall: 0.92869
--- a/post_process/binary_classifier/classification_prediction/predict.py
+++ b/post_process/binary_classifier/classification_prediction/predict.py
@ -98,7 +98,7 @@ def test(fold):
    # %%
-    max_length = 64
+    max_length = 128
    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
--- a/post_process/binary_classifier/train.py
+++ b/post_process/binary_classifier/train.py
@ -74,6 +74,15 @@ def create_split_dataset(fold):
    full_df = pd.read_csv(data_path, skipinitialspace=True)
    train_df = full_df[~full_df['ships_idx'].isin(ships_list)]
    train_ships_list = sorted(list(set(train_df['ships_idx'])))
    train_ships_set = set(train_ships_list)
    test_ships_set = set(ships_list)
    # assertion for non data leakage
    assert not set(train_ships_set).intersection(test_ships_set)
    # valid
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
    validation_df = pd.read_csv(data_path, skipinitialspace=True)
--- a/post_process/similarity_classifier/output.txt
+++ b/post_process/similarity_classifier/output.txt
@ -0,0 +1,31 @@
 Fold: 1
 Best threshold: 0.9775
 Accuracy: 0.92512
 F1 Score: 0.76313
 Precision: 0.78069
 Recall: 0.74633
 Fold: 2
 Best threshold: 0.9775
 Accuracy: 0.92054
 F1 Score: 0.81117
 Precision: 0.77150
 Recall: 0.85514
 Fold: 3
 Best threshold: 0.985
 Accuracy: 0.93201
 F1 Score: 0.83578
 Precision: 0.81657
 Recall: 0.85592
 Fold: 4
 Best threshold: 0.9924999999999999
 Accuracy: 0.95334
 F1 Score: 0.82722
 Precision: 0.83341
 Recall: 0.82112
 Fold: 5
 Best threshold: 0.9924999999999999
 Accuracy: 0.92968
 F1 Score: 0.77680
 Precision: 0.83395
 Recall: 0.72698
--- a/post_process/similarity_classifier/run.py
+++ b/post_process/similarity_classifier/run.py
@ -50,7 +50,8 @@ class Embedder():
            for _, row in df.iterrows():
                desc = f"<DESC>{row['tag_description']}<DESC>"
                unit = f"<UNIT>{row['unit']}<UNIT>"
-                element = f"{desc}{unit}"
+                name = f"<NAME>{row['tag_name']}<NAME"
                element = f"{name}{desc}{unit}"
                input_list.append(element)
            return input_list
@ -64,7 +65,7 @@ class Embedder():
 def run_similarity_classifier(fold):
-    data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
+    data_path = f'../../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv'
    test_df = pd.read_csv(data_path, skipinitialspace=True)
@ -72,7 +73,7 @@ def run_similarity_classifier(fold):
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)
-    checkpoint_directory = "../../train/classification_bert_complete_desc_unit"
+    checkpoint_directory = "../../train/classification_bert_complete_desc_unit_name"
    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
    # Use glob to find matching paths
    # path is usually checkpoint_fold_1/checkpoint-<step number>
@ -109,26 +110,54 @@ def run_similarity_classifier(fold):
        sim_list.append(top_sim_value)
    # analysis 1: using threshold to perform find-back prediction success
-    threshold = 0.90
+    threshold_values = np.linspace(0.85, 1.00, 21) # test 20 values, 21 to get nice round numbers
-    predict_list = [ elem > threshold for elem in sim_list ]
+    best_threshold = 0
    best_f1 = 0
    for threshold in threshold_values:
        predict_list = [ elem > threshold for elem in sim_list ]
        y_true = test_df['MDM'].to_list()
        y_pred = predict_list
        # Compute metrics
        accuracy = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        if f1 > best_f1:
            best_threshold = threshold
            best_f1 = f1
    # compute metrics again with best threshold
    predict_list = [ elem > best_threshold for elem in sim_list ]
    y_true = test_df['MDM'].to_list()
    y_pred = predict_list
    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
-    # Print the results
+
-    print(f'Accuracy: {accuracy:.5f}')
+
-    print(f'F1 Score: {f1:.5f}')
+    with open("output.txt", "a") as f:
-    print(f'Precision: {precision:.5f}')
+
-    print(f'Recall: {recall:.5f}')
+        print(f'Fold: {fold}', file=f)
        print(f'Best threshold: {best_threshold}', file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)
 # %%
 # start from a fresh results file (a single blank line); each fold appends to it
 with open("output.txt", "w") as out_file:
    out_file.write('\n')
 for fold in range(1, 6):
    print(fold)
    run_similarity_classifier(fold)
--- a/train/classification_bert_complete_desc/classification_prediction/output.txt
+++ b/train/classification_bert_complete_desc/classification_prediction/output.txt
@ -1,31 +1,31 @@
 ********************************************************************************
 Fold: 1
-Accuracy: 0.76337
+Accuracy: 0.78277
-F1 Score: 0.37980
+F1 Score: 0.73629
-Precision: 0.36508
+Precision: 0.71419
-Recall: 0.41523
+Recall: 0.78277
 ********************************************************************************
 Fold: 2
-Accuracy: 0.77430
+Accuracy: 0.78598
-F1 Score: 0.40473
+F1 Score: 0.73708
-Precision: 0.39528
+Precision: 0.71578
-Recall: 0.43303
+Recall: 0.78598
 ********************************************************************************
 Fold: 3
-Accuracy: 0.77259
+Accuracy: 0.79819
-F1 Score: 0.39538
+F1 Score: 0.74411
-Precision: 0.37761
+Precision: 0.71749
-Recall: 0.43633
+Recall: 0.79819
 ********************************************************************************
 Fold: 4
-Accuracy: 0.77545
+Accuracy: 0.79543
-F1 Score: 0.39792
+F1 Score: 0.73902
-Precision: 0.38636
+Precision: 0.71094
-Recall: 0.43003
+Recall: 0.79543
 ********************************************************************************
 Fold: 5
-Accuracy: 0.74897
+Accuracy: 0.77279
-F1 Score: 0.38827
+F1 Score: 0.72098
-Precision: 0.37680
+Precision: 0.69817
-Recall: 0.42382
+Recall: 0.77279
--- a/train/classification_bert_complete_desc/classification_prediction/predict.py
+++ b/train/classification_bert_complete_desc/classification_prediction/predict.py
@ -27,6 +27,9 @@ from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 256
 # %%
 # we need to create the mdm_list
@ -185,7 +188,6 @@ def test(fold):
    actual_labels = []
    BATCH_SIZE = 64
    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
            # Inference in batches
@ -217,9 +219,11 @@ def test(fold):
    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
-    f1 = f1_score(y_true, y_pred, average='macro')
+    average_parameter = 'weighted'
-    precision = precision_score(y_true, y_pred, average='macro')
+    zero_division_parameter = 0
-    recall = recall_score(y_true, y_pred, average='macro')
+    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    with open("output.txt", "a") as f:
--- a/train/classification_bert_complete_desc/train.py
+++ b/train/classification_bert_complete_desc/train.py
@ -57,7 +57,7 @@ for idx, val in enumerate(mdm_list):
 def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
-        desc = f"{row['tag_description']}"
+        desc = f"<DESC>{row['tag_description']}<DESC>"
        pattern = f"{row['thing'] + row['property']}"
        try:
            index = mdm_list.index(pattern)
@ -100,7 +100,7 @@ def train(fold):
    # prepare tokenizer
    # model_checkpoint = "distilbert/distilbert-base-uncased"
-    model_checkpoint = 'google-bert/bert-base-uncased'
+    model_checkpoint = 'google-bert/bert-base-cased'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
@ -177,8 +177,8 @@ def train(fold):
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-5,
-        per_device_train_batch_size=64,
+        per_device_train_batch_size=128,
-        per_device_eval_batch_size=64,
+        per_device_eval_batch_size=128,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
--- a/train/classification_bert_complete_desc_unit/classification_prediction/output.txt
+++ b/train/classification_bert_complete_desc_unit/classification_prediction/output.txt
@ -1,31 +1,31 @@
 ********************************************************************************
 Fold: 1
-Accuracy: 0.77946
+Accuracy: 0.78940
-F1 Score: 0.40686
+F1 Score: 0.73284
-Precision: 0.39833
+Precision: 0.70389
-Recall: 0.43814
+Recall: 0.78940
 ********************************************************************************
 Fold: 2
-Accuracy: 0.78271
+Accuracy: 0.78411
-F1 Score: 0.42730
+F1 Score: 0.73695
-Precision: 0.42002
+Precision: 0.71914
-Recall: 0.45670
+Recall: 0.78411
 ********************************************************************************
 Fold: 3
-Accuracy: 0.78715
+Accuracy: 0.80522
-F1 Score: 0.41108
+F1 Score: 0.75406
-Precision: 0.39829
+Precision: 0.72847
-Recall: 0.44992
+Recall: 0.80522
 ********************************************************************************
 Fold: 4
-Accuracy: 0.79115
+Accuracy: 0.80780
-F1 Score: 0.41810
+F1 Score: 0.75361
-Precision: 0.40095
+Precision: 0.72432
-Recall: 0.45760
+Recall: 0.80780
 ********************************************************************************
 Fold: 5
-Accuracy: 0.76271
+Accuracy: 0.76958
-F1 Score: 0.41752
+F1 Score: 0.71912
-Precision: 0.41156
+Precision: 0.69965
-Recall: 0.44899
+Recall: 0.76958
--- a/train/classification_bert_complete_desc_unit/classification_prediction/predict.py
+++ b/train/classification_bert_complete_desc_unit/classification_prediction/predict.py
@ -27,6 +27,9 @@ from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 128
 # %%
 # we need to create the mdm_list
@ -123,7 +126,7 @@ def test(fold):
    # %%
-    max_length = 64
+    max_length = 128
    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
@ -185,7 +188,6 @@ def test(fold):
    actual_labels = []
    BATCH_SIZE = 64
    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
            # Inference in batches
@ -217,9 +219,13 @@ def test(fold):
    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
-    f1 = f1_score(y_true, y_pred, average='macro')
+    average_parameter = 'weighted'
-    precision = precision_score(y_true, y_pred, average='macro')
+    zero_division_parameter = 0
-    recall = recall_score(y_true, y_pred, average='macro')
+    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    with open("output.txt", "a") as f:
--- a/train/classification_bert_complete_desc_unit/train.py
+++ b/train/classification_bert_complete_desc_unit/train.py
@ -57,9 +57,9 @@ for idx, val in enumerate(mdm_list):
 def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
-        desc = f"{row['tag_description']}"
+        desc = f"<DESC>{row['tag_description']}<DESC>"
        pattern = f"{row['thing'] + row['property']}"
        unit = f"<UNIT>{row['unit']}<UNIT>"
        pattern = f"{row['thing'] + row['property']}"
        try:
            index = mdm_list.index(pattern)
        except ValueError:
@ -101,7 +101,7 @@ def train(fold):
    # prepare tokenizer
    # model_checkpoint = "distilbert/distilbert-base-uncased"
-    model_checkpoint = 'google-bert/bert-base-uncased'
+    model_checkpoint = 'google-bert/bert-base-cased'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
@ -178,8 +178,8 @@ def train(fold):
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-5,
-        per_device_train_batch_size=64,
+        per_device_train_batch_size=128,
-        per_device_eval_batch_size=64,
+        per_device_eval_batch_size=128,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
--- a/train/classification_bert_complete_desc_unit_name/.gitignore
+++ b/train/classification_bert_complete_desc_unit_name/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/train/classification_bert_complete_desc_unit_name/classification_prediction/output.txt
+++ b/train/classification_bert_complete_desc_unit_name/classification_prediction/output.txt
@ -0,0 +1,31 @@
 ********************************************************************************
 Fold: 1
 Accuracy: 0.68859
 F1 Score: 0.62592
 Precision: 0.60775
 Recall: 0.68859
 ********************************************************************************
 Fold: 2
 Accuracy: 0.72150
 F1 Score: 0.65739
 Precision: 0.63652
 Recall: 0.72150
 ********************************************************************************
 Fold: 3
 Accuracy: 0.72038
 F1 Score: 0.65781
 Precision: 0.63249
 Recall: 0.72038
 ********************************************************************************
 Fold: 4
 Accuracy: 0.74167
 F1 Score: 0.68167
 Precision: 0.65489
 Recall: 0.74167
 ********************************************************************************
 Fold: 5
 Accuracy: 0.67705
 F1 Score: 0.61273
 Precision: 0.59472
 Recall: 0.67705
--- a/train/classification_bert_complete_desc_unit_name/classification_prediction/predict.py
+++ b/train/classification_bert_complete_desc_unit_name/classification_prediction/predict.py
@ -0,0 +1,248 @@
 # %%
 # from datasets import load_from_disk
 import os
 import glob
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 from tqdm import tqdm
 torch.set_float32_matmul_precision('high')
 BATCH_SIZE = 256
 # %%
 # Build the label vocabulary from the full MDM-only export.
 data_path = '../../../data_import/exports/data_mapping_mdm.csv'
 full_df = pd.read_csv(data_path, skipinitialspace=True)
 # A class is the real "thing" and "property" strings joined with no
 # separator (the 'pattern' column is deliberately not used here).
 thing_property = (full_df['thing'] + full_df['property']).to_list()
 # de-duplicate, then sort so the class order is reproducible
 mdm_list = sorted(set(thing_property))
 # %%
 # index <-> class-name lookup tables passed to the model below
 id2label = dict(enumerate(mdm_list))
 label2id = {val: idx for idx, val in id2label.items()}
 # %%
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df, mdm_list):
    """Turn tag rows into ``{'text', 'label'}`` records for the classifier.

    Args:
        df: dataframe with the columns 'tag_name', 'tag_description',
            'unit', 'thing' and 'property'.
        mdm_list: list of class names ("thing" + "property" strings); the
            position of a name in this list is its class label.

    Returns:
        One dict per row: 'text' is the tagged name/description/unit
        string, 'label' is the class index, or -1 when the row's class is
        not present in ``mdm_list``.
    """
    # Build the name -> index table once instead of scanning mdm_list for
    # every row. setdefault keeps the first position of a repeated name,
    # which is what list.index() returned.
    class_index = {}
    for position, class_name in enumerate(mdm_list):
        class_index.setdefault(class_name, position)
    output_list = []
    for _, row in df.iterrows():
        name = f"<NAME>{row['tag_name']}<NAME>"
        desc = f"<DESC>{row['tag_description']}<DESC>"
        unit = f"<UNIT>{row['unit']}<UNIT>"
        pattern = f"{row['thing'] + row['property']}"
        output_list.append({
            'text' : f"{name}{desc}{unit}",
            'label': class_index.get(pattern, -1),
        })
    return output_list
 def create_dataset(fold, mdm_list):
    """Load the test split of ``fold`` and wrap its MDM rows in a ``Dataset``.

    Args:
        fold: fold identifier used in the dataset path.
        mdm_list: list of class names forwarded to ``process_df_to_dict``.

    Returns:
        A ``datasets.Dataset`` built from the processed rows.
    """
    csv_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
    all_rows = pd.read_csv(csv_path, skipinitialspace=True)
    # only the rows flagged in the 'MDM' column are evaluated
    mdm_rows = all_rows[all_rows['MDM']].reset_index(drop=True)
    records = process_df_to_dict(mdm_rows, mdm_list)
    return Dataset.from_list(records)
 # %%
 # function to perform training for a given fold
 def test(fold):
    """Evaluate the classifier checkpoint of one fold and log its metrics.

    Loads the fold's MDM test rows, runs batched inference with the
    checkpoint found under ``../checkpoint_fold_<fold>``, and appends
    accuracy plus weighted F1 / precision / recall to ``output.txt``.

    Args:
        fold: fold identifier used in the dataset and checkpoint paths.

    Raises:
        FileNotFoundError: if the fold has no ``checkpoint-*`` directory.
    """
    test_dataset = create_dataset(fold, mdm_list)

    # prepare tokenizer
    checkpoint_directory = f'../checkpoint_fold_{fold}'
    # path is usually checkpoint_fold_1/checkpoint-<step number>;
    # training is expected to leave exactly one checkpoint behind
    checkpoint_paths = glob.glob(os.path.join(checkpoint_directory, 'checkpoint-*'))
    if not checkpoint_paths:
        raise FileNotFoundError(
            f"no 'checkpoint-*' directory found in {checkpoint_directory}")
    model_checkpoint = checkpoint_paths[0]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    # diagnostic only: report the longest untruncated sample in tokens
    longest_sample = max(
        (len(tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"])
         for sample in test_dataset['text']),
        default=0,
    )
    print(longest_sample)

    max_length = 128

    # given a batch of dataset entries, run the text through the tokenizer
    def preprocess_function(example):
        return tokenizer(
            example['text'],
            max_length=max_length,
            # every row must come out exactly max_length long, otherwise
            # the rows cannot be stacked into a batch tensor
            truncation=True,
            padding='max_length'
        )

    # map applies the function to batches of rows in the dataset
    datasets = test_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(mdm_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))
    model = model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    pred_labels = []
    actual_labels = []
    dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
    for batch in tqdm(dataloader):
        # keep the reference labels as plain ints
        actual_labels.extend(batch['label'].tolist())
        # Move to GPU if available
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Perform inference
        with torch.no_grad():
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask).logits
        pred_labels.extend(logits.argmax(dim=1).to("cpu").tolist())

    # %%
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    y_true = actual_labels
    y_pred = pred_labels
    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    average_parameter = 'weighted'
    zero_division_parameter = 0
    f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
    # append, so the results of all folds end up in the same file
    with open("output.txt", "a") as f:
        print('*' * 80, file=f)
        print(f'Fold: {fold}', file=f)
        # Print the results
        print(f'Accuracy: {accuracy:.5f}', file=f)
        print(f'F1 Score: {f1:.5f}', file=f)
        print(f'Precision: {precision:.5f}', file=f)
        print(f'Recall: {recall:.5f}', file=f)
 # %%
 # start from a file holding one empty line; test() appends below it
 with open("output.txt", "w") as f:
    f.write('\n')
 # evaluate folds 1 to 5 in order
 for fold in range(1, 6):
    test(fold)
--- a/train/classification_bert_complete_desc_unit_name/train.py
+++ b/train/classification_bert_complete_desc_unit_name/train.py
@ -0,0 +1,218 @@
 # %%
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import torch
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    EarlyStoppingCallback,
    TrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 torch.set_float32_matmul_precision('high')
 # %%
 # Build the label vocabulary from the full MDM-only export.
 data_path = '../../data_import/exports/data_mapping_mdm.csv'
 full_df = pd.read_csv(data_path, skipinitialspace=True)
 # A class is the real "thing" and "property" strings joined with no
 # separator (the 'pattern' column is deliberately not used here).
 thing_property = (full_df['thing'] + full_df['property']).to_list()
 # de-duplicate, then sort so the class order is reproducible
 mdm_list = sorted(set(thing_property))
 # %%
 # index <-> class-name lookup tables passed to the model below
 id2label = dict(enumerate(mdm_list))
 label2id = {val: idx for idx, val in id2label.items()}
 # %%
 # outputs a list of dictionaries
 # processes dataframe into lists of dictionaries
 # each element maps input to output
 # input: tag_description
 # output: class label
 def process_df_to_dict(df, mdm_list):
    """Turn tag rows into ``{'text', 'label'}`` training records.

    Args:
        df: dataframe with the columns 'tag_name', 'tag_description',
            'unit', 'thing' and 'property'.
        mdm_list: list of class names ("thing" + "property" strings); the
            position of a name in this list is its class label.

    Returns:
        One dict per row: 'text' is the tagged name/description/unit
        string, 'label' is the class index. A row whose class is missing
        from ``mdm_list`` is reported on stdout and labelled -1.
    """
    # Build the name -> index table once instead of scanning mdm_list for
    # every row. setdefault keeps the first position of a repeated name,
    # which is what list.index() returned.
    class_index = {}
    for position, class_name in enumerate(mdm_list):
        class_index.setdefault(class_name, position)
    output_list = []
    for _, row in df.iterrows():
        name = f"<NAME>{row['tag_name']}<NAME>"
        desc = f"<DESC>{row['tag_description']}<DESC>"
        unit = f"<UNIT>{row['unit']}<UNIT>"
        pattern = f"{row['thing'] + row['property']}"
        index = class_index.get(pattern, -1)
        if index == -1:
            print("Error: value not found in MDM list")
        output_list.append({
            'text' : f"{name}{desc}{unit}",
            'label': index,
        })
    return output_list
 def create_split_dataset(fold, mdm_list):
    """Load the train and validation CSVs of ``fold`` as a ``DatasetDict``.

    Args:
        fold: fold identifier used in the dataset path.
        mdm_list: list of class names forwarded to ``process_df_to_dict``.

    Returns:
        ``DatasetDict`` with the keys 'train' and 'validation'.
    """
    dataset_dir = f"../../data_preprocess/exports/dataset/group_{fold}"
    # read both files first, then convert them
    train_frame = pd.read_csv(f"{dataset_dir}/train_all.csv", skipinitialspace=True)
    valid_frame = pd.read_csv(f"{dataset_dir}/valid.csv", skipinitialspace=True)
    train_split = Dataset.from_list(process_df_to_dict(train_frame, mdm_list))
    valid_split = Dataset.from_list(process_df_to_dict(valid_frame, mdm_list))
    return DatasetDict({'train': train_split, 'validation': valid_split})
 # %%
 # function to perform training for a given fold
 def train(fold):
    """Fine-tune ``google-bert/bert-base-cased`` as a classifier for one fold.

    Builds the fold's train/validation datasets, tokenizes them and runs a
    Hugging Face ``Trainer``. Checkpoints are written to
    ``checkpoint_fold_<fold>``.

    Args:
        fold: fold identifier, used to select the dataset (through
            ``create_split_dataset``) and to name the checkpoint directory.
    """
    save_path = f'checkpoint_fold_{fold}'
    split_datasets = create_split_dataset(fold, mdm_list)
    # prepare tokenizer
    # model_checkpoint = "distilbert/distilbert-base-uncased"
    model_checkpoint = 'google-bert/bert-base-cased'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # (only <NAME>, <DESC> and <UNIT> appear in the text built by
    # process_df_to_dict in this file)
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
    # inputs longer than this many tokens are truncated below
    max_length = 120
    # given a batch of dataset entries, run the text through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # only the text is tokenized here; the 'label' column is not
        # touched by this function and stays in the dataset
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs
    # map applies the function to batches of rows (batched=True)
    # of every split in the DatasetDict
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )
    # %% temp
    # tokenized_datasets['train'].rename_columns()
    # %%
    # create data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # %%
    # compute metrics
    metric = evaluate.load("accuracy")
    # accuracy of the arg-max class over the logits
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)
    # %%
    # id2label and label2id are the module-level tables built from mdm_list
    # %%
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(mdm_list),
        id2label=id2label,
        label2id=label2id)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))
    # model = torch.compile(model, backend="inductor", dynamic=True)
    # %%
    # Trainer
    # NOTE(review): eval_strategy is "no", so the validation split and
    # compute_metrics are presumably never exercised during training - confirm
    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-5,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=80,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)
    trainer.train()
 # train folds 1 to 5 one after another
 for fold in range(1, 6):
    print(fold)
    train(fold)
 # %%
--- a/train/classification_bert_pattern_desc/train.py
+++ b/train/classification_bert_pattern_desc/train.py
@ -52,7 +52,7 @@ for idx, val in enumerate(mdm_list):
 def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
-        desc = f"{row['tag_description']}"
+        desc = f"<DESC>{row['tag_description']}<DESC>"
        pattern = row['pattern']
        try:
            index = mdm_list.index(pattern)
--- a/train/mapping_t5_complete_desc/.gitignore
+++ b/train/mapping_t5_complete_desc/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/train/mapping_t5_complete_desc/mapping_prediction/.gitignore
+++ b/train/mapping_t5_complete_desc/mapping_prediction/.gitignore
@ -0,0 +1,2 @@
 __pycache__
 exports/
--- a/train/mapping_t5_complete_desc/mapping_prediction/inference.py
+++ b/train/mapping_t5_complete_desc/mapping_prediction/inference.py
@ -0,0 +1,168 @@
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    T5TokenizerFast,
    AutoModelForSeq2SeqLM,
 )
 import os
 from tqdm import tqdm
 from datasets import Dataset
 import numpy as np
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 class Inference():
    """Run a fine-tuned T5 mapping checkpoint over a dataframe of tags.

    Usage: construct with a checkpoint path, call ``prepare_dataloader``
    once, then call ``generate`` to obtain the predicted "thing" and
    "property" strings.
    """
    # Annotations are quoted so that defining the class does not need the
    # annotated names to be resolvable.
    tokenizer: "T5TokenizerFast"
    model: "torch.nn.Module"
    dataloader: "DataLoader"

    # Must match the list used by train.py, content and order included:
    # the previous "SIG" / "UNIT" / "DATA_TYPE" entries (no angle brackets)
    # made those plain words special tokens at inference time only.
    SPECIAL_TOKENS = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]

    def __init__(self, checkpoint_path):
        """Create the tokenizer and load the model stored at ``checkpoint_path``."""
        self._create_tokenizer()
        self._load_model(checkpoint_path)

    def _create_tokenizer(self):
        """Load the t5-small tokenizer and register the project's special tokens."""
        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
        # Add the additional special tokens to the tokenizer
        self.tokenizer.add_special_tokens({"additional_special_tokens": list(self.SPECIAL_TOKENS)})

    def _load_model(self, checkpoint_path: str):
        """Load the seq2seq checkpoint, compile it and switch it to eval mode."""
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
        model = torch.compile(model)
        # set model to eval
        self.model = model.eval()

    def prepare_dataloader(self, input_df, batch_size, max_length):
        """
        Tokenize ``input_df`` and store a ``DataLoader`` in ``self.dataloader``.

        *arguments*
        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
        - batch_size: the batch size of dataloader output
        - max_length: length of tokenizer output
        """
        print("preparing dataloader")

        # convert each dataframe row into a dictionary
        # outputs a list of dictionaries
        def _process_df(df):
            output_list = []
            for _, row in df.iterrows():
                # description only: this is the input format train.py of
                # this model variant uses, so no <UNIT> segment is added
                desc = f"<DESC>{row['tag_description']}<DESC>"
                element = {
                    'input' : f"{desc}",
                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
                }
                output_list.append(element)
            return output_list

        def _preprocess_function(example):
            # text_target tokenizes the targets into 'labels',
            # there is no need to create them separately
            return self.tokenizer(
                example['input'],
                text_target=example['output'],
                max_length=max_length,
                return_tensors="pt",
                padding='max_length',
                truncation=True,
            )

        test_dataset = Dataset.from_list(_process_df(input_df))
        # map applies the function to batches of rows in the dataset
        datasets = test_dataset.map(
            _preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=test_dataset.column_names,
        )
        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        # create dataloader
        self.dataloader = DataLoader(datasets, batch_size=batch_size)

    @staticmethod
    def _extract_seq(tokens, start_value, end_value):
        """Return the tokens strictly between the first start and end marker.

        Returns None when either marker is absent from ``tokens``.
        """
        token_array = np.asarray(tokens)
        start_hits = np.where(token_array == start_value)[0]
        end_hits = np.where(token_array == end_value)[0]
        if len(start_hits) == 0 or len(end_hits) == 0:
            return None
        return tokens[int(start_hits[0]) + 1:int(end_hits[0])]

    def _decode_between(self, tokens, start_value, end_value):
        """Decode the span between two marker ids, or return None if it is missing."""
        span = self._extract_seq(tokens, start_value, end_value)
        if span is None:
            return None
        return self.tokenizer.decode(span, skip_special_tokens=False)

    def generate(self):
        """Generate predictions for every row given to ``prepare_dataloader``.

        Returns:
            ``(thing_prediction_list, property_prediction_list)``: two lists
            with one entry per input row. An entry is None when the
            generated sequence lacks the corresponding start/end markers.
        """
        if torch.cuda.is_available():
            # the second GPU is the historical choice; fall back to the
            # first one on hosts that expose a single device
            device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
        else:
            device = torch.device('cpu')
        MAX_GENERATE_LENGTH = 128
        # move the model once rather than on every batch
        self.model.to(device)
        pred_generations = []
        print("start generation")
        for batch in tqdm(self.dataloader):
            # Move the batch to the selected device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Perform inference
            with torch.no_grad():
                outputs = self.model.generate(input_ids,
                                        attention_mask=attention_mask,
                                        max_length=MAX_GENERATE_LENGTH)
            pred_generations.extend(outputs.to("cpu"))

        # look the marker ids up instead of hard-coding them
        # (previously 32100..32103 for the first four added tokens)
        thing_start, thing_end, property_start, property_end = self.tokenizer.convert_tokens_to_ids(
            ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>"])

        thing_prediction_list = []
        property_prediction_list = []
        for tokens in pred_generations:
            thing_prediction_list.append(self._decode_between(tokens, thing_start, thing_end))
            property_prediction_list.append(self._decode_between(tokens, property_start, property_end))
        return thing_prediction_list, property_prediction_list
--- a/train/mapping_t5_complete_desc/mapping_prediction/output.txt
+++ b/train/mapping_t5_complete_desc/mapping_prediction/output.txt
@ -0,0 +1,6 @@
 Accuracy for fold 1: 0.9455750118315192
 Accuracy for fold 2: 0.8864485981308411
 Accuracy for fold 3: 0.9558232931726908
 Accuracy for fold 4: 0.9686013320647003
 Accuracy for fold 5: 0.896930829134219
--- a/train/mapping_t5_complete_desc/mapping_prediction/output_with_abbreviation.txt
+++ b/train/mapping_t5_complete_desc/mapping_prediction/output_with_abbreviation.txt
@ -0,0 +1,6 @@
 Accuracy for fold 1: 0.9588263132986276
 Accuracy for fold 2: 0.9182242990654206
 Accuracy for fold 3: 0.9633534136546185
 Accuracy for fold 4: 0.9809705042816366
 Accuracy for fold 5: 0.8891433806688044
--- a/train/mapping_t5_complete_desc/mapping_prediction/predict.py
+++ b/train/mapping_t5_complete_desc/mapping_prediction/predict.py
@ -0,0 +1,73 @@
 import pandas as pd
 import os
 import glob
 from inference import Inference
 # directory in which the per-fold 'checkpoint_fold_<fold>' folders are looked up
 checkpoint_directory =  '../'
 # batch size handed to Inference.prepare_dataloader
 BATCH_SIZE = 512
 def infer_and_select(fold):
    """Run T5 inference on one fold's test set and record its MDM accuracy.

    The predictions are appended as the columns 'p_thing' and 'p_property'
    and saved to ``exports/result_group_<fold>.csv``. The share of MDM rows
    with both fields predicted correctly is appended to ``output.txt``.

    Args:
        fold: fold identifier used in the dataset and checkpoint paths.

    Raises:
        FileNotFoundError: if the fold has no ``checkpoint-*`` directory.
    """
    print(f"Inference for fold {fold}")
    # import test data
    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
    df = pd.read_csv(data_path, skipinitialspace=True)

    ##########################################
    # run inference
    # path is usually checkpoint_fold_1/checkpoint-<step number>;
    # training is expected to leave exactly one checkpoint behind
    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
    checkpoint_paths = glob.glob(os.path.join(directory, 'checkpoint-*'))
    if not checkpoint_paths:
        raise FileNotFoundError(
            f"no 'checkpoint-*' directory found in {directory}")
    infer = Inference(checkpoint_paths[0])
    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
    thing_prediction_list, property_prediction_list = infer.generate()

    # Convert the predictions to a dataframe and attach them column-wise
    df_out = pd.DataFrame({
        'p_thing': thing_prediction_list,
        'p_property': property_prediction_list
    })
    df = pd.concat([df, df_out], axis=1)

    # we can save the t5 generation output here; 'exports/' is git-ignored,
    # so it may not exist yet on a fresh checkout
    os.makedirs("exports", exist_ok=True)
    df.to_csv(f"exports/result_group_{fold}.csv", index=False)

    # here we want to evaluate mapping accuracy within the valid in mdm data only
    in_mdm = df['MDM']
    condition_correct_thing = df['p_thing'] == df['thing']
    condition_correct_property = df['p_property'] == df['property']
    prediction_mdm_correct = int((condition_correct_thing & condition_correct_property & in_mdm).sum())
    mdm_total = int(in_mdm.sum())
    # without any MDM row the proportion is undefined; report nan
    pred_correct_proportion = prediction_mdm_correct / mdm_total if mdm_total else float('nan')

    # write output to file output.txt
    with open("output.txt", "a") as f:
        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
 ###########################################  
 # Run every fold in order.
 # start from a file holding one empty line; infer_and_select() appends below it
 with open("output.txt", "w") as f:
    f.write('\n')
 for fold in range(1, 6):
    infer_and_select(fold)
--- a/train/mapping_t5_complete_desc/train.py
+++ b/train/mapping_t5_complete_desc/train.py
@ -0,0 +1,196 @@
 # %%
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import torch
 from transformers import (
    T5TokenizerFast,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    Seq2SeqTrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 torch.set_float32_matmul_precision('high')
 # outputs a list of dictionaries
 def process_df_to_dict(df):
    """Turn tag rows into ``{'input', 'output'}`` seq2seq records.

    Args:
        df: dataframe with the columns 'tag_description', 'thing' and
            'property'.

    Returns:
        One dict per row: 'input' is the description wrapped in <DESC>
        markers, 'output' is the thing and property wrapped in their
        start/end markers.
    """
    def to_record(row):
        return {
            'input' : f"<DESC>{row['tag_description']}<DESC>",
            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
        }
    return [to_record(row) for _, row in df.iterrows()]
 def create_split_dataset(fold):
    """Load the train and validation CSVs of ``fold`` as a ``DatasetDict``.

    Args:
        fold: fold identifier used in the dataset path.

    Returns:
        ``DatasetDict`` with the keys 'train' and 'validation'.
    """
    dataset_dir = f"../../data_preprocess/exports/dataset/group_{fold}"
    # read both files first, then convert them
    train_frame = pd.read_csv(f"{dataset_dir}/train_all.csv", skipinitialspace=True)
    valid_frame = pd.read_csv(f"{dataset_dir}/valid.csv", skipinitialspace=True)
    train_split = Dataset.from_list(process_df_to_dict(train_frame))
    valid_split = Dataset.from_list(process_df_to_dict(valid_frame))
    return DatasetDict({'train': train_split, 'validation': valid_split})
 # function to perform training for a given fold
 def train(fold):
    """Fine-tune ``t5-small`` on one fold to generate thing/property mappings.

    Builds the fold's train/validation datasets, tokenizes inputs and
    targets, and runs a Hugging Face ``Seq2SeqTrainer``. Checkpoints are
    written to ``checkpoint_fold_<fold>``.

    Args:
        fold: fold identifier, used to select the dataset (through
            ``create_split_dataset``) and to name the checkpoint directory.
    """
    save_path = f'checkpoint_fold_{fold}'
    split_datasets = create_split_dataset(fold)
    # prepare tokenizer
    model_checkpoint = "t5-small"
    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # (the records built by process_df_to_dict in this file use <DESC> and
    # the four THING/PROPERTY markers)
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
    # inputs and targets longer than this many tokens are truncated below
    max_length = 120
    # given a batch of dataset entries, run it through the tokenizer
    def preprocess_function(example):
        input = example['input']
        target = example['output']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        model_inputs = tokenizer(
            input,
            text_target=target, 
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs
    # map applies the function to batches of rows (batched=True)
    # of every split in the DatasetDict
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns=split_datasets["train"].column_names,
    )
    # https://github.com/huggingface/transformers/pull/28414
    # model_checkpoint = "google/t5-efficient-tiny"
    # device_map set to auto to force it to load contiguous weights 
    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    metric = evaluate.load("sacrebleu")
    # BLEU between decoded predictions and decoded reference labels
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # In case the model returns more than the prediction logits
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, 
                                            skip_special_tokens=False)
        # Replace -100s in the labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels,
                                                skip_special_tokens=False)
        # Remove <PAD> tokens from decoded predictions and labels
        # (each reference is wrapped in its own list)
        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
        # Some simple post-processing
        # decoded_preds = [pred.strip() for pred in decoded_preds]
        # decoded_labels = [[label.strip()] for label in decoded_labels]
        # print(decoded_preds, decoded_labels)
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}
    # Generation Config
    # from transformers import GenerationConfig
    # reuse the model's own generation config with max_length set to 64
    gen_config = model.generation_config
    gen_config.max_length = 64
    # compile
    # model = torch.compile(model, backend="inductor", dynamic=True)
    # Trainer
    # NOTE(review): eval_strategy is "no", so the validation split and
    # compute_metrics are presumably never exercised during training - confirm
    args = Seq2SeqTrainingArguments(
        f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-3,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        predict_with_generate=True,
        bf16=True,
        push_to_hub=False,
        generation_config=gen_config,
        remove_unused_columns=False,
    )
    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)
    trainer.train()
 # train folds 1 to 5 one after another
 for fold in range(1, 6):
    print(fold)
    train(fold)
--- a/train/mapping_t5_complete_desc_unit/mapping_prediction/output.txt
+++ b/train/mapping_t5_complete_desc_unit/mapping_prediction/output.txt
@ -0,0 +1,6 @@
 Accuracy for fold 1: 0.9522006625650734
 Accuracy for fold 2: 0.9093457943925234
 Accuracy for fold 3: 0.9678714859437751
 Accuracy for fold 4: 0.9814462416745956
 Accuracy for fold 5: 0.890975721484196
--- a/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py
+++ b/train/mapping_t5_complete_desc_unit/mapping_prediction/predict.py
@ -6,6 +6,8 @@ from inference import Inference
 checkpoint_directory =  '../'
 BATCH_SIZE = 512
 def infer_and_select(fold):
    print(f"Inference for fold {fold}")
    # import test data
@ -32,7 +34,7 @@ def infer_and_select(fold):
    infer = Inference(checkpoint_path)
-    infer.prepare_dataloader(df, batch_size=256, max_length=128)
+    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
    thing_prediction_list, property_prediction_list = infer.generate()
    # add labels too
--- a/train/mapping_t5_complete_desc_unit_name/.gitignore
+++ b/train/mapping_t5_complete_desc_unit_name/.gitignore
@ -0,0 +1,2 @@
 checkpoint*
 tensorboard-log
--- a/train/mapping_t5_complete_desc_unit_name/mapping_prediction/.gitignore
+++ b/train/mapping_t5_complete_desc_unit_name/mapping_prediction/.gitignore
@ -0,0 +1,2 @@
 __pycache__
 exports/
--- a/train/mapping_t5_complete_desc_unit_name/mapping_prediction/inference.py
+++ b/train/mapping_t5_complete_desc_unit_name/mapping_prediction/inference.py
@ -0,0 +1,169 @@
 import torch
 from torch.utils.data import DataLoader
 from transformers import (
    T5TokenizerFast,
    AutoModelForSeq2SeqLM,
 )
 import os
 from tqdm import tqdm
 from datasets import Dataset
 import numpy as np
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 class Inference():
    """Batched generation with a fine-tuned seq2seq mapping checkpoint.

    Usage: construct with a checkpoint path, call `prepare_dataloader` with
    the dataframe to predict on, then call `generate` to obtain the predicted
    `thing` and `property` strings for every row.
    """
    tokenizer: T5TokenizerFast
    model: torch.nn.Module
    dataloader: DataLoader

    def __init__(self, checkpoint_path):
        """Create the tokenizer and load the model stored at `checkpoint_path`."""
        self._create_tokenizer()
        self._load_model(checkpoint_path)

    def _create_tokenizer(self):
        """Build the "t5-small" tokenizer extended with the marker tokens."""
        # load tokenizer
        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
        # Marker tokens appended to the base vocabulary.
        # This list (spelling AND order) has to be identical to the one
        # train.py registers before fine-tuning: the prompt built in
        # prepare_dataloader wraps the unit in "<UNIT>", so registering the
        # bare words "SIG"/"UNIT"/"DATA_TYPE" instead would tokenize the
        # inputs differently at inference time than during training.
        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
        # Add the additional special tokens to the tokenizer
        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    def _load_model(self, checkpoint_path: str):
        """Load the seq2seq model from `checkpoint_path`, compile it and switch it to eval mode."""
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
        model = torch.compile(model)
        # set model to eval
        self.model = model.eval()

    def prepare_dataloader(self, input_df, batch_size, max_length):
        """
        Tokenize `input_df` and store the resulting DataLoader in `self.dataloader`.

        *arguments*
        - input_df: input dataframe containing fields 'tag_name',
          'tag_description', 'unit', 'thing', 'property'
        - batch_size: the batch size of dataloader output
        - max_length: length of tokenizer output (inputs and labels are padded
          or truncated to exactly this many tokens)
        """
        print("preparing dataloader")

        # convert each dataframe row into a dictionary
        # outputs a list of dictionaries
        def _process_df(df):
            output_list = []
            for _, row in df.iterrows():
                name = f"<NAME>{row['tag_name']}<NAME>"
                desc = f"<DESC>{row['tag_description']}<DESC>"
                unit = f"<UNIT>{row['unit']}<UNIT>"
                element = {
                    'input' : f"{name}{desc}{unit}",
                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
                }
                output_list.append(element)
            return output_list

        def _preprocess_function(example):
            # named `inputs` so the builtin `input` is not shadowed
            inputs = example['input']
            target = example['output']
            # text_target sets the corresponding label to inputs
            # there is no need to create a separate 'labels'
            model_inputs = self.tokenizer(
                inputs,
                text_target=target,
                max_length=max_length,
                return_tensors="pt",
                padding='max_length',
                truncation=True,
            )
            return model_inputs

        test_dataset = Dataset.from_list(_process_df(input_df))
        # map is run in batched mode, so _preprocess_function receives
        # lists of 'input'/'output' strings rather than single rows
        datasets = test_dataset.map(
            _preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=test_dataset.column_names,
        )
        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        # create dataloader
        self.dataloader = DataLoader(datasets, batch_size=batch_size)

    def generate(self, device=None):
        """
        Generate predictions for every row held by `self.dataloader`.

        *arguments*
        - device: optional torch device (or device string) to run on. When
          omitted, 'cuda:1' is used if at least two GPUs are visible,
          'cuda:0' if exactly one is visible, and 'cpu' otherwise.

        *returns*
        - (thing_prediction_list, property_prediction_list): two lists with
          one entry per input row, in dataloader order. An entry is None when
          the generated sequence lacks the start or end marker of that field.
        """
        if device is None:
            if not torch.cuda.is_available():
                device = 'cpu'
            elif torch.cuda.device_count() > 1:
                device = 'cuda:1'
            else:
                # a single visible GPU is always index 0; 'cuda:1' would be invalid
                device = 'cuda:0'
        device = torch.device(device)
        MAX_GENERATE_LENGTH = 128

        pred_generations = []
        # moving the model is loop-invariant, so do it once up front
        self.model.to(device)
        print("start generation")
        for batch in tqdm(self.dataloader):
            # Inference in batches
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Perform inference
            with torch.no_grad():
                outputs = self.model.generate(input_ids,
                                        attention_mask=attention_mask,
                                        max_length=MAX_GENERATE_LENGTH)
                # keep the generated token ids on the CPU, one tensor per row
                pred_generations.extend(outputs.to("cpu"))

        # ids of the marker tokens, looked up from the tokenizer instead of
        # being hard-coded (32100..32103 with the stock t5-small vocabulary)
        thing_start, thing_end, property_start, property_end = self.tokenizer.convert_tokens_to_ids(
            ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>"]
        )

        # return the tokens strictly between the first start marker and the
        # first end marker, or None when either marker is absent
        def extract_seq(tokens, start_value, end_value):
            if start_value not in tokens or end_value not in tokens:
                return None
            start_id = np.where(tokens == start_value)[0][0]
            end_id = np.where(tokens == end_value)[0][0]
            return tokens[start_id+1:end_id]

        # decode the thing/property spans of one generated sequence
        def process_tensor_output(tokens):
            thing_seq = extract_seq(tokens, thing_start, thing_end)
            property_seq = extract_seq(tokens, property_start, property_end)
            p_thing = None
            p_property = None
            if thing_seq is not None:
                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
            if property_seq is not None:
                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
            return p_thing, p_property

        thing_prediction_list = []
        property_prediction_list = []
        for tokens in pred_generations:
            p_thing, p_property = process_tensor_output(tokens)
            thing_prediction_list.append(p_thing)
            property_prediction_list.append(p_property)
        return thing_prediction_list, property_prediction_list
--- a/train/mapping_t5_complete_desc_unit_name/mapping_prediction/output.txt
+++ b/train/mapping_t5_complete_desc_unit_name/mapping_prediction/output.txt
@ -0,0 +1,6 @@
 Accuracy for fold 1: 0.9465215333648841
 Accuracy for fold 2: 0.9102803738317757
 Accuracy for fold 3: 0.9728915662650602
 Accuracy for fold 4: 0.9843006660323501
 Accuracy for fold 5: 0.8996793403573065
--- a/train/mapping_t5_complete_desc_unit_name/mapping_prediction/predict.py
+++ b/train/mapping_t5_complete_desc_unit_name/mapping_prediction/predict.py
@ -0,0 +1,73 @@
 import pandas as pd
 import os
 import glob
 from inference import Inference
 checkpoint_directory =  '../'
 BATCH_SIZE = 512
 def infer_and_select(fold):
    """Run inference on the test split of `fold` and record its MDM accuracy.

    Steps:
    1. read `group_{fold}/test_all.csv`;
    2. load the checkpoint found under `checkpoint_fold_{fold}` and generate
       a `thing` / `property` prediction for every row;
    3. write the test rows plus the 'p_thing' / 'p_property' columns to
       `exports/result_group_{fold}.csv`;
    4. append the accuracy over rows flagged 'MDM' to `output.txt`. A row
       counts as correct only when both predictions equal the labels.

    Raises FileNotFoundError when no `checkpoint-*` directory exists for the
    fold.
    """
    print(f"Inference for fold {fold}")
    # import test data
    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
    df = pd.read_csv(data_path, skipinitialspace=True)

    ##########################################
    # run inference
    # path is usually checkpoint_fold_1/checkpoint-<step number>
    # we are guaranteed to save only 1 checkpoint from training
    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
    pattern = 'checkpoint-*'
    checkpoint_candidates = glob.glob(os.path.join(directory, pattern))
    if not checkpoint_candidates:
        # fail with the searched location instead of a bare IndexError
        raise FileNotFoundError(
            f"no '{pattern}' entry found in '{directory}'; train fold {fold} first"
        )
    checkpoint_path = checkpoint_candidates[0]

    infer = Inference(checkpoint_path)
    infer.prepare_dataloader(df, batch_size=BATCH_SIZE, max_length=128)
    thing_prediction_list, property_prediction_list = infer.generate()

    # Convert the predictions to a DataFrame and attach them column-wise
    df_out = pd.DataFrame({
        'p_thing': thing_prediction_list,
        'p_property': property_prediction_list
    })
    df = pd.concat([df, df_out], axis=1)

    # we can save the t5 generation output here
    # exports/ is git-ignored, so it does not exist on a fresh checkout
    os.makedirs("exports", exist_ok=True)
    df.to_csv(f"exports/result_group_{fold}.csv", index=False)

    # here we want to evaluate mapping accuracy within the valid in mdm data only
    in_mdm = df['MDM']
    condition_correct_thing = df['p_thing'] == df['thing']
    condition_correct_property = df['p_property'] == df['property']
    prediction_mdm_correct = int((condition_correct_thing & condition_correct_property & in_mdm).sum())
    mdm_total = int(in_mdm.sum())
    # without any MDM row the accuracy is undefined; report nan rather than divide by zero
    pred_correct_proportion = prediction_mdm_correct / mdm_total if mdm_total else float('nan')

    # write output to file output.txt
    with open("output.txt", "a") as f:
        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
 ###########################################
 # Script entry: evaluate every fold
 # Truncate output.txt so it starts out holding one empty line;
 # each fold later appends its own accuracy line to it.
 with open("output.txt", "w") as f:
    print(file=f)
 for fold in range(1, 6):
    infer_and_select(fold)
--- a/train/mapping_t5_complete_desc_unit_name/train.py
+++ b/train/mapping_t5_complete_desc_unit_name/train.py
@ -0,0 +1,198 @@
 # %%
 # from datasets import load_from_disk
 import os
 os.environ['NCCL_P2P_DISABLE'] = '1'
 os.environ['NCCL_IB_DISABLE'] = '1'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 import torch
 from transformers import (
    T5TokenizerFast,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    Seq2SeqTrainingArguments
 )
 import evaluate
 import numpy as np
 import pandas as pd
 # import matplotlib.pyplot as plt
 from datasets import Dataset, DatasetDict
 torch.set_float32_matmul_precision('high')
 # outputs a list of dictionaries
 def process_df_to_dict(df):
    """Convert each dataframe row into an {'input', 'output'} pair of strings.

    'input' concatenates the row's tag_name, tag_description and unit, each
    enclosed by its <NAME>, <DESC> or <UNIT> marker. 'output' holds the row's
    thing and property between their START/END markers. The result is a list
    with one dictionary per row, in row order.
    """
    def _to_example(row):
        # build the three marked-up input segments, then the text pair
        name = f"<NAME>{row['tag_name']}<NAME>"
        desc = f"<DESC>{row['tag_description']}<DESC>"
        unit = f"<UNIT>{row['unit']}<UNIT>"
        return {
            'input' : f"{name}{desc}{unit}",
            'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
        }

    return [_to_example(row) for _, row in df.iterrows()]
 def create_split_dataset(fold):
    """Load the train and validation CSVs of `fold` into a DatasetDict.

    Each CSV is read with pandas, converted to input/output text pairs by
    process_df_to_dict and wrapped in a Dataset. The returned DatasetDict
    exposes them under the keys 'train' and 'validation'.
    """
    # split name -> csv file backing it (insertion order: train, then validation)
    split_files = {
        'train': f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv",
        'validation' : f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv",
    }
    return DatasetDict({
        split_name: Dataset.from_list(
            process_df_to_dict(pd.read_csv(csv_path, skipinitialspace=True))
        )
        for split_name, csv_path in split_files.items()
    })
 # function to perform training for a given fold
 def train(fold):
    """Fine-tune "t5-small" on the train split of `fold`.

    Builds the tokenized datasets for the fold, extends the tokenizer with the
    mapping marker tokens, resizes the model embeddings accordingly and runs
    a Seq2SeqTrainer for 40 epochs. Checkpoints are written below
    `checkpoint_fold_{fold}`. Nothing is returned.
    """
    # output directory handed to Seq2SeqTrainingArguments below
    save_path = f'checkpoint_fold_{fold}'
    split_datasets = create_split_dataset(fold)
    # prepare tokenizer
    model_checkpoint = "t5-small"
    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    # NOTE(review): any tokenizer rebuilt at inference time presumably has to
    # register exactly this list, in this order, to get the same token ids - confirm
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
    # truncation limit (in tokens) applied to both inputs and targets
    max_length = 120
    # given a dataset entry, run it through the tokenizer
    # (called by map with batched=True, so `example` holds lists of strings)
    def preprocess_function(example):
        input = example['input']
        target = example['output']
        # text_target sets the corresponding label to inputs
        # there is no need to create a separate 'labels'
        # NOTE(review): padding=True pads here, before the data collator runs;
        # the label padding is then presumably the tokenizer pad id rather than
        # -100, so pad positions would count towards the loss - confirm intent
        model_inputs = tokenizer(
            input,
            text_target=target, 
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs
    # map maps function to each "row" in the dataset
    # aka the data in the immediate nesting
    # the raw 'input'/'output' text columns are dropped after tokenization
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns=split_datasets["train"].column_names,
    )
    # https://github.com/huggingface/transformers/pull/28414
    # model_checkpoint = "google/t5-efficient-tiny"
    # device_map set to auto to force it to load contiguous weights 
    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    # important! after extending tokens vocab
    model.resize_token_embeddings(len(tokenizer))
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    metric = evaluate.load("sacrebleu")
    # BLEU between decoded predictions and decoded labels
    # NOTE(review): with eval_strategy="no" below, this is presumably never
    # called during training - confirm
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # In case the model returns more than the prediction logits
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, 
                                            skip_special_tokens=False)
        # Replace -100s in the labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels,
                                                skip_special_tokens=False)
        # Remove <PAD> tokens from decoded predictions and labels
        # (each label is wrapped in its own list: one reference per prediction)
        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
        # Some simple post-processing
        # decoded_preds = [pred.strip() for pred in decoded_preds]
        # decoded_labels = [[label.strip()] for label in decoded_labels]
        # print(decoded_preds, decoded_labels)
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}
    # Generation Config
    # from transformers import GenerationConfig
    # reuse the model's own generation config, only capping the generated length
    gen_config = model.generation_config
    gen_config.max_length = 64
    # compile
    # model = torch.compile(model, backend="inductor", dynamic=True)
    # Trainer
    # evaluation is switched off; logging happens once per epoch
    # NOTE(review): save_strategy is left at the library default and
    # save_total_limit=1 caps the number of kept checkpoints - verify that
    # exactly one checkpoint-* directory remains after training
    args = Seq2SeqTrainingArguments(
        f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-3,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=40,
        predict_with_generate=True,
        bf16=True,
        push_to_hub=False,
        generation_config=gen_config,
        remove_unused_columns=False,
    )
    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # uncomment to load training from checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)
    trainer.train()
 # script entry: train one model per fold, folds 1 through 5 in ascending order
 for fold in range(1, 6):
    # echo the fold number so the console output shows which run is active
    print(fold)
    train(fold)
--- a/train/predict.bash
+++ b/train/predict.bash
@ -0,0 +1,27 @@
 #!/bin/bash
 # Runs predict.py for every enabled model variant, one after another.
 # Uncomment a run_predict line below to include that variant.

 # Run predict.py from inside directory $1.
 # The subshell leaves the caller's working directory untouched, so a missing
 # directory is reported by cd and skipped, instead of shifting the working
 # directory and breaking the relative paths of every variant after it.
 run_predict() {
     ( cd "$1" && micromamba run -n hug python predict.py )
 }

 run_predict classification_bert_complete_desc/classification_prediction/
 run_predict classification_bert_complete_desc_unit/classification_prediction/
 run_predict classification_bert_complete_desc_unit_name/classification_prediction/

 # run_predict mapping_t5_complete_desc/mapping_prediction/
 # run_predict mapping_t5_complete_desc_unit/mapping_prediction/
 # run_predict mapping_t5_complete_desc_unit_name/mapping_prediction/
--- a/train/train.bash
+++ b/train/train.bash
@ -0,0 +1,25 @@
 #!/bin/bash
 # Launches train.py (through accelerate) for every enabled model variant,
 # one after another. Uncomment a run_train line below to include that variant.

 # Launch training from inside directory $1.
 # The subshell leaves the caller's working directory untouched, so a missing
 # directory is reported by cd and skipped, instead of shifting the working
 # directory and breaking the relative paths of every variant after it.
 run_train() {
     ( cd "$1" && micromamba run -n hug accelerate launch train.py )
 }

 # run_train classification_bert_complete_desc
 # run_train classification_bert_complete_desc_unit
 run_train classification_bert_complete_desc_unit_name

 # run_train mapping_t5_complete_desc
 # run_train mapping_t5_complete_desc_unit
 # directory name as it exists in the repository (..._desc_unit_name)
 # run_train mapping_t5_complete_desc_unit_name