diff --git a/analysis/.gitignore b/analysis/.gitignore new file mode 100644 index 0000000..a044ac0 --- /dev/null +++ b/analysis/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +*.html diff --git a/analysis/find_closest.py b/analysis/find_closest.py new file mode 100644 index 0000000..e3552dd --- /dev/null +++ b/analysis/find_closest.py @@ -0,0 +1,297 @@ + +# %% +import pandas as pd +from utils import Retriever, cosine_similarity_chunked +import os +import glob +import numpy as np + +# %% +data_path = f'../data_preprocess/exports/preprocessed_data.csv' +df_pre = pd.read_csv(data_path, skipinitialspace=True) + +# %% +# this should be >0 if we are using abbreviations processed data +desc_list = df_pre['tag_description'].to_list() + +# %% +[ elem for elem in desc_list if isinstance(elem, float)] +########################################## +# %% +fold = 1 +data_path = f'../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv' +df = pd.read_csv(data_path, skipinitialspace=True) + +# %% +# subset to mdm +df = df[df['MDM']] + +thing_condition = df['p_thing'] == df['thing_pattern'] +error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']] + +property_condition = df['p_property'] == df['property_pattern'] +error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']] + +correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']] + +test_df = df + +# %% +# thing_df.to_html('thing_errors.html') +# property_df.to_html('property_errors.html') + +########################################## +# what we need now is understand why the model is making these mispredictions +# import train data and test data +# %% +class Embedder(): + input_df: pd.DataFrame + fold: int + + def __init__(self, input_df): + self.input_df = input_df + + + def make_embedding(self, checkpoint_path): + + def generate_input_list(df): + input_list = [] + for _, row in df.iterrows(): + # name = f"{row['tag_name']}" + desc = f"{row['tag_description']}" + # element = f"{name}{desc}" + element = f"{desc}" + input_list.append(element) + return input_list + + # prepare reference embed + train_data = list(generate_input_list(self.input_df)) + # Define the directory and the pattern + retriever_train = Retriever(train_data, checkpoint_path) + retriever_train.make_mean_embedding(batch_size=64) + return retriever_train.embeddings.to('cpu') + + + +# %% +data_path = f"../data_preprocess/exports/dataset/group_{fold}/train.csv" +train_df = pd.read_csv(data_path, skipinitialspace=True) + +checkpoint_directory = "../train/mapping_pattern" +directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') +# Use glob to find matching paths +# path is usually checkpoint_fold_1/checkpoint- +# we are guaranteed to save only 1 checkpoint from training +pattern = 'checkpoint-*' +checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + +train_embedder = Embedder(input_df=train_df) +train_embeds = train_embedder.make_embedding(checkpoint_path) + +test_embedder = Embedder(input_df=test_df) +test_embeds = test_embedder.make_embedding(checkpoint_path) + + + +# %% +# test embeds are inputs since we are looking back at train data +cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy() + +# %% +# the following function takes in a full cos_sim_matrix +# condition_source: boolean selectors of the source embedding +# condition_target: boolean selectors of the target embedding +def 
find_closest(cos_sim_matrix, condition_source, condition_target): + # subset_matrix = cos_sim_matrix[condition_source] + # except we are subsetting 2D matrix (row, column) + subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)] + # we select top k here + # Get the indices of the top 5 maximum values along axis 1 + top_k = 5 + top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values + # note that top_k_indices is a nested list because of the 2d nature of the matrix + # the result is flipped + top_k_indices[0] = top_k_indices[0][::-1] + + # Get the values of the top 5 maximum scores + top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1) + + + return top_k_indices, top_k_values + +# %% +error_thing_df.index + +#################################################### +# special find-back code +# %% +def find_back_element_with_print(select_idx): + condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0] + condition_target = np.ones(train_embeds.shape[0], dtype=bool) + + top_k_indices, top_k_values = find_closest( + cos_sim_matrix=cos_sim_matrix, + condition_source=condition_source, + condition_target=condition_target) + + training_data_pattern_list = train_df.iloc[top_k_indices[0]]['pattern'].to_list() + + test_data_pattern_list = test_df[test_df.index == select_idx]['pattern'].to_list() + predicted_test_data = test_df[test_df.index == select_idx]['p_thing'] + test_df[test_df.index == select_idx]['p_property'] + + print("*" * 80) + print("idx:", select_idx) + print(training_data_pattern_list) + print(test_data_pattern_list) + print(predicted_test_data) + + test_pattern = test_data_pattern_list[0] + + find_back_list = [ test_pattern in pattern for pattern in training_data_pattern_list ] + + if sum(find_back_list) > 0: + return True + else: + return False + +find_back_element_with_print(2884) + +# %% +def find_back_element(select_idx): + condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0] + condition_target = np.ones(train_embeds.shape[0], dtype=bool) + + top_k_indices, top_k_values = find_closest( + cos_sim_matrix=cos_sim_matrix, + condition_source=condition_source, + condition_target=condition_target) + + training_data_pattern_list = train_df.iloc[top_k_indices[0]]['pattern'].to_list() + + test_data_pattern_list = test_df[test_df.index == select_idx]['pattern'].to_list() + + # print(training_data_pattern_list) + # print(test_data_pattern_list) + + test_pattern = test_data_pattern_list[0] + + find_back_list = [ test_pattern in pattern for pattern in training_data_pattern_list ] + + if sum(find_back_list) > 0: + return True + else: + return False + +find_back_element(2884) + + + +# %% +# for error thing +pattern_in_train = [] +for select_idx in error_thing_df.index: + result = find_back_element_with_print(select_idx) + print("status:", result) + pattern_in_train.append(result) + +# %% +sum(pattern_in_train)/len(pattern_in_train) + +### +# for error property +# %% +pattern_in_train = [] +for select_idx in error_property_df.index: + result = find_back_element(select_idx) + pattern_in_train.append(result) + +# %% +sum(pattern_in_train)/len(pattern_in_train) + + +#################################################### + +# %% +# make function to compute similarity of closest retrieved result +def compute_similarity(select_idx): + condition_source = test_df['tag_description'] == test_df[test_df.index == 
select_idx]['tag_description'].tolist()[0] + condition_target = np.ones(train_embeds.shape[0], dtype=bool) + top_k_indices, top_k_values = find_closest( + cos_sim_matrix=cos_sim_matrix, + condition_source=condition_source, + condition_target=condition_target) + + return np.mean(top_k_values[0]) + +# %% +def print_summary(similarity_scores): + # Convert list to numpy array for additional stats + np_array = np.array(similarity_scores) + + # Get stats + mean_value = np.mean(np_array) + percentiles = np.percentile(np_array, [25, 50, 75]) # 25th, 50th, and 75th percentiles + + # Display numpy results + print("Mean:", mean_value) + print("25th, 50th, 75th Percentiles:", percentiles) + + +# %% +########################################## +# Analyze the degree of similarity differences between correct and incorrect results + +# %% +# compute similarity scores for all values in error_thing_df +similarity_thing_scores = [] +for idx in error_thing_df.index: + similarity_thing_scores.append(compute_similarity(idx)) +print_summary(similarity_thing_scores) + + +# %% +similarity_property_scores = [] +for idx in error_property_df.index: + similarity_property_scores.append(compute_similarity(idx)) +print_summary(similarity_property_scores) + +# %% +similarity_correct_scores = [] +for idx in correct_df.index: + similarity_correct_scores.append(compute_similarity(idx)) +print_summary(similarity_correct_scores) + + + +# %% +import matplotlib.pyplot as plt + +# Sample data +list1 = similarity_thing_scores +list2 = similarity_property_scores +list3 = similarity_correct_scores + +# Plot histograms +bins = 50 +plt.hist(list1, bins=bins, alpha=0.5, label='List 1', density=True) +plt.hist(list2, bins=bins, alpha=0.5, label='List 2', density=True) +plt.hist(list3, bins=bins, alpha=0.5, label='List 3', density=True) + +# Labels and legend +plt.xlabel('Value') +plt.ylabel('Frequency') +plt.legend(loc='upper right') +plt.title('Histograms of Three Lists') + +# Show plot +plt.show() + +########################################### +# %% +# why do similarities of 97% still map correctly? 
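# --- hedged illustration, not part of the original script ---
# One way to make the question above concrete is to check how separable the
# correct and incorrect groups are at a candidate similarity cut-off. This
# sketch only assumes the three score lists computed in the cells above are
# already in memory; the 0.95 threshold is an arbitrary choice for
# illustration, not a value taken from the original analysis.
for name, scores in [
    ("thing errors", similarity_thing_scores),
    ("property errors", similarity_property_scores),
    ("correct", similarity_correct_scores),
]:
    arr = np.array(scores)
    frac_below = (arr < 0.95).mean() if len(arr) > 0 else float("nan")
    print(f"{name}: {frac_below:.2%} of rows fall below 0.95")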
+score_array = np.array(similarity_correct_scores) +# %% +sum(score_array < 0.95) +# %% +correct_df[score_array < 0.95]['tag_description'].index.to_list() +# %% \ No newline at end of file diff --git a/analysis/mapping_error.py b/analysis/mapping_error.py new file mode 100644 index 0000000..c9ec9d1 --- /dev/null +++ b/analysis/mapping_error.py @@ -0,0 +1,227 @@ +# %% +import pandas as pd +from utils import Retriever, cosine_similarity_chunked +import os +import glob +import numpy as np + +# %% +data_path = f'../data_preprocess/exports/preprocessed_data.csv' +df_pre = pd.read_csv(data_path, skipinitialspace=True) + +# %% +# this should be >0 if we are using abbreviations processed data +desc_list = df_pre['tag_description'].to_list() + +# %% +[ elem for elem in desc_list if isinstance(elem, float)] +########################################## +# %% +fold = 1 +data_path = f'../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv' +df = pd.read_csv(data_path, skipinitialspace=True) + +# %% +# subset to mdm +df = df[df['MDM']] + +thing_condition = df['p_thing'] == df['thing_pattern'] +error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']] + +property_condition = df['p_property'] == df['property_pattern'] +error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']] + +correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']] + +test_df = df + +# %% +# thing_df.to_html('thing_errors.html') +# property_df.to_html('property_errors.html') + +########################################## +# what we need now is understand why the model is making these mispredictions +# import train data and test data +# %% +class Embedder(): + input_df: pd.DataFrame + fold: int + + def __init__(self, input_df): + self.input_df = input_df + + + def make_embedding(self, checkpoint_path): + + def generate_input_list(df): + input_list = [] + for _, row in df.iterrows(): + # name = f"{row['tag_name']}" + desc = f"{row['tag_description']}" + # element = f"{name}{desc}" + element = f"{desc}" + input_list.append(element) + return input_list + + # prepare reference embed + train_data = list(generate_input_list(self.input_df)) + # Define the directory and the pattern + retriever_train = Retriever(train_data, checkpoint_path) + retriever_train.make_mean_embedding(batch_size=64) + return retriever_train.embeddings.to('cpu') + + + +# %% +data_path = f"../data_preprocess/exports/dataset/group_{fold}/train.csv" +train_df = pd.read_csv(data_path, skipinitialspace=True) + +checkpoint_directory = "../train/mapping_pattern" +directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') +# Use glob to find matching paths +# path is usually checkpoint_fold_1/checkpoint- +# we are guaranteed to save only 1 checkpoint from training +pattern = 'checkpoint-*' +checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + +train_embedder = Embedder(input_df=train_df) +train_embeds = train_embedder.make_embedding(checkpoint_path) + +test_embedder = Embedder(input_df=test_df) +test_embeds = test_embedder.make_embedding(checkpoint_path) + + + +# %% +# test embeds are inputs since we are looking back at train data +cos_sim_matrix = cosine_similarity_chunked(test_embeds, train_embeds, chunk_size=8).cpu().numpy() + +# %% +# the following function takes in a full cos_sim_matrix +# condition_source: boolean selectors of the source embedding +# condition_target: boolean selectors of the target 
embedding +def find_closest(cos_sim_matrix, condition_source, condition_target): + # subset_matrix = cos_sim_matrix[condition_source] + # except we are subsetting 2D matrix (row, column) + subset_matrix = cos_sim_matrix[np.ix_(condition_source, condition_target)] + # we select top k here + # Get the indices of the top 5 maximum values along axis 1 + top_k = 5 + top_k_indices = np.argsort(subset_matrix, axis=1)[:, -top_k:] # Get indices of top k values + # note that top_k_indices is a nested list because of the 2d nature of the matrix + # the result is flipped + top_k_indices[0] = top_k_indices[0][::-1] + + # Get the values of the top 5 maximum scores + top_k_values = np.take_along_axis(subset_matrix, top_k_indices, axis=1) + + + return top_k_indices, top_k_values + +# %% +error_thing_df.index + +#################################################### +# special find-back code +# %% +select_idx = 2884 +condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0] +condition_target = np.ones(train_embeds.shape[0], dtype=bool) + +top_k_indices, top_k_values = find_closest( + cos_sim_matrix=cos_sim_matrix, + condition_source=condition_source, + condition_target=condition_target) + +print(top_k_indices) +print(top_k_values) +# %% +train_df.iloc[top_k_indices[0]] +# %% +test_df[test_df.index == select_idx] +#################################################### + +# %% +# make function to compute similarity of closest retrieved result +def compute_similarity(select_idx): + condition_source = test_df['tag_description'] == test_df[test_df.index == select_idx]['tag_description'].tolist()[0] + condition_target = np.ones(train_embeds.shape[0], dtype=bool) + top_k_indices, top_k_values = find_closest( + cos_sim_matrix=cos_sim_matrix, + condition_source=condition_source, + condition_target=condition_target) + + return np.mean(top_k_values[0]) + +# %% +def print_summary(similarity_scores): + # Convert list to numpy array for additional stats + np_array = np.array(similarity_scores) + + # Get stats + mean_value = np.mean(np_array) + percentiles = np.percentile(np_array, [25, 50, 75]) # 25th, 50th, and 75th percentiles + + # Display numpy results + print("Mean:", mean_value) + print("25th, 50th, 75th Percentiles:", percentiles) + + +# %% +########################################## +# Analyze the degree of similarity differences between correct and incorrect results + +# %% +# compute similarity scores for all values in error_thing_df +similarity_thing_scores = [] +for idx in error_thing_df.index: + similarity_thing_scores.append(compute_similarity(idx)) +print_summary(similarity_thing_scores) + + +# %% +similarity_property_scores = [] +for idx in error_property_df.index: + similarity_property_scores.append(compute_similarity(idx)) +print_summary(similarity_property_scores) + +# %% +similarity_correct_scores = [] +for idx in correct_df.index: + similarity_correct_scores.append(compute_similarity(idx)) +print_summary(similarity_correct_scores) + + + +# %% +import matplotlib.pyplot as plt + +# Sample data +list1 = similarity_thing_scores +list2 = similarity_property_scores +list3 = similarity_correct_scores + +# Plot histograms +bins = 50 +plt.hist(list1, bins=bins, alpha=0.5, label='List 1', density=True) +plt.hist(list2, bins=bins, alpha=0.5, label='List 2', density=True) +plt.hist(list3, bins=bins, alpha=0.5, label='List 3', density=True) + +# Labels and legend +plt.xlabel('Value') +plt.ylabel('Frequency') +plt.legend(loc='upper right') 
+plt.title('Histograms of Three Lists') + +# Show plot +plt.show() + +########################################### +# %% +# why do similarities of 97% still map correctly? +score_array = np.array(similarity_correct_scores) +# %% +sum(score_array < 0.95) +# %% +correct_df[score_array < 0.95]['tag_description'].index.to_list() +# %% \ No newline at end of file diff --git a/analysis/utils.py b/analysis/utils.py new file mode 100644 index 0000000..12a0ac5 --- /dev/null +++ b/analysis/utils.py @@ -0,0 +1,75 @@ +import torch +from transformers import AutoTokenizer +from transformers import AutoModelForSeq2SeqLM +import torch.nn.functional as F + + + +class Retriever: + def __init__(self, input_texts, model_checkpoint): + # we need to generate the embedding from list of input strings + self.embeddings = [] + self.inputs = input_texts + model_checkpoint = model_checkpoint + self.tokenizer = AutoTokenizer.from_pretrained("t5-base", return_tensors="pt", clean_up_tokenization_spaces=True) + # define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) + self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") + # device = "cpu" + model.to(self.device) + self.model = model.eval() + + + + + def make_mean_embedding(self, batch_size=32): + all_embeddings = self.embeddings + input_texts = self.inputs + + for i in range(0, len(input_texts), batch_size): + batch_texts = input_texts[i:i+batch_size] + # Tokenize the input text + inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128) + input_ids = inputs.input_ids.to(self.device) + attention_mask = inputs.attention_mask.to(self.device) + + + # Pass the input through the encoder and retrieve the embeddings + with torch.no_grad(): + encoder_outputs = self.model.encoder(input_ids, attention_mask=attention_mask) + embeddings = encoder_outputs.last_hidden_state + + # Compute the mean pooling of the token embeddings + # mean_embedding = embeddings.mean(dim=1) + mean_embedding = (embeddings * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(dim=1, keepdim=True) + all_embeddings.append(mean_embedding) + + # remove the batch list and makes a single large tensor, dim=0 increases row-wise + all_embeddings = torch.cat(all_embeddings, dim=0) + + self.embeddings = all_embeddings + +def cosine_similarity_chunked(batch1, batch2, chunk_size=16): + batch1_size = batch1.size(0) + batch2_size = batch2.size(0) + + # Prepare an empty tensor to store results + cos_sim = torch.empty(batch1_size, batch2_size, device=batch1.device) + + # Process batch1 in chunks + for i in range(0, batch1_size, chunk_size): + batch1_chunk = batch1[i:i + chunk_size] # Get chunk of batch1 + + # Expand batch1 chunk and entire batch2 for comparison + batch1_chunk_exp = batch1_chunk.unsqueeze(1) # Shape: (chunk_size, 1, seq_len) + batch2_exp = batch2.unsqueeze(0) # Shape: (1, batch2_size, seq_len) + + # Compute cosine similarity for the chunk and store it in the final tensor + cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1) + + return cos_sim + diff --git a/data_preprocess/abbreviations/abbreviations_replacer.py b/data_preprocess/abbreviations/abbreviations_replacer.py index 2a40b27..06b5a79 100644 --- 
a/data_preprocess/abbreviations/abbreviations_replacer.py +++ b/data_preprocess/abbreviations/abbreviations_replacer.py @@ -32,7 +32,11 @@ df = pd.read_csv(file_path) # %% # Replace abbreviations print("running substitution") -tag_descriptions = df['tag_description'].fillna("N/A") +df['tag_description']= df['tag_description'].fillna("NOVALUE") +# Replace whitespace-only entries with "NOVALUE" +# note that "N/A" can be read as nan +df['tag_description'] = df['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True) +tag_descriptions = df['tag_description'] replaced_descriptions = replace_abbreviations(tag_descriptions, replacement_dict) # print("Descriptions after replacement:", replaced_descriptions) @@ -40,4 +44,3 @@ replaced_descriptions = replace_abbreviations(tag_descriptions, replacement_dict df["tag_description"] = replaced_descriptions df.to_csv("../exports/preprocessed_data.csv", index=False) print("file saved") -# %% diff --git a/train/classification_all/train.py b/train/classification_all/train.py index 8f4d5e8..4e313d9 100644 --- a/train/classification_all/train.py +++ b/train/classification_all/train.py @@ -101,6 +101,8 @@ import torch import torch.nn as nn import torch.optim as optim +torch.set_float32_matmul_precision('high') + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -128,7 +130,6 @@ output_dim = 203 # 202 classes + 1 out model = NeuralNet(input_dim, output_dim) model = torch.compile(model) model = model.to(device) -torch.set_float32_matmul_precision('high') # %% from torch.utils.data import DataLoader, TensorDataset diff --git a/train/classification_all_with_contrastive/.gitignore b/train/classification_all_with_contrastive/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/train/classification_all_with_contrastive/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/train/classification_all_with_contrastive/train.py b/train/classification_all_with_contrastive/train.py new file mode 100644 index 0000000..d860c31 --- /dev/null +++ b/train/classification_all_with_contrastive/train.py @@ -0,0 +1,448 @@ +# %% +import pandas as pd +import numpy as np +from typing import List +from tqdm import tqdm +from utils import Retriever, cosine_similarity_chunked +import glob +import os + +# import re +import torch +import torch.nn as nn +import torch.nn.functional as F +from tqdm import tqdm +import random +import math + + +# %% +class Embedder(): + input_df: pd.DataFrame + fold: int + + def __init__(self, input_df): + self.input_df = input_df + + + def make_embedding(self, checkpoint_path): + + def generate_input_list(df): + input_list = [] + for _, row in df.iterrows(): + # name = f"{row['tag_name']}" + desc = f"{row['tag_description']}" + # element = f"{name}{desc}" + element = f"{desc}" + input_list.append(element) + return input_list + + # prepare reference embed + train_data = list(generate_input_list(self.input_df)) + # Define the directory and the pattern + retriever_train = Retriever(train_data, checkpoint_path) + retriever_train.make_mean_embedding(batch_size=64) + return retriever_train.embeddings.to('cpu') + +# %% +# input data +fold = 1 +data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" +test_df = pd.read_csv(data_path, skipinitialspace=True) +ships_list = list(set(test_df['ships_idx'])) + +# %% +data_path = '../../data_preprocess/exports/preprocessed_data.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +train_df = full_df[~full_df['ships_idx'].isin(ships_list)] + +# %% 
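# --- hedged sanity check, my addition rather than part of the original script ---
# Before computing embeddings, it can help to confirm that the leave-ships-out
# split above is actually disjoint at the ship level. This assumes only that
# both dataframes carry the 'ships_idx' column, which they do in this pipeline.
train_ships = set(train_df['ships_idx'])
test_ships = set(test_df['ships_idx'])
assert train_ships.isdisjoint(test_ships), "train/test ship overlap detected"
print(f"train rows: {len(train_df)}, test rows: {len(test_df)}")
print(f"unique train ships: {len(train_ships)}, unique test ships: {len(test_ships)}")

# %%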
+checkpoint_directory = "../../train/baseline" +directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') +# Use glob to find matching paths +# path is usually checkpoint_fold_1/checkpoint- +# we are guaranteed to save only 1 checkpoint from training +pattern = 'checkpoint-*' +checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + +train_embedder = Embedder(input_df=train_df) +train_embeds = train_embedder.make_embedding(checkpoint_path) + + +# %% +train_embeds.shape + +# %% +# now we need to generate the class labels +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +mdm_list = sorted(list((set(full_df['pattern'])))) + +# %% +# based on the mdm_labels, we assign a value to the dataframe +def generate_labels(df, mdm_list): + label_list = [] + for _, row in df.iterrows(): + pattern = row['pattern'] + try: + index = mdm_list.index(pattern) + label_list.append(index + 1) + except ValueError: + label_list.append(0) + + return label_list + +# %% +label_list = generate_labels(train_df, mdm_list) + +# # %% +# from collections import Counter +# +# frequency = Counter(label_list) +# frequency + +#################################################### +# %% +# we can start contrastive learning on a re-projection layer for the embedding +################################################# +# MARK: start collaborative filtering + +# we need to create a batch where half are positive examples and the other half +# is negative examples + +# we first need to test out how we can get the embeddings of each ship + +# %% +label_tensor = torch.asarray(label_list) + +def create_pairs(all_embeddings, labels, batch_size): + positive_pairs = [] + negative_pairs = [] + + # find unique ships labels + unique_labels = torch.unique(labels) + + embeddings_by_label = {} + for label in unique_labels: + embeddings_by_label[label.item()] = all_embeddings[labels == label] + + # create positive pairs from the same ship + for _ in range(batch_size // 2): + label = random.choice(unique_labels) + label_embeddings = embeddings_by_label[label.item()] + + # randomly select 2 embeddings from the same ship + if len(label_embeddings) >= 2: # ensure that we can choose + emb1, emb2 = random.sample(list(label_embeddings), 2) + positive_pairs.append((emb1, emb2, torch.tensor(1.0))) + + # create negative pairs (from different ships) + for _ in range(batch_size // 2): + label1, label2 = random.sample(list(unique_labels), 2) + + # select one embedding from each ship + emb1 = random.choice(embeddings_by_label[label1.item()]) + emb2 = random.choice(embeddings_by_label[label2.item()]) + + negative_pairs.append((emb1, emb2, torch.tensor(0.0))) + + pairs = positive_pairs + negative_pairs + + # separate embeddings and labels for the batch + emb1_batch = torch.stack([pair[0] for pair in pairs]) + emb2_batch = torch.stack([pair[1] for pair in pairs]) + labels_batch = torch.stack([pair[2] for pair in pairs]) + + return emb1_batch, emb2_batch, labels_batch + + +# # %% +# # demo of batch creation +# emb1_batch, emb2_batch, labels = create_pairs( +# train_embed, +# ship_labels, +# 64 +# ) +# %% +# create model + +class linear_map(nn.Module): + def __init__(self, input_dim, output_dim): + super(linear_map, self).__init__() + self.linear_1 = nn.Linear(input_dim, 512) + self.linear_2 = nn.Linear(512, output_dim) + self.relu = nn.ReLU() # Non-linearity + + def forward(self, x): + x = self.linear_1(x) + x = self.relu(x) + x = self.linear_2(x) + return x + + +# %% +# 
the contrastive loss +# def contrastive_loss(embedding1, embedding2, label, margin=1.0): +# # calculate euclidean distance +# distance = F.pairwise_distance(embedding1, embedding2) +# +# # loss for positive pairs +# # label will select on positive examples +# positive_loss = label * torch.pow(distance, 2) +# +# # loss for negative pairs +# negative_loss = (1 - label) * torch.pow(torch.clamp(margin - distance, min=0), 2) +# +# loss = torch.mean(positive_loss + negative_loss) +# return loss + + +def contrastive_loss_cosine(embeddings1, embeddings2, label, margin=0.5): + """ + Compute the contrastive loss using cosine similarity. + + Args: + - embeddings1: Tensor of embeddings for one set of pairs, shape (batch_size, embedding_size) + - embeddings2: Tensor of embeddings for the other set of pairs, shape (batch_size, embedding_size) + - label: Tensor of labels, 1 for positive pairs (same class), 0 for negative pairs (different class) + - margin: Margin for negative pairs (default 0.5) + + Returns: + - loss: Contrastive loss based on cosine similarity + """ + # Cosine similarity between the two sets of embeddings + cosine_sim = F.cosine_similarity(embeddings1, embeddings2) + + # For positive pairs, we want the cosine similarity to be close to 1 + positive_loss = label * (1 - cosine_sim) + + # For negative pairs, we want the cosine similarity to be lower than the margin + negative_loss = (1 - label) * F.relu(cosine_sim - margin) + + # Combine the two losses + loss = positive_loss + negative_loss + + # Return the average loss across the batch + return loss.mean() +# %% +# training loop +num_epochs = 50 +batch_size = 512 +train_data_size = train_embeds.shape[0] +output_dim = 512 +learning_rate = 1e-5 +steps_per_epoch = math.ceil(train_data_size / batch_size) + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +torch.set_float32_matmul_precision('high') + +linear_model = linear_map( + input_dim=train_embeds.shape[-1], + output_dim=output_dim) + +linear_model = torch.compile(linear_model) +linear_model.to(device) + +optimizer = torch.optim.Adam(linear_model.parameters(), lr=learning_rate) + +# %% + +for epoch in tqdm(range(num_epochs)): + with tqdm(total=steps_per_epoch, desc=f"Epoch {epoch+1}/{num_epochs}") as pbar: + for _ in range(steps_per_epoch): + emb1_batch, emb2_batch, labels_batch = create_pairs( + train_embeds, + label_tensor, + batch_size + ) + output1 = linear_model(emb1_batch.to(device)) + output2 = linear_model(emb2_batch.to(device)) + + loss = contrastive_loss_cosine(output1, output2, labels_batch.to(device), margin=0.7) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # if epoch % 10 == 0: + # print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}") + pbar.set_postfix({'loss': loss.item()}) + pbar.update(1) + + +# %% +# apply the re-projection layer to achieve better classification +# new_embeds = for loop of model on old embeds + +# we have to transform our previous embeddings into mapped embeddings +def predict_batch(embeds, model, batch_size): + output_list = [] + with torch.no_grad(): + for i in range(0, len(embeds), batch_size): + batch_embed = embeds[i:i+batch_size] + output = model(batch_embed.to(device)) + output_list.append(output) + + all_embeddings = torch.cat(output_list, dim=0) + return all_embeddings + +train_remap_embeds = predict_batch(train_embeds, linear_model, 32) + + +#################################################### +# %% +# we can start classifying + +# %% +import torch +import torch.nn as nn +import torch.optim 
as optim + + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Define the neural network with non-linearity +class NeuralNet(nn.Module): + def __init__(self, input_dim, output_dim): + super(NeuralNet, self).__init__() + self.fc1 = nn.Linear(input_dim, 512) # First layer (input to hidden) + self.relu = nn.ReLU() # Non-linearity + self.fc2 = nn.Linear(512, 256) # Output layer + self.fc3 = nn.Linear(256, output_dim) + + def forward(self, x): + out = self.fc1(x) # Input to hidden + out = self.relu(out) # Apply non-linearity + out = self.fc2(out) # Hidden to output + out = self.relu(out) + out = self.fc3(out) + return out + +# Example usage +input_dim = 512 # Example input dimension (adjust based on your mean embedding size) +output_dim = 203 # 202 classes + 1 out + +model = NeuralNet(input_dim, output_dim) +model = torch.compile(model) +model = model.to(device) +torch.set_float32_matmul_precision('high') + +# %% +from torch.utils.data import DataLoader, TensorDataset + +# we use the re-projected embeds +mean_embeddings = train_remap_embeds +# labels = torch.randint(0, 2, (1000,)) # Random binary labels (0 for OOD, 1 for ID) + +train_labels = generate_labels(train_df, mdm_list) +labels = torch.tensor(train_labels) + +# Create a dataset and DataLoader +dataset = TensorDataset(mean_embeddings, labels) +dataloader = DataLoader(dataset, batch_size=256, shuffle=True) +# %% +# Define loss function and optimizer +# criterion = nn.BCELoss() # Binary cross entropy loss +# criterion = nn.BCEWithLogitsLoss() +criterion = nn.CrossEntropyLoss() +optimizer = optim.Adam(model.parameters(), lr=1e-4) + +# Define the scheduler + + +# Training loop +num_epochs = 200 # Adjust as needed + + +# Define the lambda function for linear decay +# It should return the multiplier for the learning rate (starts at 1.0 and goes to 0) +def linear_decay(epoch): + return 1 - epoch / num_epochs + +scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=linear_decay) + +for epoch in range(num_epochs): + model.train() + running_loss = 0.0 + for inputs, targets in dataloader: + # Forward pass + inputs = inputs.to(device) + targets = targets.to(device) + outputs = model(inputs) + # loss = criterion(outputs.squeeze(), targets.float().squeeze()) # Ensure the target is float + loss = criterion(outputs, targets) + + # Backward pass and optimization + optimizer.zero_grad() + loss.backward() + optimizer.step() + + + running_loss += loss.item() + + + scheduler.step() + + print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(dataloader)}") + + + +# %% +data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" +test_df = pd.read_csv(data_path, skipinitialspace=True) + +checkpoint_directory = "../../train/baseline" +directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') +# Use glob to find matching paths +# path is usually checkpoint_fold_1/checkpoint- +# we are guaranteed to save only 1 checkpoint from training +pattern = 'checkpoint-*' +checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + +test_embedder = Embedder(input_df=test_df) +test_embeds = test_embedder.make_embedding(checkpoint_path) + +test_remap_embeds = predict_batch(test_embeds, linear_model, 32) + +test_labels = generate_labels(test_df, mdm_list) +# %% +mean_embeddings = test_remap_embeds +labels = torch.tensor(test_labels) +dataset = TensorDataset(mean_embeddings, labels) +dataloader = DataLoader(dataset, batch_size=64, shuffle=False) + +model.eval() +output_classes = [] 
+output_probs = [] +for inputs, _ in dataloader: + with torch.no_grad(): + inputs = inputs.to(device) + logits = model(inputs) + probabilities = torch.softmax(logits, dim=1) + # predicted_classes = torch.argmax(probabilities, dim=1) + max_probabilities, predicted_classes = torch.max(probabilities, dim=1) + output_classes.extend(predicted_classes.to('cpu').numpy()) + output_probs.extend(max_probabilities.to('cpu').numpy()) + + +# %% +# evaluation +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix +y_true = test_labels +y_pred = output_classes + +# Compute metrics +accuracy = accuracy_score(y_true, y_pred) +f1 = f1_score(y_true, y_pred, average='macro') +precision = precision_score(y_true, y_pred, average='macro') +recall = recall_score(y_true, y_pred, average='macro') + +# Print the results +print(f'Accuracy: {accuracy:.2f}') +print(f'F1 Score: {f1:.2f}') +print(f'Precision: {precision:.2f}') +print(f'Recall: {recall:.2f}') + + +# %% diff --git a/train/classification_all_with_contrastive/utils.py b/train/classification_all_with_contrastive/utils.py new file mode 100644 index 0000000..12a0ac5 --- /dev/null +++ b/train/classification_all_with_contrastive/utils.py @@ -0,0 +1,75 @@ +import torch +from transformers import AutoTokenizer +from transformers import AutoModelForSeq2SeqLM +import torch.nn.functional as F + + + +class Retriever: + def __init__(self, input_texts, model_checkpoint): + # we need to generate the embedding from list of input strings + self.embeddings = [] + self.inputs = input_texts + model_checkpoint = model_checkpoint + self.tokenizer = AutoTokenizer.from_pretrained("t5-base", return_tensors="pt", clean_up_tokenization_spaces=True) + # define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) + self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") + # device = "cpu" + model.to(self.device) + self.model = model.eval() + + + + + def make_mean_embedding(self, batch_size=32): + all_embeddings = self.embeddings + input_texts = self.inputs + + for i in range(0, len(input_texts), batch_size): + batch_texts = input_texts[i:i+batch_size] + # Tokenize the input text + inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128) + input_ids = inputs.input_ids.to(self.device) + attention_mask = inputs.attention_mask.to(self.device) + + + # Pass the input through the encoder and retrieve the embeddings + with torch.no_grad(): + encoder_outputs = self.model.encoder(input_ids, attention_mask=attention_mask) + embeddings = encoder_outputs.last_hidden_state + + # Compute the mean pooling of the token embeddings + # mean_embedding = embeddings.mean(dim=1) + mean_embedding = (embeddings * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(dim=1, keepdim=True) + all_embeddings.append(mean_embedding) + + # remove the batch list and makes a single large tensor, dim=0 increases row-wise + all_embeddings = torch.cat(all_embeddings, dim=0) + + self.embeddings = all_embeddings + +def cosine_similarity_chunked(batch1, batch2, chunk_size=16): + batch1_size = batch1.size(0) + batch2_size = batch2.size(0) + + # Prepare an empty tensor to store results + cos_sim = torch.empty(batch1_size, batch2_size, 
device=batch1.device) + + # Process batch1 in chunks + for i in range(0, batch1_size, chunk_size): + batch1_chunk = batch1[i:i + chunk_size] # Get chunk of batch1 + + # Expand batch1 chunk and entire batch2 for comparison + batch1_chunk_exp = batch1_chunk.unsqueeze(1) # Shape: (chunk_size, 1, seq_len) + batch2_exp = batch2.unsqueeze(0) # Shape: (1, batch2_size, seq_len) + + # Compute cosine similarity for the chunk and store it in the final tensor + cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1) + + return cos_sim + diff --git a/train/classification_mdm_with_contrastive/.gitignore b/train/classification_mdm_with_contrastive/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/train/classification_mdm_with_contrastive/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/train/classification_mdm_with_contrastive/train.py b/train/classification_mdm_with_contrastive/train.py new file mode 100644 index 0000000..d36c9c5 --- /dev/null +++ b/train/classification_mdm_with_contrastive/train.py @@ -0,0 +1,450 @@ +# %% +import pandas as pd +import numpy as np +from typing import List +from tqdm import tqdm +from utils import Retriever, cosine_similarity_chunked +import glob +import os + +# import re +import torch +import torch.nn as nn +import torch.nn.functional as F +from tqdm import tqdm +import random +import math + + +# %% +class Embedder(): + input_df: pd.DataFrame + fold: int + + def __init__(self, input_df): + self.input_df = input_df + + + def make_embedding(self, checkpoint_path): + + def generate_input_list(df): + input_list = [] + for _, row in df.iterrows(): + # name = f"{row['tag_name']}" + desc = f"{row['tag_description']}" + # element = f"{name}{desc}" + element = f"{desc}" + input_list.append(element) + return input_list + + # prepare reference embed + train_data = list(generate_input_list(self.input_df)) + # Define the directory and the pattern + retriever_train = Retriever(train_data, checkpoint_path) + retriever_train.make_mean_embedding(batch_size=64) + return retriever_train.embeddings.to('cpu') + +# %% +# input data +fold = 1 +data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv" +train_df = pd.read_csv(data_path, skipinitialspace=True) + +# %% +checkpoint_directory = "../../train/baseline" +directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') +# Use glob to find matching paths +# path is usually checkpoint_fold_1/checkpoint- +# we are guaranteed to save only 1 checkpoint from training +pattern = 'checkpoint-*' +checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + +train_embedder = Embedder(input_df=train_df) +train_embeds = train_embedder.make_embedding(checkpoint_path) + + +# %% +train_embeds.shape + +# %% +# now we need to generate the class labels +data_path = '../../data_import/exports/data_mapping_mdm.csv' +full_df = pd.read_csv(data_path, skipinitialspace=True) +mdm_list = sorted(list((set(full_df['pattern'])))) + +# %% +# based on the mdm_labels, we assign a value to the dataframe +def generate_labels(df, mdm_list): + label_list = [] + for _, row in df.iterrows(): + pattern = row['pattern'] + try: + index = mdm_list.index(pattern) + label_list.append(index) + except ValueError: + label_list.append(-1) + + return label_list + +# %% +label_list = generate_labels(train_df, mdm_list) + +# # %% +# from collections import Counter +# +# frequency = Counter(label_list) +# frequency + +#################################################### +# %% +# we can start 
contrastive learning on a re-projection layer for the embedding +################################################# +# MARK: start collaborative filtering + +# we need to create a batch where half are positive examples and the other half +# is negative examples + +# we first need to test out how we can get the embeddings of each ship + +# %% +label_tensor = torch.asarray(label_list) + +def create_pairs(all_embeddings, labels, batch_size): + positive_pairs = [] + negative_pairs = [] + + # find unique ships labels + unique_labels = torch.unique(labels) + + embeddings_by_label = {} + for label in unique_labels: + embeddings_by_label[label.item()] = all_embeddings[labels == label] + + # create positive pairs from the same ship + for _ in range(batch_size // 2): + label = random.choice(unique_labels) + label_embeddings = embeddings_by_label[label.item()] + + # randomly select 2 embeddings from the same ship + if len(label_embeddings) >= 2: # ensure that we can choose + emb1, emb2 = random.sample(list(label_embeddings), 2) + positive_pairs.append((emb1, emb2, torch.tensor(1.0))) + + # create negative pairs (from different ships) + for _ in range(batch_size // 2): + label1, label2 = random.sample(list(unique_labels), 2) + + # select one embedding from each ship + emb1 = random.choice(embeddings_by_label[label1.item()]) + emb2 = random.choice(embeddings_by_label[label2.item()]) + + negative_pairs.append((emb1, emb2, torch.tensor(0.0))) + + pairs = positive_pairs + negative_pairs + + # separate embeddings and labels for the batch + emb1_batch = torch.stack([pair[0] for pair in pairs]) + emb2_batch = torch.stack([pair[1] for pair in pairs]) + labels_batch = torch.stack([pair[2] for pair in pairs]) + + return emb1_batch, emb2_batch, labels_batch + + +# %% +# create model + +class linear_map(nn.Module): + def __init__(self, input_dim, output_dim): + super(linear_map, self).__init__() + self.linear_1 = nn.Linear(input_dim, output_dim) + # self.linear_2 = nn.Linear(512, output_dim) + # self.relu = nn.ReLU() # Non-linearity + + def forward(self, x): + x = self.linear_1(x) + # x = self.relu(x) + # x = self.linear_2(x) + return x + + +# %% +# the contrastive loss +# def contrastive_loss(embedding1, embedding2, label, margin=1.0): +# # calculate euclidean distance +# distance = F.pairwise_distance(embedding1, embedding2) +# +# # loss for positive pairs +# # label will select on positive examples +# positive_loss = label * torch.pow(distance, 2) +# +# # loss for negative pairs +# negative_loss = (1 - label) * torch.pow(torch.clamp(margin - distance, min=0), 2) +# +# loss = torch.mean(positive_loss + negative_loss) +# return loss + + +def contrastive_loss_cosine(embeddings1, embeddings2, label, margin=0.5): + """ + Compute the contrastive loss using cosine similarity. 
+ + Args: + - embeddings1: Tensor of embeddings for one set of pairs, shape (batch_size, embedding_size) + - embeddings2: Tensor of embeddings for the other set of pairs, shape (batch_size, embedding_size) + - label: Tensor of labels, 1 for positive pairs (same class), 0 for negative pairs (different class) + - margin: Margin for negative pairs (default 0.5) + + Returns: + - loss: Contrastive loss based on cosine similarity + """ + # Cosine similarity between the two sets of embeddings + cosine_sim = F.cosine_similarity(embeddings1, embeddings2) + + # For positive pairs, we want the cosine similarity to be close to 1 + positive_loss = label * (1 - cosine_sim) + + # For negative pairs, we want the cosine similarity to be lower than the margin + negative_loss = (1 - label) * F.relu(cosine_sim - margin) + + # Combine the two losses + loss = positive_loss + negative_loss + + # Return the average loss across the batch + return loss.mean() + + +# %% +# training loop +num_epochs = 50 +batch_size = 256 +train_data_size = train_embeds.shape[0] +output_dim = 512 +learning_rate = 2e-6 +steps_per_epoch = math.ceil(train_data_size / batch_size) + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +torch.set_float32_matmul_precision('high') + +linear_model = linear_map( + input_dim=train_embeds.shape[-1], + output_dim=output_dim) + +linear_model = torch.compile(linear_model) +linear_model.to(device) + +optimizer = torch.optim.Adam(linear_model.parameters(), lr=learning_rate) +# Define the lambda function for linear decay +# It should return the multiplier for the learning rate (starts at 1.0 and goes to 0) +def linear_decay(epoch): + return 1 - epoch / num_epochs + +scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=linear_decay) +# %% + +for epoch in tqdm(range(num_epochs)): + with tqdm(total=steps_per_epoch, desc=f"Epoch {epoch+1}/{num_epochs}") as pbar: + for _ in range(steps_per_epoch): + emb1_batch, emb2_batch, labels_batch = create_pairs( + train_embeds, + label_tensor, + batch_size + ) + output1 = linear_model(emb1_batch.to(device)) + output2 = linear_model(emb2_batch.to(device)) + + loss = contrastive_loss_cosine(output1, output2, labels_batch.to(device), margin=0.7) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + scheduler.step() + + # if epoch % 10 == 0: + # print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}") + pbar.set_postfix({'loss': loss.item()}) + pbar.update(1) + + +# %% +# apply the re-projection layer to achieve better classification +# new_embeds = for loop of model on old embeds + +# we have to transform our previous embeddings into mapped embeddings +def predict_batch(embeds, model, batch_size): + output_list = [] + with torch.no_grad(): + for i in range(0, len(embeds), batch_size): + batch_embed = embeds[i:i+batch_size] + output = model(batch_embed.to(device)) + output_list.append(output) + + all_embeddings = torch.cat(output_list, dim=0) + return all_embeddings + +train_remap_embeds = predict_batch(train_embeds, linear_model, 32) + + +#################################################### +# %% +# we can start classifying + +# %% +import torch +import torch.nn as nn +import torch.optim as optim + + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Define the neural network with non-linearity +class NeuralNet(nn.Module): + def __init__(self, input_dim, output_dim): + super(NeuralNet, self).__init__() + self.fc1 = nn.Linear(input_dim, 512) # First layer (input to hidden) + self.relu = 
nn.ReLU() # Non-linearity + self.fc2 = nn.Linear(512, 256) # Output layer + self.fc3 = nn.Linear(256, output_dim) + + def forward(self, x): + out = self.fc1(x) # Input to hidden + out = self.relu(out) # Apply non-linearity + out = self.fc2(out) # Hidden to output + out = self.relu(out) + out = self.fc3(out) + return out + +# Example usage +input_dim = 512 # Example input dimension (adjust based on your mean embedding size) +output_dim = 202 # 202 classes + +model = NeuralNet(input_dim, output_dim) +model = torch.compile(model) +model = model.to(device) +torch.set_float32_matmul_precision('high') + +# %% +from torch.utils.data import DataLoader, TensorDataset + +# we use the re-projected embeds +mean_embeddings = train_remap_embeds +# mean_embeddings = train_embeds +# labels = torch.randint(0, 2, (1000,)) # Random binary labels (0 for OOD, 1 for ID) + +train_labels = generate_labels(train_df, mdm_list) +labels = torch.tensor(train_labels) + +# Create a dataset and DataLoader +dataset = TensorDataset(mean_embeddings, labels) +dataloader = DataLoader(dataset, batch_size=256, shuffle=True) +# %% +# Define loss function and optimizer +# criterion = nn.BCELoss() # Binary cross entropy loss +# criterion = nn.BCEWithLogitsLoss() +criterion = nn.CrossEntropyLoss() +learning_rate = 1e-3 +optimizer = optim.Adam(model.parameters(), lr=learning_rate) + +# Define the scheduler + + +# Training loop +num_epochs = 800 # Adjust as needed + + +# Define the lambda function for linear decay +# It should return the multiplier for the learning rate (starts at 1.0 and goes to 0) +def linear_decay(epoch): + return 1 - epoch / num_epochs + +scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=linear_decay) + +for epoch in range(num_epochs): + model.train() + running_loss = 0.0 + for inputs, targets in dataloader: + # Forward pass + inputs = inputs.to(device) + targets = targets.to(device) + outputs = model(inputs) + # loss = criterion(outputs.squeeze(), targets.float().squeeze()) # Ensure the target is float + loss = criterion(outputs, targets) + + # Backward pass and optimization + optimizer.zero_grad() + loss.backward() + optimizer.step() + + + running_loss += loss.item() + + + scheduler.step() + + print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(dataloader)}") + + + +# %% +data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" +test_df = pd.read_csv(data_path, skipinitialspace=True) +test_df = test_df[test_df['MDM']].reset_index(drop=True) + +checkpoint_directory = "../../train/baseline" +directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}') +# Use glob to find matching paths +# path is usually checkpoint_fold_1/checkpoint- +# we are guaranteed to save only 1 checkpoint from training +pattern = 'checkpoint-*' +checkpoint_path = glob.glob(os.path.join(directory, pattern))[0] + +test_embedder = Embedder(input_df=test_df) +test_embeds = test_embedder.make_embedding(checkpoint_path) + +test_remap_embeds = predict_batch(test_embeds, linear_model, 32) + + +test_labels = generate_labels(test_df, mdm_list) +# %% +# mean_embeddings = test_embeds +mean_embeddings = test_remap_embeds + +labels = torch.tensor(test_labels) +dataset = TensorDataset(mean_embeddings, labels) +dataloader = DataLoader(dataset, batch_size=64, shuffle=False) + +model.eval() +output_classes = [] +output_probs = [] +for inputs, _ in dataloader: + with torch.no_grad(): + inputs = inputs.to(device) + logits = model(inputs) + probabilities = torch.softmax(logits, dim=1) + # 
predicted_classes = torch.argmax(probabilities, dim=1) + max_probabilities, predicted_classes = torch.max(probabilities, dim=1) + output_classes.extend(predicted_classes.to('cpu').numpy()) + output_probs.extend(max_probabilities.to('cpu').numpy()) + + +# %% +# evaluation +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix +y_true = test_labels +y_pred = output_classes + +# Compute metrics +accuracy = accuracy_score(y_true, y_pred) +f1 = f1_score(y_true, y_pred, average='macro') +precision = precision_score(y_true, y_pred, average='macro') +recall = recall_score(y_true, y_pred, average='macro') + +# Print the results +print(f'Accuracy: {accuracy:.2f}') +print(f'F1 Score: {f1:.2f}') +print(f'Precision: {precision:.2f}') +print(f'Recall: {recall:.2f}') + + +# %% diff --git a/train/classification_mdm_with_contrastive/utils.py b/train/classification_mdm_with_contrastive/utils.py new file mode 100644 index 0000000..12a0ac5 --- /dev/null +++ b/train/classification_mdm_with_contrastive/utils.py @@ -0,0 +1,75 @@ +import torch +from transformers import AutoTokenizer +from transformers import AutoModelForSeq2SeqLM +import torch.nn.functional as F + + + +class Retriever: + def __init__(self, input_texts, model_checkpoint): + # we need to generate the embedding from list of input strings + self.embeddings = [] + self.inputs = input_texts + model_checkpoint = model_checkpoint + self.tokenizer = AutoTokenizer.from_pretrained("t5-base", return_tensors="pt", clean_up_tokenization_spaces=True) + # define additional special tokens + additional_special_tokens = ["", "", "", "", "", "", "", "", ""] + # add the additional special tokens to the tokenizer + self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + + model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) + self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") + # device = "cpu" + model.to(self.device) + self.model = model.eval() + + + + + def make_mean_embedding(self, batch_size=32): + all_embeddings = self.embeddings + input_texts = self.inputs + + for i in range(0, len(input_texts), batch_size): + batch_texts = input_texts[i:i+batch_size] + # Tokenize the input text + inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128) + input_ids = inputs.input_ids.to(self.device) + attention_mask = inputs.attention_mask.to(self.device) + + + # Pass the input through the encoder and retrieve the embeddings + with torch.no_grad(): + encoder_outputs = self.model.encoder(input_ids, attention_mask=attention_mask) + embeddings = encoder_outputs.last_hidden_state + + # Compute the mean pooling of the token embeddings + # mean_embedding = embeddings.mean(dim=1) + mean_embedding = (embeddings * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(dim=1, keepdim=True) + all_embeddings.append(mean_embedding) + + # remove the batch list and makes a single large tensor, dim=0 increases row-wise + all_embeddings = torch.cat(all_embeddings, dim=0) + + self.embeddings = all_embeddings + +def cosine_similarity_chunked(batch1, batch2, chunk_size=16): + batch1_size = batch1.size(0) + batch2_size = batch2.size(0) + + # Prepare an empty tensor to store results + cos_sim = torch.empty(batch1_size, batch2_size, device=batch1.device) + + # Process batch1 in chunks + for i in range(0, batch1_size, chunk_size): + batch1_chunk = batch1[i:i + chunk_size] # Get chunk of batch1 + + # Expand batch1 chunk and 
entire batch2 for comparison + batch1_chunk_exp = batch1_chunk.unsqueeze(1) # Shape: (chunk_size, 1, seq_len) + batch2_exp = batch2.unsqueeze(0) # Shape: (1, batch2_size, seq_len) + + # Compute cosine similarity for the chunk and store it in the final tensor + cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1) + + return cos_sim + diff --git a/train/baseline/.gitignore b/train/mapping_baseline/.gitignore similarity index 100% rename from train/baseline/.gitignore rename to train/mapping_baseline/.gitignore diff --git a/train/mapping/.gitignore b/train/mapping_baseline/mapping_prediction/.gitignore similarity index 100% rename from train/mapping/.gitignore rename to train/mapping_baseline/mapping_prediction/.gitignore diff --git a/train/mapping/inference.py b/train/mapping_baseline/mapping_prediction/inference.py similarity index 100% rename from train/mapping/inference.py rename to train/mapping_baseline/mapping_prediction/inference.py diff --git a/train/mapping/output.txt b/train/mapping_baseline/mapping_prediction/output.txt similarity index 100% rename from train/mapping/output.txt rename to train/mapping_baseline/mapping_prediction/output.txt diff --git a/train/mapping/predict.py b/train/mapping_baseline/mapping_prediction/predict.py similarity index 91% rename from train/mapping/predict.py rename to train/mapping_baseline/mapping_prediction/predict.py index fe54e2e..5dbf024 100644 --- a/train/mapping/predict.py +++ b/train/mapping_baseline/mapping_prediction/predict.py @@ -4,16 +4,16 @@ import os import glob from inference import Inference -checkpoint_directory = '../../train/baseline' +checkpoint_directory = '../' def infer_and_select(fold): print(f"Inference for fold {fold}") # import test data - data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv" df = pd.read_csv(data_path, skipinitialspace=True) # get target data - data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" + data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv" train_df = pd.read_csv(data_path, skipinitialspace=True) # processing to help with selection later train_df['thing_property'] = train_df['thing'] + " " + train_df['property'] diff --git a/train/baseline/train.py b/train/mapping_baseline/train.py similarity index 100% rename from train/baseline/train.py rename to train/mapping_baseline/train.py diff --git a/train/mapping_pattern/.gitignore b/train/mapping_pattern/.gitignore new file mode 100644 index 0000000..2e7f3f7 --- /dev/null +++ b/train/mapping_pattern/.gitignore @@ -0,0 +1,2 @@ +checkpoint* +tensorboard-log/ diff --git a/train/mapping_pattern/mapping_prediction/.gitignore b/train/mapping_pattern/mapping_prediction/.gitignore new file mode 100644 index 0000000..e9ebfc9 --- /dev/null +++ b/train/mapping_pattern/mapping_prediction/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +exports/ diff --git a/train/mapping_pattern/mapping_prediction/inference.py b/train/mapping_pattern/mapping_prediction/inference.py new file mode 100644 index 0000000..2232be0 --- /dev/null +++ b/train/mapping_pattern/mapping_prediction/inference.py @@ -0,0 +1,162 @@ +import torch +from torch.utils.data import DataLoader +from transformers import ( + T5TokenizerFast, + AutoModelForSeq2SeqLM, +) +import os +from tqdm import tqdm +from datasets import Dataset +import numpy as np + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +class 
diff --git a/train/baseline/.gitignore b/train/mapping_baseline/.gitignore
similarity index 100%
rename from train/baseline/.gitignore
rename to train/mapping_baseline/.gitignore
diff --git a/train/mapping/.gitignore b/train/mapping_baseline/mapping_prediction/.gitignore
similarity index 100%
rename from train/mapping/.gitignore
rename to train/mapping_baseline/mapping_prediction/.gitignore
diff --git a/train/mapping/inference.py b/train/mapping_baseline/mapping_prediction/inference.py
similarity index 100%
rename from train/mapping/inference.py
rename to train/mapping_baseline/mapping_prediction/inference.py
diff --git a/train/mapping/output.txt b/train/mapping_baseline/mapping_prediction/output.txt
similarity index 100%
rename from train/mapping/output.txt
rename to train/mapping_baseline/mapping_prediction/output.txt
diff --git a/train/mapping/predict.py b/train/mapping_baseline/mapping_prediction/predict.py
similarity index 91%
rename from train/mapping/predict.py
rename to train/mapping_baseline/mapping_prediction/predict.py
index fe54e2e..5dbf024 100644
--- a/train/mapping/predict.py
+++ b/train/mapping_baseline/mapping_prediction/predict.py
@@ -4,16 +4,16 @@
 import os
 import glob
 from inference import Inference
 
-checkpoint_directory = '../../train/baseline'
+checkpoint_directory = '../'
 
 def infer_and_select(fold):
     print(f"Inference for fold {fold}")
     # import test data
-    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
     df = pd.read_csv(data_path, skipinitialspace=True)
 
     # get target data
-    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
     train_df = pd.read_csv(data_path, skipinitialspace=True)
     # processing to help with selection later
     train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
diff --git a/train/baseline/train.py b/train/mapping_baseline/train.py
similarity index 100%
rename from train/baseline/train.py
rename to train/mapping_baseline/train.py
diff --git a/train/mapping_pattern/.gitignore b/train/mapping_pattern/.gitignore
new file mode 100644
index 0000000..2e7f3f7
--- /dev/null
+++ b/train/mapping_pattern/.gitignore
@@ -0,0 +1,2 @@
+checkpoint*
+tensorboard-log/
diff --git a/train/mapping_pattern/mapping_prediction/.gitignore b/train/mapping_pattern/mapping_prediction/.gitignore
new file mode 100644
index 0000000..e9ebfc9
--- /dev/null
+++ b/train/mapping_pattern/mapping_prediction/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+exports/
diff --git a/train/mapping_pattern/mapping_prediction/inference.py b/train/mapping_pattern/mapping_prediction/inference.py
new file mode 100644
index 0000000..2232be0
--- /dev/null
+++ b/train/mapping_pattern/mapping_prediction/inference.py
@@ -0,0 +1,162 @@
+import torch
+from torch.utils.data import DataLoader
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+)
+import os
+from tqdm import tqdm
+from datasets import Dataset
+import numpy as np
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+
+class Inference():
+    tokenizer: T5TokenizerFast
+    model: torch.nn.Module
+    dataloader: DataLoader
+
+    def __init__(self, checkpoint_path):
+        self._create_tokenizer()
+        self._load_model(checkpoint_path)
+
+
+    def _create_tokenizer(self):
+        # %%
+        # load tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # Define additional special tokens
+        additional_special_tokens = ["", "", "", "", "", "", "SIG", "UNIT", "DATA_TYPE"]
+        # Add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    def _load_model(self, checkpoint_path: str):
+        # load model
+        # Define the directory and the pattern
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
+        model = torch.compile(model)
+        # set model to eval
+        self.model = model.eval()
+
+
+
+
+    def prepare_dataloader(self, input_df, batch_size, max_length):
+        """
+        *arguments*
+        - input_df: input dataframe containing fields 'tag_description', 'thing', 'property'
+        - batch_size: the batch size of dataloader output
+        - max_length: length of tokenizer output
+        """
+        print("preparing dataloader")
+        # convert each dataframe row into a dictionary
+        # outputs a list of dictionaries
+        def _process_df(df):
+            output_list = [{
+                'input': f"{row['tag_description']}",
+                'output': f"{row['thing']}{row['property']}",
+            } for _, row in df.iterrows()]
+
+            return output_list
+
+        def _preprocess_function(example):
+            input = example['input']
+            target = example['output']
+            # text_target sets the corresponding label to inputs
+            # there is no need to create a separate 'labels'
+            model_inputs = self.tokenizer(
+                input,
+                text_target=target,
+                max_length=max_length,
+                return_tensors="pt",
+                padding='max_length',
+                truncation=True,
+            )
+            return model_inputs
+
+        test_dataset = Dataset.from_list(_process_df(input_df))
+
+
+        # map maps function to each "row" in the dataset
+        # aka the data in the immediate nesting
+        datasets = test_dataset.map(
+            _preprocess_function,
+            batched=True,
+            num_proc=1,
+            remove_columns=test_dataset.column_names,
+        )
+        # datasets = _preprocess_function(test_dataset)
+        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+        # create dataloader
+        self.dataloader = DataLoader(datasets, batch_size=batch_size)
+
+
+    def generate(self):
+        device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
+        MAX_GENERATE_LENGTH = 128
+
+        pred_generations = []
+        pred_labels = []
+
+        print("start generation")
+        for batch in tqdm(self.dataloader):
+            # Inference in batches
+            input_ids = batch['input_ids']
+            attention_mask = batch['attention_mask']
+            # save labels too
+            pred_labels.extend(batch['labels'])
+
+
+            # Move to GPU if available
+            input_ids = input_ids.to(device)
+            attention_mask = attention_mask.to(device)
+            self.model.to(device)
+
+            # Perform inference
+            with torch.no_grad():
+                outputs = self.model.generate(input_ids,
+                                              attention_mask=attention_mask,
+                                              max_length=MAX_GENERATE_LENGTH)
+
+            # Decode the output and print the results
+            pred_generations.extend(outputs.to("cpu"))
+
+
+
+        # %%
+        # extract sequence and decode
+        def extract_seq(tokens, start_value, end_value):
+            if start_value not in tokens or end_value not in tokens:
+                return None  # Or handle this case according to your requirements
+            start_id = np.where(tokens == start_value)[0][0]
+            end_id = np.where(tokens == end_value)[0][0]
+
+            return tokens[start_id+1:end_id]
+
+
+        def process_tensor_output(tokens):
+            thing_seq = extract_seq(tokens, 32100, 32101)  # 32100 = , 32101 =
+            property_seq = extract_seq(tokens, 32102, 32103)  # 32102 = , 32103 =
+            p_thing = None
+            p_property = None
+            if (thing_seq is not None):
+                p_thing = self.tokenizer.decode(thing_seq, skip_special_tokens=False)
+            if (property_seq is not None):
+                p_property = self.tokenizer.decode(property_seq, skip_special_tokens=False)
+            return p_thing, p_property
+
+        # decode prediction labels
+        def decode_preds(tokens_list):
+            thing_prediction_list = []
+            property_prediction_list = []
+            for tokens in tokens_list:
+                p_thing, p_property = process_tensor_output(tokens)
+                thing_prediction_list.append(p_thing)
+                property_prediction_list.append(p_property)
+            return thing_prediction_list, property_prediction_list
+
+        thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
+        return thing_prediction_list, property_prediction_list
+
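To make the decoding step concrete, here is a toy run of the same extract_seq logic on a hand-made token array; the IDs 32100-32103 stand for the added thing/property start and end markers referenced in the comments above, and the other values are arbitrary:

import numpy as np

def extract_seq(tokens, start_value, end_value):
    # same logic as the nested helper in Inference.generate above
    if start_value not in tokens or end_value not in tokens:
        return None
    start_id = np.where(tokens == start_value)[0][0]
    end_id = np.where(tokens == end_value)[0][0]
    return tokens[start_id + 1:end_id]

# toy generation: a "thing" span (5 6 7) followed by a "property" span (9)
toy = np.array([0, 32100, 5, 6, 7, 32101, 32102, 9, 32103, 1])
print(extract_seq(toy, 32100, 32101))  # [5 6 7] -> decoded into p_thing
print(extract_seq(toy, 32102, 32103))  # [9]     -> decoded into p_property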
diff --git a/train/mapping_pattern/mapping_prediction/output.txt b/train/mapping_pattern/mapping_prediction/output.txt
new file mode 100644
index 0000000..fd0dabc
--- /dev/null
+++ b/train/mapping_pattern/mapping_prediction/output.txt
@@ -0,0 +1,6 @@
+
+Accuracy for fold 1: 0.943208707998107
+Accuracy for fold 2: 0.9214953271028037
+Accuracy for fold 3: 0.9728915662650602
+Accuracy for fold 4: 0.967174119885823
+Accuracy for fold 5: 0.9097572148419606
diff --git a/train/mapping_pattern/mapping_prediction/predict.py b/train/mapping_pattern/mapping_prediction/predict.py
new file mode 100644
index 0000000..102c506
--- /dev/null
+++ b/train/mapping_pattern/mapping_prediction/predict.py
@@ -0,0 +1,71 @@
+
+import pandas as pd
+import os
+import glob
+from inference import Inference
+
+checkpoint_directory = '../'
+
+def infer_and_select(fold):
+    print(f"Inference for fold {fold}")
+    # import test data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+    df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # get target data
+    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+    # processing to help with selection later
+    train_df['thing_property'] = train_df['thing'] + " " + train_df['property']
+
+
+    ##########################################
+    # run inference
+    # checkpoint
+    # Use glob to find matching paths
+    directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+    # Use glob to find matching paths
+    # path is usually checkpoint_fold_1/checkpoint-
+    # we are guaranteed to save only 1 checkpoint from training
+    pattern = 'checkpoint-*'
+    checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+
+    infer = Inference(checkpoint_path)
+    infer.prepare_dataloader(df, batch_size=256, max_length=128)
+    thing_prediction_list, property_prediction_list = infer.generate()
+
+    # add labels too
+    # thing_actual_list, property_actual_list = decode_preds(pred_labels)
+    # Convert the list to a Pandas DataFrame
+    df_out = pd.DataFrame({
+        'p_thing': thing_prediction_list,
+        'p_property': property_prediction_list
+    })
+    # df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
+    # df_out['p_property_correct'] = df_out['p_property'] == df_out['property']
+    df = pd.concat([df, df_out], axis=1)
+
+    # we can save the t5 generation output here
+    df.to_csv(f"exports/result_group_{fold}.csv", index=False)
+
+    # here we want to evaluate mapping accuracy within the valid in mdm data only
+    in_mdm = df['MDM']
+    condition_correct_thing = df['p_thing'] == df['thing_pattern']
+    condition_correct_property = df['p_property'] == df['property_pattern']
+    prediction_mdm_correct = sum(condition_correct_thing & condition_correct_property & in_mdm)
+    pred_correct_proportion = prediction_mdm_correct/sum(in_mdm)
+
+    # write output to file output.txt
+    with open("output.txt", "a") as f:
+        print(f'Accuracy for fold {fold}: {pred_correct_proportion}', file=f)
+
+###########################################
+# Execute for all folds
+
+# reset file before writing to it
+with open("output.txt", "w") as f:
+    print('', file=f)
+
+for fold in [1,2,3,4,5]:
+    infer_and_select(fold)
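The accuracy written to output.txt is the share of MDM rows whose predicted thing and property both match the target patterns; a toy pandas illustration of that masked computation (made-up values, not project data):

import pandas as pd

df = pd.DataFrame({
    'MDM':              [True, True, False, True],
    'p_thing':          ['A',  'B',  'C',   'B'],
    'thing_pattern':    ['A',  'B',  'C',   'X'],
    'p_property':       ['p1', 'p2', 'p3',  'p2'],
    'property_pattern': ['p1', 'p9', 'p3',  'p2'],
})

in_mdm = df['MDM']
correct_thing = df['p_thing'] == df['thing_pattern']
correct_property = df['p_property'] == df['property_pattern']
# only rows inside MDM count toward the denominator
print(sum(correct_thing & correct_property & in_mdm) / sum(in_mdm))  # 1 of 3 MDM rows correct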
diff --git a/train/mapping_pattern/train.py b/train/mapping_pattern/train.py
new file mode 100644
index 0000000..26da0e0
--- /dev/null
+++ b/train/mapping_pattern/train.py
@@ -0,0 +1,195 @@
+# %%
+
+# from datasets import load_from_disk
+import os
+
+os.environ['NCCL_P2P_DISABLE'] = '1'
+os.environ['NCCL_IB_DISABLE'] = '1'
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+
+import torch
+from transformers import (
+    T5TokenizerFast,
+    AutoModelForSeq2SeqLM,
+    DataCollatorForSeq2Seq,
+    Seq2SeqTrainer,
+    EarlyStoppingCallback,
+    Seq2SeqTrainingArguments
+)
+import evaluate
+import numpy as np
+import pandas as pd
+# import matplotlib.pyplot as plt
+from datasets import Dataset, DatasetDict
+
+
+
+torch.set_float32_matmul_precision('high')
+
+# outputs a list of dictionaries
+def process_df_to_dict(df):
+    output_list = []
+    for _, row in df.iterrows():
+        desc = f"{row['tag_description']}"
+        element = {
+            'input' : f"{desc}",
+            'output': f"{row['thing_pattern']}{row['property_pattern']}",
+        }
+        output_list.append(element)
+
+    return output_list
+
+
+def create_split_dataset(fold):
+    # train
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train.csv"
+    train_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    # valid
+    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
+    validation_df = pd.read_csv(data_path, skipinitialspace=True)
+
+    combined_data = DatasetDict({
+        'train': Dataset.from_list(process_df_to_dict(train_df)),
+        'validation' : Dataset.from_list(process_df_to_dict(validation_df)),
+    })
+    return combined_data
+
+
+# function to perform training for a given fold
+def train(fold):
+    save_path = f'checkpoint_fold_{fold}'
+    split_datasets = create_split_dataset(fold)
+
+    # prepare tokenizer
+
+    model_checkpoint = "t5-small"
+    tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
+    # Define additional special tokens
+    additional_special_tokens = ["", "", "", "", "", "", "", "", ""]
+    # Add the additional special tokens to the tokenizer
+    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+    max_length = 120
+
+    # given a dataset entry, run it through the tokenizer
+    def preprocess_function(example):
+        input = example['input']
+        target = example['output']
+        # text_target sets the corresponding label to inputs
+        # there is no need to create a separate 'labels'
+        model_inputs = tokenizer(
+            input,
+            text_target=target,
+            max_length=max_length,
+            truncation=True,
+            padding=True
+        )
+        return model_inputs
+
+    # map maps function to each "row" in the dataset
+    # aka the data in the immediate nesting
+    tokenized_datasets = split_datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=8,
+        remove_columns=split_datasets["train"].column_names,
+    )
+
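As the comments in preprocess_function note, passing text_target to the tokenizer call produces the labels field directly; a quick sketch, assuming only the stock t5-small tokenizer and arbitrary stand-in strings for the description and pattern:

from transformers import T5TokenizerFast

tokenizer = T5TokenizerFast.from_pretrained("t5-small")
enc = tokenizer(
    "PRESSURE GAUGE NO1",             # stand-in tag description
    text_target="GE1 PressureGauge",  # stand-in thing/property pattern
    max_length=120,
    truncation=True,
    padding=True,
)
print(enc.keys())     # dict_keys(['input_ids', 'attention_mask', 'labels'])
print(enc['labels'])  # target token ids, no separate label preprocessing needed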
+    # https://github.com/huggingface/transformers/pull/28414
+    # model_checkpoint = "google/t5-efficient-tiny"
+    # device_map set to auto to force it to load contiguous weights
+    # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
+
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+    # important! after extending tokens vocab
+    model.resize_token_embeddings(len(tokenizer))
+
+    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+    metric = evaluate.load("sacrebleu")
+
+
+    def compute_metrics(eval_preds):
+        preds, labels = eval_preds
+        # In case the model returns more than the prediction logits
+        if isinstance(preds, tuple):
+            preds = preds[0]
+
+        decoded_preds = tokenizer.batch_decode(preds,
+                                               skip_special_tokens=False)
+
+        # Replace -100s in the labels as we can't decode them
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+        decoded_labels = tokenizer.batch_decode(labels,
+                                                skip_special_tokens=False)
+
+        # Remove pad tokens from decoded predictions and labels
+        decoded_preds = [pred.replace(tokenizer.pad_token, '').strip() for pred in decoded_preds]
+        decoded_labels = [[label.replace(tokenizer.pad_token, '').strip()] for label in decoded_labels]
+
+        # Some simple post-processing
+        # decoded_preds = [pred.strip() for pred in decoded_preds]
+        # decoded_labels = [[label.strip()] for label in decoded_labels]
+        # print(decoded_preds, decoded_labels)
+
+        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
+        return {"bleu": result["score"]}
+
+
+    # Generation Config
+    # from transformers import GenerationConfig
+    gen_config = model.generation_config
+    gen_config.max_length = 64
+
+    # compile
+    # model = torch.compile(model, backend="inductor", dynamic=True)
+
+
+    # Trainer
+
+    args = Seq2SeqTrainingArguments(
+        f"{save_path}",
+        eval_strategy="epoch",
+        logging_dir="tensorboard-log",
+        logging_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+        learning_rate=1e-3,
+        per_device_train_batch_size=64,
+        per_device_eval_batch_size=64,
+        auto_find_batch_size=False,
+        ddp_find_unused_parameters=False,
+        weight_decay=0.01,
+        save_total_limit=1,
+        num_train_epochs=20,
+        predict_with_generate=True,
+        bf16=True,
+        push_to_hub=False,
+        generation_config=gen_config,
+        remove_unused_columns=False,
+    )
+
+
+    trainer = Seq2SeqTrainer(
+        model,
+        args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["validation"],
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+    )
+
+    # uncomment to load training from checkpoint
+    # checkpoint_path = 'default_40_1/checkpoint-5600'
+    # trainer.train(resume_from_checkpoint=checkpoint_path)
+
+    trainer.train()
+
+# execute training
+for fold in [1,2,3,4,5]:
+    print(fold)
+    train(fold)
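After training, each fold leaves a single checkpoint directory (save_total_limit=1), which is what mapping_prediction/predict.py later globs for; a sketch of locating and reloading it by hand, assuming the working directory is train/mapping_pattern:

import glob
import os

from transformers import AutoModelForSeq2SeqLM, T5TokenizerFast

fold = 1
# e.g. checkpoint_fold_1/checkpoint-1234 (the exact step number depends on the run)
checkpoint_path = glob.glob(os.path.join(f'checkpoint_fold_{fold}', 'checkpoint-*'))[0]

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
# the checkpoint has a resized embedding matrix, so the same additional special
# tokens added in train.py must be re-added to this tokenizer before decoding,
# exactly as inference.py does in _create_tokenizer()
tokenizer = T5TokenizerFast.from_pretrained("t5-small", clean_up_tokenization_spaces=True)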