diff --git a/post_process/classification/train.py b/post_process/classification/train.py
deleted file mode 100644
index e69de29..0000000
diff --git a/train/classification_all/.gitignore b/train/classification_all/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/train/classification_all/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/train/classification_all/train.py b/train/classification_all/train.py
new file mode 100644
index 0000000..8f4d5e8
--- /dev/null
+++ b/train/classification_all/train.py
@@ -0,0 +1,249 @@
+# %%
+import pandas as pd
+import numpy as np
+from typing import List
+from tqdm import tqdm
+from utils import Retriever, cosine_similarity_chunked
+import glob
+import os
+
+# %%
+class Embedder():
+    input_df: pd.DataFrame
+    fold: int
+
+    def __init__(self, input_df):
+        self.input_df = input_df
+
+    def make_embedding(self, checkpoint_path):
+
+        def generate_input_list(df):
+            input_list = []
+            for _, row in df.iterrows():
+                # name = f"{row['tag_name']}"
+                desc = f"{row['tag_description']}"
+                # element = f"{name}{desc}"
+                element = f"{desc}"
+                input_list.append(element)
+            return input_list
+
+        # prepare the reference embeddings
+        train_data = list(generate_input_list(self.input_df))
+        retriever_train = Retriever(train_data, checkpoint_path)
+        retriever_train.make_mean_embedding(batch_size=64)
+        return retriever_train.embeddings.to('cpu')
+
+# %%
+# input data
+fold = 2
+data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+test_df = pd.read_csv(data_path, skipinitialspace=True)
+ships_list = list(set(test_df['ships_idx']))
+
+# %%
+data_path = '../../data_preprocess/exports/preprocessed_data.csv'
+full_df = pd.read_csv(data_path, skipinitialspace=True)
+train_df = full_df[~full_df['ships_idx'].isin(ships_list)]
+
+# %%
+checkpoint_directory = "../../train/baseline"
+directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+# Use glob to find the checkpoint path.
+# The path is usually checkpoint_fold_<n>/checkpoint-<step>;
+# we are guaranteed to save only 1 checkpoint from training.
+pattern = 'checkpoint-*'
+checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+train_embedder = Embedder(input_df=train_df)
+train_embeds = train_embedder.make_embedding(checkpoint_path)
+
+# %%
+train_embeds.shape
+
+# %%
+# now we need to generate the class labels
+data_path = '../../data_import/exports/data_mapping_mdm.csv'
+full_df = pd.read_csv(data_path, skipinitialspace=True)
+mdm_list = sorted(set(full_df['pattern']))
+
+# %%
+# based on mdm_list, assign an integer label to each row:
+# patterns found in mdm_list get index + 1, anything else gets 0
+def generate_labels(df, mdm_list):
+    label_list = []
+    for _, row in df.iterrows():
+        pattern = row['pattern']
+        try:
+            index = mdm_list.index(pattern)
+            label_list.append(index + 1)
+        except ValueError:
+            label_list.append(0)
+
+    return label_list
+
+# %%
+label_list = generate_labels(train_df, mdm_list)
+
+# %%
+from collections import Counter
+
+frequency = Counter(label_list)
+frequency
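As a quick illustration of the labelling convention above (a pattern found in `mdm_list` maps to its index plus one, anything else maps to 0), here is a minimal sketch that reuses `generate_labels`; the toy patterns are made up and not part of the patch:

```python
import pandas as pd

# hypothetical pattern list and rows, purely illustrative
toy_mdm_list = sorted({"PressureX", "TempA"})          # -> ['PressureX', 'TempA']
toy_df = pd.DataFrame({'pattern': ['TempA', 'NotInMDM', 'PressureX']})

print(generate_labels(toy_df, toy_mdm_list))           # [2, 0, 1]
```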
+####################################################
+# %%
+# we can start classifying
+
+# %%
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Define the neural network with non-linearity
+class NeuralNet(nn.Module):
+    def __init__(self, input_dim, output_dim):
+        super(NeuralNet, self).__init__()
+        self.fc1 = nn.Linear(input_dim, 512)   # input to first hidden layer
+        self.relu = nn.ReLU()                  # non-linearity
+        self.fc2 = nn.Linear(512, 256)         # second hidden layer
+        self.fc3 = nn.Linear(256, output_dim)  # output layer
+
+    def forward(self, x):
+        out = self.fc1(x)     # input to hidden
+        out = self.relu(out)  # apply non-linearity
+        out = self.fc2(out)   # hidden to hidden
+        out = self.relu(out)
+        out = self.fc3(out)   # hidden to logits
+        return out
+
+# Example usage
+input_dim = 512   # adjust to the size of the mean embedding
+output_dim = 203  # 202 MDM classes + 1 "not in MDM" class (label 0)
+
+model = NeuralNet(input_dim, output_dim)
+model = torch.compile(model)
+model = model.to(device)
+torch.set_float32_matmul_precision('high')
+
+# %%
+from torch.utils.data import DataLoader, TensorDataset
+
+# Mean embeddings and labels (the commented lines show placeholder random data)
+# mean_embeddings = torch.randn(1000, embedding_dim)  # 1000 samples of embedding_dim size
+mean_embeddings = train_embeds
+# labels = torch.randint(0, 2, (1000,))  # random binary labels (0 for OOD, 1 for ID)
+
+train_labels = generate_labels(train_df, mdm_list)
+labels = torch.tensor(train_labels)
+
+# Create a dataset and DataLoader
+dataset = TensorDataset(mean_embeddings, labels)
+dataloader = DataLoader(dataset, batch_size=256, shuffle=True)
+
+# %%
+# Define loss function and optimizer
+# criterion = nn.BCELoss()  # binary cross entropy loss
+# criterion = nn.BCEWithLogitsLoss()
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.Adam(model.parameters(), lr=1e-4)
+
+# Training loop
+num_epochs = 200  # adjust as needed
+
+# Define the lambda function for linear decay.
+# It returns the multiplier for the learning rate (starts at 1.0 and decays to 0).
+def linear_decay(epoch):
+    return 1 - epoch / num_epochs
+
+# Define the scheduler
+scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=linear_decay)
+
+for epoch in range(num_epochs):
+    model.train()
+    running_loss = 0.0
+    for inputs, targets in dataloader:
+        # Forward pass
+        inputs = inputs.to(device)
+        targets = targets.to(device)
+        outputs = model(inputs)
+        # loss = criterion(outputs.squeeze(), targets.float().squeeze())  # ensure the target is float
+        loss = criterion(outputs, targets)
+
+        # Backward pass and optimization
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        running_loss += loss.item()
+
+    scheduler.step()
+
+    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(dataloader)}")
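For reference, `LambdaLR` scales the optimizer's base learning rate by whatever `linear_decay(epoch)` returns, so with `lr=1e-4` the effective rate falls linearly towards zero over the 200 epochs. A standalone sketch of just the schedule (dummy parameter and hypothetical values, not part of the patch):

```python
import torch
import torch.nn as nn
import torch.optim as optim

num_epochs = 200
opt = optim.Adam([nn.Parameter(torch.zeros(1))], lr=1e-4)
sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda=lambda e: 1 - e / num_epochs)

for epoch in range(3):
    opt.step()        # a real training step would go here
    sched.step()
    print(epoch, sched.get_last_lr())  # [1e-4 * (1 - (epoch + 1) / num_epochs)]
```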
+# %%
+data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
+test_df = pd.read_csv(data_path, skipinitialspace=True)
+
+checkpoint_directory = "../../train/baseline"
+directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
+# Use glob to find the checkpoint path.
+# The path is usually checkpoint_fold_<n>/checkpoint-<step>;
+# we are guaranteed to save only 1 checkpoint from training.
+pattern = 'checkpoint-*'
+checkpoint_path = glob.glob(os.path.join(directory, pattern))[0]
+
+test_embedder = Embedder(input_df=test_df)
+test_embeds = test_embedder.make_embedding(checkpoint_path)
+
+test_labels = generate_labels(test_df, mdm_list)
+
+# %%
+mean_embeddings = test_embeds
+labels = torch.tensor(test_labels)
+dataset = TensorDataset(mean_embeddings, labels)
+dataloader = DataLoader(dataset, batch_size=64, shuffle=False)
+
+model.eval()
+output_classes = []
+output_probs = []
+for inputs, _ in dataloader:
+    with torch.no_grad():
+        inputs = inputs.to(device)
+        logits = model(inputs)
+        probabilities = torch.softmax(logits, dim=1)
+        # predicted_classes = torch.argmax(probabilities, dim=1)
+        max_probabilities, predicted_classes = torch.max(probabilities, dim=1)
+        output_classes.extend(predicted_classes.to('cpu').numpy())
+        output_probs.extend(max_probabilities.to('cpu').numpy())
+
+# %%
+# evaluation
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
+y_true = test_labels
+y_pred = output_classes
+
+# Compute metrics
+accuracy = accuracy_score(y_true, y_pred)
+f1 = f1_score(y_true, y_pred, average='macro')
+precision = precision_score(y_true, y_pred, average='macro')
+recall = recall_score(y_true, y_pred, average='macro')
+
+# Print the results
+print(f'Accuracy: {accuracy:.2f}')
+print(f'F1 Score: {f1:.2f}')
+print(f'Precision: {precision:.2f}')
+print(f'Recall: {recall:.2f}')
+
+# %%
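Since labels were built as `mdm_list.index(pattern) + 1`, a predicted class `c > 0` maps back to a pattern string via `mdm_list[c - 1]`, while `c == 0` means "not in MDM". A small sketch of how the predictions above could be decoded (not part of the patch; the 0.9 confidence cut-off is an arbitrary example):

```python
# map predicted class indices back to MDM pattern strings; 0 = "not in MDM"
decoded = [mdm_list[c - 1] if c > 0 else None for c in output_classes]

# keep only confident in-MDM predictions
confident = [
    (pattern, float(prob))
    for pattern, prob, c in zip(decoded, output_probs, output_classes)
    if c > 0 and prob >= 0.9
]
```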
diff --git a/train/classification_all/utils.py b/train/classification_all/utils.py
new file mode 100644
index 0000000..12a0ac5
--- /dev/null
+++ b/train/classification_all/utils.py
@@ -0,0 +1,75 @@
+import torch
+from transformers import AutoTokenizer
+from transformers import AutoModelForSeq2SeqLM
+import torch.nn.functional as F
+
+
+class Retriever:
+    def __init__(self, input_texts, model_checkpoint):
+        # generate embeddings from a list of input strings
+        self.embeddings = []
+        self.inputs = input_texts
+        self.tokenizer = AutoTokenizer.from_pretrained("t5-base", return_tensors="pt", clean_up_tokenization_spaces=True)
+        # define additional special tokens
+        additional_special_tokens = ["", "", "", "", "", "", "", "", ""]
+        # add the additional special tokens to the tokenizer
+        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+        self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
+        # device = "cpu"
+        model.to(self.device)
+        self.model = model.eval()
+
+    def make_mean_embedding(self, batch_size=32):
+        all_embeddings = self.embeddings
+        input_texts = self.inputs
+
+        for i in range(0, len(input_texts), batch_size):
+            batch_texts = input_texts[i:i+batch_size]
+            # Tokenize the input text
+            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
+            input_ids = inputs.input_ids.to(self.device)
+            attention_mask = inputs.attention_mask.to(self.device)
+
+            # Pass the input through the encoder and retrieve the embeddings
+            with torch.no_grad():
+                encoder_outputs = self.model.encoder(input_ids, attention_mask=attention_mask)
+                embeddings = encoder_outputs.last_hidden_state
+
+                # Mask-aware mean pooling of the token embeddings (padding positions excluded)
+                # mean_embedding = embeddings.mean(dim=1)
+                mean_embedding = (embeddings * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)
+                all_embeddings.append(mean_embedding)
+
+        # concatenate the per-batch tensors into one large tensor; dim=0 stacks row-wise
+        all_embeddings = torch.cat(all_embeddings, dim=0)
+
+        self.embeddings = all_embeddings
+
+
+def cosine_similarity_chunked(batch1, batch2, chunk_size=16):
+    batch1_size = batch1.size(0)
+    batch2_size = batch2.size(0)
+
+    # Prepare an empty tensor to store results
+    cos_sim = torch.empty(batch1_size, batch2_size, device=batch1.device)
+
+    # Process batch1 in chunks
+    for i in range(0, batch1_size, chunk_size):
+        batch1_chunk = batch1[i:i + chunk_size]  # Get chunk of batch1
+
+        # Expand batch1 chunk and entire batch2 for comparison
+        batch1_chunk_exp = batch1_chunk.unsqueeze(1)  # Shape: (chunk_size, 1, embed_dim)
+        batch2_exp = batch2.unsqueeze(0)              # Shape: (1, batch2_size, embed_dim)
+
+        # Compute cosine similarity for the chunk and store it in the final tensor
+        cos_sim[i:i + chunk_size] = F.cosine_similarity(batch1_chunk_exp, batch2_exp, dim=-1)
+
+    return cos_sim
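`cosine_similarity_chunked` is imported by train.py but not exercised in this patch. A minimal usage sketch with random stand-in embeddings (the 512-dimensional size is an assumption chosen to match the `input_dim` used in train.py; the real value is whatever the encoder produces):

```python
import torch
from utils import cosine_similarity_chunked

train_embeds = torch.randn(100, 512)  # stand-ins for mean embeddings
test_embeds = torch.randn(40, 512)

sim = cosine_similarity_chunked(train_embeds, test_embeds, chunk_size=16)
print(sim.shape)  # torch.Size([100, 40]); sim[i, j] = cos(train_embeds[i], test_embeds[j])
```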