Feat: added classification for all data, including non-mdm, as a
training baseline model
This commit is contained in:
parent
0228c5c0fd
commit
0ad182f2b9
|
@ -0,0 +1 @@
|
||||||
|
__pycache__
|
|
@ -0,0 +1,249 @@
|
||||||
|
# %%
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from typing import List
|
||||||
|
from tqdm import tqdm
|
||||||
|
from utils import Retriever, cosine_similarity_chunked
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
|
||||||
|
# %%
|
||||||
|
class Embedder():
    """Turn the `tag_description` column of a dataframe into mean-pooled embeddings.

    Attributes:
        input_df: rows to embed; only the `tag_description` column is read.
        fold: declared on the class but never assigned by it.
    """
    input_df: pd.DataFrame
    fold: int

    def __init__(self, input_df):
        self.input_df = input_df

    def make_embedding(self, checkpoint_path):
        """Embed every row of `input_df` with the model stored at `checkpoint_path`.

        Returns the `embeddings` attribute of the `Retriever` after
        `make_mean_embedding(batch_size=64)` has run, moved to the CPU.
        """
        # Only the description is fed to the model; the tag name is currently
        # not part of the input text.
        descriptions = [
            f"<DESC>{row['tag_description']}<DESC>"
            for _, row in self.input_df.iterrows()
        ]

        retriever = Retriever(descriptions, checkpoint_path)
        retriever.make_mean_embedding(batch_size=64)
        return retriever.embeddings.to('cpu')
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# input data
|
||||||
|
# The ships that appear in this fold's test file are held out of training.
fold = 2
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
test_df = pd.read_csv(data_path, skipinitialspace=True)
ships_list = list(set(test_df['ships_idx']))

# %%
# Training data: every preprocessed row whose ship is not in the test fold.
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
is_test_ship = full_df['ships_idx'].isin(ships_list)
train_df = full_df[~is_test_ship]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
checkpoint_directory = "../../train/baseline"
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# The path is usually checkpoint_fold_<fold>/checkpoint-<step number>;
# training is expected to save exactly one checkpoint there.
pattern = 'checkpoint-*'
# glob returns matches in arbitrary order, so sort to make the pick
# deterministic should more than one checkpoint ever be present.
checkpoint_candidates = sorted(glob.glob(os.path.join(directory, pattern)))
if not checkpoint_candidates:
    # Fail with the searched location instead of a bare IndexError.
    raise FileNotFoundError(f"no '{pattern}' entry found in '{directory}'")
checkpoint_path = checkpoint_candidates[0]

train_embedder = Embedder(input_df=train_df)
train_embeds = train_embedder.make_embedding(checkpoint_path)
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
train_embeds.shape
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# now we need to generate the class labels
|
||||||
|
# NOTE: this rebinds `full_df` to the MDM mapping table.
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
# Unique patterns in sorted order, so a pattern's position (and therefore
# its class index) does not depend on set iteration order.
mdm_list = sorted(set(full_df['pattern']))
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# based on the mdm_labels, we assign a value to the dataframe
|
||||||
|
def generate_labels(df, mdm_list):
    """Map each row's `pattern` value to an integer class label.

    Args:
        df: dataframe with a `pattern` column.
        mdm_list: ordered sequence of known patterns.

    Returns:
        A list with one int per row, in row order: ``1 + position`` of the
        row's pattern in `mdm_list`, or ``0`` when the pattern is not in
        `mdm_list`.
    """
    if len(df) == 0:
        # No rows means no labels, whatever columns the frame has.
        return []

    # Build the lookup once instead of scanning `mdm_list` for every row.
    # `setdefault` keeps the first position of a repeated pattern, which is
    # what `list.index` would return.
    label_by_pattern = {}
    for position, known_pattern in enumerate(mdm_list, start=1):
        label_by_pattern.setdefault(known_pattern, position)

    return [label_by_pattern.get(pattern, 0) for pattern in df['pattern']]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
label_list = generate_labels(train_df, mdm_list)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
frequency = Counter(label_list)
|
||||||
|
frequency
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
# %%
|
||||||
|
# we can start classifying
|
||||||
|
|
||||||
|
# %%
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.optim as optim
|
||||||
|
|
||||||
|
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
|
||||||
|
# Define the neural network with non-linearity
|
||||||
|
class NeuralNet(nn.Module):
    """Feed-forward classifier: input_dim -> 512 -> 256 -> output_dim.

    A ReLU follows each of the two hidden layers; the last layer returns raw
    logits with no softmax applied.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 512)   # input -> first hidden layer
        self.relu = nn.ReLU()                  # shared by both hidden layers
        self.fc2 = nn.Linear(512, 256)         # first -> second hidden layer
        self.fc3 = nn.Linear(256, output_dim)  # second hidden layer -> logits

    def forward(self, x):
        """Return the class logits for a batch of input vectors `x`."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)
|
||||||
|
|
||||||
|
# Example usage
|
||||||
|
# Size the network from the data instead of hard-coding it.
# Input width = width of the mean embeddings produced above.
input_dim = train_embeds.shape[-1]
# One class per MDM pattern (labels 1..len(mdm_list)) plus label 0 for rows
# whose pattern is not in the MDM list.
output_dim = len(mdm_list) + 1

model = NeuralNet(input_dim, output_dim)
model = torch.compile(model)
model = model.to(device)
torch.set_float32_matmul_precision('high')
|
||||||
|
|
||||||
|
# %%
|
||||||
|
from torch.utils.data import DataLoader, TensorDataset
|
||||||
|
|
||||||
|
# Example mean embeddings and labels (replace these with your actual data)
|
||||||
|
# mean_embeddings = torch.randn(1000, embedding_dim) # 1000 samples of embedding_dim size
|
||||||
|
mean_embeddings = train_embeds
|
||||||
|
# labels = torch.randint(0, 2, (1000,)) # Random binary labels (0 for OOD, 1 for ID)
|
||||||
|
|
||||||
|
train_labels = generate_labels(train_df, mdm_list)
|
||||||
|
labels = torch.tensor(train_labels)
|
||||||
|
|
||||||
|
# Create a dataset and DataLoader
|
||||||
|
dataset = TensorDataset(mean_embeddings, labels)
|
||||||
|
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)
|
||||||
|
# %%
|
||||||
|
# Define loss function and optimizer
|
||||||
|
# criterion = nn.BCELoss() # Binary cross entropy loss
|
||||||
|
# criterion = nn.BCEWithLogitsLoss()
|
||||||
|
criterion = nn.CrossEntropyLoss()
|
||||||
|
optimizer = optim.Adam(model.parameters(), lr=1e-4)
|
||||||
|
|
||||||
|
# Define the scheduler
|
||||||
|
|
||||||
|
|
||||||
|
# Training loop
|
||||||
|
num_epochs = 200 # Adjust as needed
|
||||||
|
|
||||||
|
|
||||||
|
# Define the lambda function for linear decay
|
||||||
|
# It should return the multiplier for the learning rate (starts at 1.0 and goes to 0)
|
||||||
|
def linear_decay(epoch):
    """Learning-rate multiplier: 1.0 at epoch 0, falling linearly to 0 at `num_epochs`."""
    elapsed_fraction = epoch / num_epochs
    return 1 - elapsed_fraction
|
||||||
|
|
||||||
|
# The scheduler multiplies the base learning rate by linear_decay(epoch).
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=linear_decay)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Clear gradients left over from the previous step, then do the
        # forward pass, backward pass and parameter update.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # The learning rate is decayed once per epoch, not once per batch.
    scheduler.step()

    # Report the mean batch loss of this epoch.
    epoch_loss = running_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
test_df = pd.read_csv(data_path, skipinitialspace=True)

checkpoint_directory = "../../train/baseline"
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# The path is usually checkpoint_fold_<fold>/checkpoint-<step number>;
# training is expected to save exactly one checkpoint there.
pattern = 'checkpoint-*'
# glob returns matches in arbitrary order, so sort to make the pick
# deterministic should more than one checkpoint ever be present.
checkpoint_candidates = sorted(glob.glob(os.path.join(directory, pattern)))
if not checkpoint_candidates:
    # Fail with the searched location instead of a bare IndexError.
    raise FileNotFoundError(f"no '{pattern}' entry found in '{directory}'")
checkpoint_path = checkpoint_candidates[0]

test_embedder = Embedder(input_df=test_df)
test_embeds = test_embedder.make_embedding(checkpoint_path)

# Ground-truth class indices for the test rows (0 = pattern not in mdm_list).
test_labels = generate_labels(test_df, mdm_list)
|
||||||
|
# %%
|
||||||
|
mean_embeddings = test_embeds
labels = torch.tensor(test_labels)
dataset = TensorDataset(mean_embeddings, labels)
# shuffle=False keeps predictions in the same order as test_labels
dataloader = DataLoader(dataset, batch_size=64, shuffle=False)

model.eval()
output_classes = []
output_probs = []
# Gradients are not needed for inference.
with torch.no_grad():
    for inputs, _ in dataloader:
        inputs = inputs.to(device)
        logits = model(inputs)
        probabilities = torch.softmax(logits, dim=1)
        # keep the winning class and its probability for every row
        max_probabilities, predicted_classes = torch.max(probabilities, dim=1)
        output_classes.extend(predicted_classes.to('cpu').numpy())
        output_probs.extend(max_probabilities.to('cpu').numpy())
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# evaluation
|
||||||
|
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
|
||||||
|
y_true = test_labels
|
||||||
|
y_pred = output_classes
|
||||||
|
|
||||||
|
# Compute metrics
|
||||||
|
accuracy = accuracy_score(y_true, y_pred)
|
||||||
|
f1 = f1_score(y_true, y_pred, average='macro')
|
||||||
|
precision = precision_score(y_true, y_pred, average='macro')
|
||||||
|
recall = recall_score(y_true, y_pred, average='macro')
|
||||||
|
|
||||||
|
# Print the results
|
||||||
|
print(f'Accuracy: {accuracy:.2f}')
|
||||||
|
print(f'F1 Score: {f1:.2f}')
|
||||||
|
print(f'Precision: {precision:.2f}')
|
||||||
|
print(f'Recall: {recall:.2f}')
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
|
@ -0,0 +1,75 @@
|
||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
from transformers import AutoModelForSeq2SeqLM
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Retriever:
    """Encode a list of strings with a seq2seq checkpoint and mean-pool the encoder output.

    Attributes:
        inputs: the input strings, as passed to the constructor.
        embeddings: an empty list until `make_mean_embedding` runs, then a
            tensor with one mean-pooled vector per input string.
        tokenizer: the "t5-base" tokenizer extended with the special tokens
            listed in `__init__`.
        model: the loaded checkpoint, in eval mode on `device`.
        device: the torch device the model and batches are placed on.
    """

    def __init__(self, input_texts, model_checkpoint):
        """Load the tokenizer and the model found at `model_checkpoint`.

        Args:
            input_texts: list of strings to embed.
            model_checkpoint: path or name accepted by
                `AutoModelForSeq2SeqLM.from_pretrained`.
        """
        self.embeddings = []
        self.inputs = input_texts
        self.tokenizer = AutoTokenizer.from_pretrained("t5-base", return_tensors="pt", clean_up_tokenization_spaces=True)
        # register the additional special tokens with the tokenizer
        additional_special_tokens = ["<thing_start>", "<thing_end>", "<property_start>", "<property_end>", "<name>", "<desc>", "<sig>", "<unit>", "<data_type>"]
        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

        model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
        # Keep using the second GPU when there is one, but fall back to the
        # first GPU on single-GPU machines, where "cuda:1" does not exist.
        if torch.cuda.is_available():
            gpu_index = 1 if torch.cuda.device_count() > 1 else 0
            self.device = torch.device(f"cuda:{gpu_index}")
        else:
            self.device = torch.device("cpu")
        model.to(self.device)
        self.model = model.eval()

    def make_mean_embedding(self, batch_size=32):
        """Compute one mean-pooled encoder embedding per input string.

        The result is stored in `self.embeddings` as a single tensor with the
        batches concatenated along dim 0; nothing is returned. The method can
        be called again: every call recomputes the embeddings from
        `self.inputs`.

        Args:
            batch_size: number of strings tokenized and encoded per step.

        Raises:
            RuntimeError: from `torch.cat` when `self.inputs` is empty.
        """
        input_texts = self.inputs
        # Collect into a local list so that `self.embeddings` (a tensor after
        # a previous call) is never appended to.
        batch_embeddings = []

        for i in range(0, len(input_texts), batch_size):
            batch_texts = input_texts[i:i+batch_size]
            # Tokenize the batch; inputs longer than 128 tokens are truncated.
            inputs = self.tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
            input_ids = inputs.input_ids.to(self.device)
            attention_mask = inputs.attention_mask.to(self.device)

            # Only the encoder is run; no gradients are needed.
            with torch.no_grad():
                encoder_outputs = self.model.encoder(input_ids, attention_mask=attention_mask)
                embeddings = encoder_outputs.last_hidden_state

            # Mean over real tokens only: padded positions are zeroed by the
            # attention mask and excluded from the divisor.
            mean_embedding = (embeddings * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)
            batch_embeddings.append(mean_embedding)

        # Merge the per-batch tensors row-wise into one tensor.
        self.embeddings = torch.cat(batch_embeddings, dim=0)
|
||||||
|
|
||||||
|
def cosine_similarity_chunked(batch1, batch2, chunk_size=16):
    """Pairwise cosine similarity between the rows of `batch1` and `batch2`.

    `batch1` is processed `chunk_size` rows at a time so that the broadcast
    intermediate stays small.

    Returns:
        A tensor of shape (batch1.size(0), batch2.size(0)) created with
        `torch.empty` on `batch1`'s device; entry [i, j] is the similarity
        of batch1[i] and batch2[j].
    """
    n_rows = batch1.size(0)
    n_cols = batch2.size(0)
    result = torch.empty(n_rows, n_cols, device=batch1.device)

    # (1, n_cols, dim) view of batch2, shared by every chunk
    reference = batch2.unsqueeze(0)

    for start in range(0, n_rows, chunk_size):
        stop = start + chunk_size
        # (chunk, 1, dim) broadcasts against (1, n_cols, dim)
        queries = batch1[start:stop].unsqueeze(1)
        result[start:stop] = F.cosine_similarity(queries, reference, dim=-1)

    return result
|
||||||
|
|
Loading…
Reference in New Issue