# hipom_data_mapping/interpretation/fold_analysis_bert_multiple.py

# Analyze the encoder's embeddings: embed the same test samples with several
# training checkpoints and compare the results in a 2-D t-SNE plot.
# %%
import pandas as pd
import os
import glob
from inference import Embedder_bert
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import torch
from sklearn.preprocessing import StandardScaler


# Folder containing the saved `checkpoint-<step>` directories of the classifier.
checkpoint_directory = 'classification_bert_complete_desc_unit/checkpoint'

# Batch size passed to the embedder's dataloader.
BATCH_SIZE = 512

# Dataset group (fold) to analyse.
fold = 1
print(f"Inference for fold {fold}")


def _join_thing_property(frame):
    """Return the 'thing' and 'property' columns joined by a single space."""
    return frame['thing'] + " " + frame['property']


# Test split, restricted to the rows selected by the 'MDM' column (used as a
# mask); the index is renumbered from 0 afterwards.
data_path = f"../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
df = df[df['MDM']].reset_index(drop=True)

# Training split of the same fold, with the combined key added to help with
# selection later.
data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
train_df['thing_property'] = _join_thing_property(train_df)

# The sorted distinct 'thing_property' strings of the test split form the
# label space: a string's position in `mdm_list` is its integer label.
df['thing_property'] = _join_thing_property(df)
thing_property = df['thing_property'].tolist()
mdm_list = sorted(set(thing_property))

def generate_labels(df, mdm_list):
    """Map each row's ``thing_property`` value to its position in ``mdm_list``.

    Args:
        df: DataFrame with a ``thing_property`` column.
        mdm_list: Sequence of known ``thing_property`` strings; the position
            of a string in this sequence is its label.

    Returns:
        A list with one integer per row of ``df``, in row order: the index of
        the row's ``thing_property`` in ``mdm_list``, or -1 when the value is
        not present (a message is printed for every such row).
    """
    # Build the lookup once so every row costs O(1) instead of a linear
    # ``list.index`` scan. ``setdefault`` keeps the first position of a
    # duplicated entry, which is what ``list.index`` would return.
    index_by_pattern = {}
    for position, pattern in enumerate(mdm_list):
        index_by_pattern.setdefault(pattern, position)

    output_list = []
    for value in df['thing_property']:
        # Stringify the cell before the lookup so non-string cells (e.g. NaN)
        # are compared by their text form.
        index = index_by_pattern.get(f"{value}", -1)
        if index == -1:
            print("Error: value not found in MDM list")
        output_list.append(index)

    return output_list

# Attach an integer class label to every test row.
df['labels'] = generate_labels(df, mdm_list)

# The ten most frequent labels (value_counts orders by descending count).
label_counts = df['labels'].value_counts()
top_1_labels = label_counts.index[:10].to_list()

# Index values of the rows carrying label 56.
# Alternative selection: df['labels'].isin(top_1_labels)
is_selected = df['labels'] == 56
indices = df[is_selected].index.to_list()

# Samples fed to the embedder, renumbered from 0.
# To add a second label, extend `indices` first, e.g. with the rows where
# df['labels'] == 381.
input_df = df.iloc[indices].reset_index(drop=True)

# %%
input_df

# %%
def run(step):
    """Embed ``input_df`` with the checkpoint saved at training step ``step``.

    Args:
        step: Training step number; the checkpoint is loaded from
            ``<checkpoint_directory>/checkpoint-<step>``.

    Returns:
        A ``(embeddings, labels)`` pair. ``embeddings`` is the value of
        ``Embedder_bert.embeddings`` after ``create_embedding()`` has run, and
        ``labels`` is a list holding the string form of ``step`` once per
        embedding.
    """
    model_dir = os.path.join(checkpoint_directory, f'checkpoint-{step}')

    # Run inference over the selected samples with this checkpoint.
    encoder = Embedder_bert(model_dir)
    encoder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
    encoder.create_embedding()
    vectors = encoder.embeddings

    # Tag every embedding with the step it came from, for grouping in the plot.
    step_labels = [f'{step}'] * len(vectors)
    return vectors, step_labels

# %%
# Gather embeddings and their step tags for each checkpoint of interest.
embeddings, labels = [], []
for step in (200, 400, 600, 800):
    embeds, lbs = run(step)
    embeddings += [embeds]
    labels += lbs


# %%
# Flatten the per-checkpoint results: one label array and one tensor stacked
# along the first dimension (torch.cat requires each entry to be a tensor).
labels = np.asarray(labels)
embeddings = torch.cat(embeddings, dim=0)


# %%
# Project the embeddings onto two dimensions with t-SNE; the fixed
# random_state keeps the layout reproducible between runs.
tsne = TSNE(perplexity=5, random_state=0, n_components=2)
embeddings_2d = tsne.fit_transform(embeddings)

# plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5)
# plt.xlim([embeddings_2d[:, 0].min() - 1, embeddings_2d[:, 0].max() + 1])
# plt.ylim([embeddings_2d[:, 1].min() - 1, embeddings_2d[:, 1].max() + 1])
# plt.show()

# %%
# Assign each distinct label a colour, evenly spaced over the 'jet' colormap.
unique_labels = np.unique(labels)
colors = plt.cm.jet(np.linspace(0.0, 1.0, num=len(unique_labels)))
label_to_color = dict(zip(unique_labels, colors))


# Draw one scatter series per label so each gets its own legend entry.
plt.figure(figsize=(8, 6))
for label, color in label_to_color.items():
    idx = labels == label
    points = embeddings_2d[idx]
    plt.scatter(points[:, 0], points[:, 1], color=color, label=label, alpha=0.7)

plt.title('2D t-SNE Visualization of Embeddings')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
# plt.xlim([embeddings_2d[:, 0].min() - 1, embeddings_2d[:, 0].max() + 1])
# plt.ylim([embeddings_2d[:, 1].min() - 1, embeddings_2d[:, 1].max() + 1])
plt.legend(title='Group')
plt.show()

# %%