220 lines
6.8 KiB
Python
220 lines
6.8 KiB
Python
# This script analyzes the embeddings produced by the encoder.
|
|
# %%
|
|
import pandas as pd
|
|
import os
|
|
from inference import Inference, Embedder_t5_encoder
|
|
import numpy as np
|
|
from sklearn.manifold import TSNE
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
# Root folder of the saved checkpoints; later cells join it with
# 'checkpoint_<step>' to pick a specific one.
checkpoint_directory = 'mapping_t5_complete_desc_unit/checkpoint'

# Batch size handed to prepare_dataloader() by the embedding/inference cells.
BATCH_SIZE = 512

# Dataset group (fold) whose files are read below.
fold = 1
print(f"Inference for fold {fold}")

# import test data
data_path = f"../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
# Keep only the rows selected by the 'MDM' column (presumably a boolean flag --
# confirm against the export), then renumber them 0..n-1.
df = (
    pd.read_csv(data_path, skipinitialspace=True)
    .loc[lambda frame: frame['MDM']]
    .reset_index(drop=True)
)
|
|
|
|
# %%
|
|
# get target data
data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)

# Add the combined "<thing> <property>" column to both frames: on train_df it
# helps with selection later, on df it is the source of the labels below.
for frame in (train_df, df):
    frame['thing_property'] = frame['thing'] + " " + frame['property']

# assign labels: the sorted distinct combinations found in the test frame
# define the label vocabulary (position in mdm_list == label id).
thing_property = df['thing_property'].to_list()
mdm_list = sorted(set(thing_property))
|
|
|
|
def generate_labels(df, mdm_list):
    """Map every row's ``thing_property`` value to its position in ``mdm_list``.

    Args:
        df: DataFrame with a ``thing_property`` column.
        mdm_list: Sequence of hashable label values; a value's position in
            the sequence is its label id.

    Returns:
        A list with one integer per row of ``df``, in row order: the index of
        the first occurrence of the stringified ``thing_property`` value in
        ``mdm_list``, or ``-1`` (after printing an error message) when the
        value is not present.
    """
    # Build the value -> index table once instead of scanning mdm_list for
    # every row. setdefault keeps the first occurrence, matching list.index.
    index_of = {}
    for position, value in enumerate(mdm_list):
        index_of.setdefault(value, position)

    output_list = []
    # Iterate the column directly: iterrows() builds a Series per row and may
    # upcast values to a common dtype before they are stringified.
    for value in df['thing_property']:
        index = index_of.get(f"{value}", -1)
        if index == -1:
            print("Error: value not found in MDM list")
        output_list.append(index)

    return output_list
|
|
|
|
# Fine-grained label id of every test row (position of its thing_property
# value in mdm_list).
df['labels'] = generate_labels(df, mdm_list)

# pattern labels: the sorted distinct values of the 'pattern' column form the
# coarse-grained label vocabulary.
patterns = df['pattern'].to_list()
mdm_pattern_list = sorted(set(patterns))
|
|
|
|
def generate_pattern_labels(df, mdm_pattern_list):
    """Map every row's ``pattern`` value to its position in ``mdm_pattern_list``.

    Args:
        df: DataFrame with a ``pattern`` column.
        mdm_pattern_list: Sequence of hashable pattern values; a value's
            position in the sequence is its label id.

    Returns:
        A list with one integer per row of ``df``, in row order: the index of
        the first occurrence of the stringified ``pattern`` value in
        ``mdm_pattern_list``, or ``-1`` (after printing an error message)
        when the value is not present.
    """
    # Build the value -> index table once instead of scanning the list for
    # every row. setdefault keeps the first occurrence, matching list.index.
    index_of = {}
    for position, value in enumerate(mdm_pattern_list):
        index_of.setdefault(value, position)

    output_list = []
    # Iterate the column directly: iterrows() builds a Series per row and may
    # upcast values to a common dtype before they are stringified.
    for value in df['pattern']:
        index = index_of.get(f"{value}", -1)
        if index == -1:
            print("Error: value not found in MDM list")
        output_list.append(index)

    return output_list
|
|
|
|
# Coarse-grained label id of every test row (position of its pattern value in
# mdm_pattern_list).
df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)

# rank labels by counts: value_counts() orders by descending frequency, so the
# first ten index entries are the ten most frequent label ids. Slicing the
# index is purely positional; `[0:10]` on the integer-labelled Series relied
# on `[]` treating an integer slice as positions.
top_10_labels = df['labels'].value_counts().index[:10].to_list()

# Select the rows once and derive both results from that selection. The
# previous code passed index *labels* to .iloc, which is only correct while
# df carries a fresh 0..n-1 RangeIndex.
selected = df[df['labels'].isin(top_10_labels)]
indices = selected.index.to_list()
input_df = selected.reset_index(drop=True)
|
|
|
|
|
|
# %%
|
|
def run(step):
    """Return the embeddings of ``input_df`` computed from one checkpoint.

    Args:
        step: Checkpoint step; the model is loaded from
            ``<checkpoint_directory>/checkpoint_<step>``.

    Returns:
        The ``embeddings`` attribute of the ``Embedder_t5_encoder`` after
        ``create_embedding()`` has run over the module-level ``input_df``.
    """
    embedder = Embedder_t5_encoder(
        os.path.join(checkpoint_directory, f'checkpoint_{step}')
    )
    embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
    embedder.create_embedding()
    return embedder.embeddings
|
|
|
|
# %%
|
|
# Embeddings of input_df from the step-1200 checkpoint.
embeddings = run(step=1200)

# Reducing dimensions with t-SNE (fixed random_state for repeatable layouts)
tsne = TSNE(n_components=2, random_state=0, perplexity=5)
embeddings_2d = tsne.fit_transform(embeddings)

# t-sne plot with complete labels
labels = input_df['labels']

# Create a color map from labels to colors: evenly spaced samples of the
# 'jet' colormap, one per distinct label.
unique_labels = np.unique(labels)
colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
label_to_color = {label: color for label, color in zip(unique_labels, colors)}

# Plotting: one scatter call per label so every label gets a legend entry.
plt.figure(figsize=(8, 6))
for label, color in label_to_color.items():
    idx = (labels == label)
    plt.scatter(
        embeddings_2d[idx, 0],
        embeddings_2d[idx, 1],
        color=color,
        label=label,
        alpha=0.7,
    )

plt.title('2D t-SNE Visualization of Embeddings of fine-grained labels')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.legend(title='Group')
plt.show()
|
|
|
|
# %%
|
|
# t-sne plot with pattern labels: same 2-D projection (embeddings_2d) as the
# previous cell, coloured by the coarse-grained pattern label instead.
labels = input_df['pattern_labels']

# Create a color map from labels to colors: evenly spaced samples of the
# 'jet' colormap, one per distinct label.
unique_labels = np.unique(labels)
colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
label_to_color = {label: color for label, color in zip(unique_labels, colors)}

# Plotting: one scatter call per label so every label gets a legend entry.
plt.figure(figsize=(8, 6))
for label, color in label_to_color.items():
    idx = (labels == label)
    plt.scatter(
        embeddings_2d[idx, 0],
        embeddings_2d[idx, 1],
        color=color,
        label=label,
        alpha=0.7,
    )

plt.title('2D t-SNE Visualization of Embeddings of coarse-grained labels')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.legend(title='Group')
plt.show()
|
|
|
|
##############################################
|
|
# %%
|
|
# demonstrate decoding to correct output
step = 1200
checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
infer = Inference(checkpoint_path)
infer.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
thing_prediction_list, property_prediction_list = infer.generate()

# add labels too
# thing_actual_list, property_actual_list = decode_preds(pred_labels)
# Convert the list to a Pandas DataFrame
df_out = pd.DataFrame({
    'p_thing': thing_prediction_list,
    'p_property': property_prediction_list,
})
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']

# Attach the predictions next to the ground truth. Prediction columns left
# over from a previous run of this cell are dropped first; otherwise a re-run
# would duplicate 'p_thing' / 'p_property' and the column lookups below would
# return DataFrames instead of Series.
input_df = pd.concat(
    [input_df.drop(columns=df_out.columns, errors='ignore'), df_out],
    axis=1,
)

# A row counts as correct only when both thing and property match.
condition_correct_thing = input_df['p_thing'] == input_df['thing']
condition_correct_property = input_df['p_property'] == input_df['property']
prediction_mdm_correct = int(
    (condition_correct_thing & condition_correct_property).sum()
)
pred_correct_proportion = prediction_mdm_correct / len(input_df)
print(pred_correct_proportion)
|
|
|
|
# %%
|
|
# Cell output: actual vs. predicted thing/property columns. This is a bare
# expression, so it only renders when the cell is run interactively.
input_df[['thing', 'p_thing', 'property', 'p_property']]
|
|
|
|
|
|
# %%
|
|
# def run(step):
|
|
# checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
|
|
# embedder = Embedder_t5_decoder(checkpoint_path)
|
|
# embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
|
|
# embedder.create_embedding()
|
|
# embeddings = embedder.embeddings
|
|
# return embeddings
|
|
#
|
|
# # %%
|
|
# embeddings = (run(step=1200))
|
|
# # Reducing dimensions with t-SNE
|
|
# tsne = TSNE(n_components=2, random_state=0, perplexity=5)
|
|
# embeddings_2d = tsne.fit_transform(embeddings)
|
|
#
|
|
# # t-sne plot with complete labels
|
|
# labels = input_df['labels']
|
|
#
|
|
# # Create a color map from labels to colors
|
|
# unique_labels = np.unique(labels)
|
|
# colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
|
# label_to_color = dict(zip(unique_labels, colors))
|
|
#
|
|
# # Plotting
|
|
# plt.figure(figsize=(8, 6))
|
|
# for label in unique_labels:
|
|
# idx = (labels == label)
|
|
# plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
|
#
|
|
# plt.title('2D t-SNE Visualization of Embeddings of fine-grained labels')
|
|
# plt.xlabel('Component 1')
|
|
# plt.ylabel('Component 2')
|
|
# plt.legend(title='Group')
|
|
# plt.show()
|
|
#
|
|
# # %%
|
|
# # t-sne plot with pattern labels
|
|
# labels = input_df['pattern_labels']
|
|
#
|
|
# # Create a color map from labels to colors
|
|
# unique_labels = np.unique(labels)
|
|
# colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
|
# label_to_color = dict(zip(unique_labels, colors))
|
|
#
|
|
# # Plotting
|
|
# plt.figure(figsize=(8, 6))
|
|
# for label in unique_labels:
|
|
# idx = (labels == label)
|
|
# plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
|
#
|
|
# plt.title('2D t-SNE Visualization of Embeddings of coarse-grained labels')
|
|
# plt.xlabel('Component 1')
|
|
# plt.ylabel('Component 2')
|
|
# plt.legend(title='Group')
|
|
# plt.show()
|
|
|
|
|
|
# %%
|