# (code-viewer metadata from the original listing: 134 lines, 3.7 KiB, Python)
# This script analyzes the embeddings produced by the encoder.
# %%
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

from inference import Embedder_bert

# Directory that contains the `checkpoint-<step>` sub-directories.
checkpoint_directory = 'classification_bert_complete_desc_unit/checkpoint'

BATCH_SIZE = 512

fold = 1
print(f"Inference for fold {fold}")

# Test split of the selected fold; the 'MDM' column is used as a boolean mask
# and the index is renumbered from 0 afterwards.
data_path = f"../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
df = df[df['MDM']].reset_index(drop=True)

# Training split of the same fold (the target data).
data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
# Combined "<thing> <property>" key, to help with selection later.
train_df['thing_property'] = train_df['thing'] + " " + train_df['property']

# Label vocabulary: the sorted, de-duplicated "<thing> <property>" keys of the
# test rows. A key's position in `mdm_list` is its integer label.
df['thing_property'] = df['thing'] + " " + df['property']
thing_property = df['thing_property'].to_list()
mdm_list = sorted(set(thing_property))


def generate_labels(df, mdm_list):
    """Map each row's 'thing_property' value to its position in `mdm_list`.

    Args:
        df: DataFrame with a 'thing_property' column.
        mdm_list: Sequence of known 'thing_property' strings; the position
            of an entry is its label.

    Returns:
        A list with one int per row of `df`, in row order: the index of the
        first matching entry of `mdm_list`, or -1 when the value is not in
        `mdm_list` (an error message is printed for each such row).
    """
    # Build the lookup once instead of scanning `mdm_list` for every row.
    # `setdefault` keeps the first position of a repeated entry, which is
    # what `list.index` would return.
    label_of = {}
    for position, name in enumerate(mdm_list):
        label_of.setdefault(name, position)

    output_list = []
    for value in df['thing_property']:
        index = label_of.get(f"{value}", -1)
        if index == -1:
            print("Error: value not found in MDM list")
        output_list.append(index)

    return output_list


df['labels'] = generate_labels(df, mdm_list)

# Rank labels by counts: despite its name, `top_1_labels` holds the ten most
# frequent label ids (value_counts sorts by descending count).
top_1_labels = df['labels'].value_counts().head(10).index.to_list()

# Label id whose rows are embedded below.
SELECTED_LABEL = 56

# Alternative selection: every row whose label is among the most frequent ones.
# indices = df[df['labels'].isin(top_1_labels)].index.to_list()
indices = df[df['labels'] == SELECTED_LABEL].index.to_list()

# `indices` holds index labels of `df`, which are passed to `.iloc` as positions.
input_df = df.iloc[indices].reset_index(drop=True)
# To add the rows of a second label:
# indices_2 = df[df['labels'] == 381].index.to_list()
# indices.extend(indices_2)

# %%
input_df

# %%
def run(step):
    """Embed `input_df` with the checkpoint saved at training step `step`.

    Args:
        step: Step number; the checkpoint is read from
            `<checkpoint_directory>/checkpoint-<step>`.

    Returns:
        A tuple `(embeddings, labels)`. `embeddings` is the `embeddings`
        attribute of the `Embedder_bert` instance after `create_embedding()`
        has run; `labels` is a list holding `len(embeddings)` copies of the
        step number formatted as a string.
    """
    checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint-{step}')

    # Run inference over the selected rows with this checkpoint.
    embedder = Embedder_bert(checkpoint_path)
    embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
    embedder.create_embedding()
    embeddings = embedder.embeddings

    # Tag every embedding with the step it came from.
    labels = [f'{step}'] * len(embeddings)
    return embeddings, labels


# %%
# Embed the same rows with each checkpoint and pool the results.
embeddings = []
labels = []
for step in [200, 400, 600, 800]:
    embeds, lbs = run(step)
    embeddings.append(embeds)
    labels.extend(lbs)

# %%
# One label array and one tensor, concatenated along dim 0 in the same step order.
labels = np.array(labels)
embeddings = torch.cat(embeddings, dim=0)

# %%
# Reducing dimensions with t-SNE
tsne = TSNE(n_components=2, random_state=0, perplexity=5)
# scikit-learn works on NumPy arrays: convert the torch tensor explicitly so
# the call also works for tensors that track gradients or live on another device.
embeddings_2d = tsne.fit_transform(embeddings.detach().cpu().numpy())

# Quick unlabelled look at the projection:
# plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5)
# plt.xlim([embeddings_2d[:, 0].min() - 1, embeddings_2d[:, 0].max() + 1])
# plt.ylim([embeddings_2d[:, 1].min() - 1, embeddings_2d[:, 1].max() + 1])
# plt.show()

# %%
# Create a color map from labels to colors: one evenly spaced `jet` color per
# distinct label.
unique_labels = np.unique(labels)
colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
label_to_color = dict(zip(unique_labels, colors))

# Plotting: one scatter series per label so that each gets its own legend entry.
plt.figure(figsize=(8, 6))
for label, color in zip(unique_labels, colors):
    idx = (labels == label)
    plt.scatter(
        embeddings_2d[idx, 0],
        embeddings_2d[idx, 1],
        color=color,
        label=label,
        alpha=0.7,
    )

plt.title('2D t-SNE Visualization of Embeddings')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
# plt.xlim([embeddings_2d[:, 0].min() - 1, embeddings_2d[:, 0].max() + 1])
# plt.ylim([embeddings_2d[:, 1].min() - 1, embeddings_2d[:, 1].max() + 1])
plt.legend(title='Group')
plt.show()

# %%