# This script analyzes the embeddings of the encoder.
# %%
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

from inference import Inference, Embedder_t5_encoder, Embedder_t5_decoder

checkpoint_directory = 'mapping_t5_complete_desc_unit/checkpoint'

BATCH_SIZE = 512

fold = 1
print(f"Inference for fold {fold}")

# import test data
data_path = f"../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
# keep only the rows selected by the 'MDM' column, then renumber them 0..n-1
df = df[df['MDM']].reset_index(drop=True)

# %%
# get target data
data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path, skipinitialspace=True)
# processing to help with selection later
train_df['thing_property'] = train_df['thing'] + " " + train_df['property']


def _index_labels(df, column, vocabulary):
    """Map every value of ``df[column]`` to its position in ``vocabulary``.

    Each value is formatted as a string before the lookup. A value that is
    not present in ``vocabulary`` is reported on stdout and mapped to -1.

    Args:
        df: DataFrame holding ``column``.
        column: Name of the column whose values are looked up.
        vocabulary: Sequence of hashable entries defining the label ids.

    Returns:
        A list with one integer per row of ``df``.
    """
    # Build the lookup once instead of scanning the list for every row.
    # setdefault keeps the first position of a repeated entry, which is
    # what list.index() would have returned.
    position = {}
    for i, entry in enumerate(vocabulary):
        position.setdefault(entry, i)

    output_list = []
    for value in df[column]:
        index = position.get(f"{value}", -1)
        if index == -1:
            print("Error: value not found in MDM list")
        output_list.append(index)
    return output_list


def generate_labels(df, mdm_list):
    """Return, per row of ``df``, the index of its 'thing_property' in ``mdm_list`` (-1 if absent)."""
    return _index_labels(df, 'thing_property', mdm_list)


def generate_pattern_labels(df, mdm_pattern_list):
    """Return, per row of ``df``, the index of its 'pattern' in ``mdm_pattern_list`` (-1 if absent)."""
    return _index_labels(df, 'pattern', mdm_pattern_list)


# assign labels
df['thing_property'] = df['thing'] + " " + df['property']
thing_property = df['thing_property'].to_list()
mdm_list = sorted(set(thing_property))
df['labels'] = generate_labels(df, mdm_list)

# pattern labels
patterns = df['pattern'].to_list()
mdm_pattern_list = sorted(set(patterns))
df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)

# rank labels by counts
top_10_labels = df['labels'].value_counts().head(10).index.to_list()
# `indices` holds index labels, so select with .loc rather than .iloc; the
# two coincide only because df was given a fresh 0..n-1 index above.
indices = df.index[df['labels'].isin(top_10_labels)].to_list()
input_df = df.loc[indices].reset_index(drop=True)

# %%
def run(step, embedder_cls=Embedder_t5_encoder):
    """Create embeddings of ``input_df`` with the checkpoint saved at ``step``.

    Args:
        step: Checkpoint step; the checkpoint is loaded from
            ``checkpoint_directory/checkpoint_{step}``.
        embedder_cls: Class instantiated with the checkpoint path. It must
            provide ``prepare_dataloader``, ``create_embedding`` and an
            ``embeddings`` attribute. Defaults to ``Embedder_t5_encoder``.

    Returns:
        The ``embeddings`` attribute of the embedder after
        ``create_embedding()`` has been called.
    """
    checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
    embedder = embedder_cls(checkpoint_path)
    embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
    embedder.create_embedding()
    return embedder.embeddings


def _plot_tsne(points_2d, labels, title):
    """Scatter-plot 2D points with one colour per distinct label.

    Args:
        points_2d: Array indexed as ``points_2d[mask, 0]`` / ``[mask, 1]``.
        labels: One label per row of ``points_2d``.
        title: Figure title.
    """
    # Create a color map from labels to colors
    unique_labels = np.unique(labels)
    colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
    label_to_color = dict(zip(unique_labels, colors))

    # Plotting
    plt.figure(figsize=(8, 6))
    for label in unique_labels:
        # plain boolean ndarray so the NumPy indexing does not depend on
        # how a pandas Series is coerced
        idx = np.asarray(labels == label)
        plt.scatter(
            points_2d[idx, 0],
            points_2d[idx, 1],
            color=label_to_color[label],
            label=label,
            alpha=0.7,
        )

    plt.title(title)
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.legend(title='Group')
    plt.show()


# %%
embeddings = run(step=1200)

# Reducing dimensions with t-SNE
tsne = TSNE(n_components=2, random_state=0, perplexity=5)
embeddings_2d = tsne.fit_transform(embeddings)

# t-sne plot with complete labels
labels = input_df['labels']
_plot_tsne(
    embeddings_2d,
    labels,
    '2D t-SNE Visualization of Embeddings of fine-grained labels',
)

# %%
# t-sne plot with pattern labels
labels = input_df['pattern_labels']
_plot_tsne(
    embeddings_2d,
    labels,
    '2D t-SNE Visualization of Embeddings of coarse-grained labels',
)

##############################################
# %%
# demonstrate decoding to correct output
step = 1200
checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
infer = Inference(checkpoint_path)
infer.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
thing_prediction_list, property_prediction_list = infer.generate()

# add labels too
# thing_actual_list, property_actual_list = decode_preds(pred_labels)
# Convert the list to a Pandas DataFrame
df_out = pd.DataFrame({
    'p_thing': thing_prediction_list,
    'p_property': property_prediction_list
})
# df_out['p_thing_correct'] = df_out['p_thing'] == df_out['thing']
# df_out['p_property_correct'] = df_out['p_property'] == df_out['property']

# Attach the predictions by position. assign() overwrites existing p_*
# columns, so re-running this cell does not leave duplicate columns behind
# (which an axis=1 concat would), and passing plain arrays makes a length
# mismatch raise instead of silently padding with NaN.
input_df = input_df.assign(
    p_thing=df_out['p_thing'].to_numpy(),
    p_property=df_out['p_property'].to_numpy(),
)

# a row counts as correct only when both the thing and the property match
condition_correct_thing = input_df['p_thing'] == input_df['thing']
condition_correct_property = input_df['p_property'] == input_df['property']
prediction_mdm_correct = int((condition_correct_thing & condition_correct_property).sum())
pred_correct_proportion = prediction_mdm_correct / len(input_df)
print(pred_correct_proportion)

# %%
input_df[['thing', 'p_thing', 'property', 'p_property']]

# %%
# Disabled: same embedding analysis, but using Embedder_t5_decoder.
# def run(step):
#     checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
#     embedder = Embedder_t5_decoder(checkpoint_path)
#     embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
#     embedder.create_embedding()
#     embeddings = embedder.embeddings
#     return embeddings
#
# # %%
# embeddings = (run(step=1200))
#
# # Reducing dimensions with t-SNE
# tsne = TSNE(n_components=2, random_state=0, perplexity=5)
# embeddings_2d = tsne.fit_transform(embeddings)
#
# # t-sne plot with complete labels
# labels = input_df['labels']
#
# # Create a color map from labels to colors
# unique_labels = np.unique(labels)
# colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
# label_to_color = dict(zip(unique_labels, colors))
#
# # Plotting
# plt.figure(figsize=(8, 6))
# for label in unique_labels:
#     idx = (labels == label)
#     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
#
# plt.title('2D t-SNE Visualization of Embeddings of fine-grained labels')
# plt.xlabel('Component 1')
# plt.ylabel('Component 2')
# plt.legend(title='Group')
# plt.show()
#
# # %%
# # t-sne plot with pattern labels
# labels = input_df['pattern_labels']
#
# # Create a color map from labels to colors
# unique_labels = np.unique(labels)
# colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
# label_to_color = dict(zip(unique_labels, colors))
#
# # Plotting
# plt.figure(figsize=(8, 6))
# for label in unique_labels:
#     idx = (labels == label)
#     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
#
# plt.title('2D t-SNE Visualization of Embeddings of coarse-grained labels')
# plt.xlabel('Component 1')
# plt.ylabel('Component 2')
# plt.legend(title='Group')
# plt.show()

# %%