Feat: added embedding plot for coarse and fine-grained labels
This commit is contained in:
parent
c64e4bccfc
commit
481bcf88b7
|
@ -45,6 +45,27 @@ def generate_labels(df, mdm_list):
|
||||||
|
|
||||||
df['labels'] = generate_labels(df, mdm_list)
|
df['labels'] = generate_labels(df, mdm_list)
|
||||||
|
|
||||||
|
# pattern labels
|
||||||
|
patterns = df['pattern'].to_list()
|
||||||
|
mdm_pattern_list = sorted(list(set(patterns)))
|
||||||
|
|
||||||
|
def generate_pattern_labels(df, mdm_pattern_list):
    """Map each row's 'pattern' value to its position in ``mdm_pattern_list``.

    Arguments:
    - df: dataframe with a 'pattern' column
    - mdm_pattern_list: sequence of known pattern strings

    Returns a list with one integer per row of ``df``: the index of the first
    entry of ``mdm_pattern_list`` equal to the stringified pattern, or -1
    (after printing an error line) when the pattern is not in the list.
    """
    # Build the lookup table once instead of scanning the list for every row.
    # setdefault keeps the first occurrence, matching list.index(); only str
    # entries can ever equal the stringified pattern, so others are skipped.
    first_position = {}
    for position, candidate in enumerate(mdm_pattern_list):
        if isinstance(candidate, str):
            first_position.setdefault(candidate, position)

    output_list = []
    # Iterate the column directly: iterrows() builds a Series per row and may
    # upcast the value's dtype before it is turned into a string.
    for value in df['pattern']:
        index = first_position.get(f"{value}", -1)
        if index == -1:
            print("Error: value not found in MDM list")
        output_list.append(index)

    return output_list
|
||||||
|
|
||||||
|
df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# rank labels by counts
|
# rank labels by counts
|
||||||
top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
|
top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
|
||||||
|
|
||||||
|
@ -52,8 +73,6 @@ indices = df[df['labels'].isin(top_10_labels)].index.to_list()
|
||||||
|
|
||||||
input_df = df.iloc[indices].reset_index(drop=True)
|
input_df = df.iloc[indices].reset_index(drop=True)
|
||||||
|
|
||||||
# %%
|
|
||||||
input_df
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
def run(step):
|
def run(step):
|
||||||
|
@ -66,12 +85,13 @@ def run(step):
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
embeddings = (run(step=1200))
|
embeddings = (run(step=1200))
|
||||||
labels = input_df['labels']
|
|
||||||
|
|
||||||
# Reducing dimensions with t-SNE
|
# Reducing dimensions with t-SNE
|
||||||
tsne = TSNE(n_components=2, random_state=0, perplexity=5)
|
tsne = TSNE(n_components=2, random_state=0, perplexity=5)
|
||||||
embeddings_2d = tsne.fit_transform(embeddings)
|
embeddings_2d = tsne.fit_transform(embeddings)
|
||||||
|
|
||||||
|
# t-sne plot with complete labels
|
||||||
|
labels = input_df['labels']
|
||||||
|
|
||||||
# Create a color map from labels to colors
|
# Create a color map from labels to colors
|
||||||
unique_labels = np.unique(labels)
|
unique_labels = np.unique(labels)
|
||||||
colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
||||||
|
@ -83,10 +103,33 @@ for label in unique_labels:
|
||||||
idx = (labels == label)
|
idx = (labels == label)
|
||||||
plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
||||||
|
|
||||||
plt.title('2D t-SNE Visualization of Embeddings')
|
plt.title('2D t-SNE Visualization of Embeddings of fine-grained labels')
|
||||||
plt.xlabel('Component 1')
|
plt.xlabel('Component 1')
|
||||||
plt.ylabel('Component 2')
|
plt.ylabel('Component 2')
|
||||||
plt.legend(title='Group')
|
plt.legend(title='Group')
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# t-sne plot with pattern labels
|
||||||
|
labels = input_df['pattern_labels']
|
||||||
|
|
||||||
|
# Create a color map from labels to colors
|
||||||
|
unique_labels = np.unique(labels)
|
||||||
|
colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
||||||
|
label_to_color = dict(zip(unique_labels, colors))
|
||||||
|
|
||||||
|
# Plotting
|
||||||
|
plt.figure(figsize=(8, 6))
|
||||||
|
for label in unique_labels:
|
||||||
|
idx = (labels == label)
|
||||||
|
plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
||||||
|
|
||||||
|
plt.title('2D t-SNE Visualization of Embeddings of coarse-grained labels')
|
||||||
|
plt.xlabel('Component 1')
|
||||||
|
plt.ylabel('Component 2')
|
||||||
|
plt.legend(title='Group')
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
|
|
@ -45,6 +45,26 @@ def generate_labels(df, mdm_list):
|
||||||
|
|
||||||
df['labels'] = generate_labels(df, mdm_list)
|
df['labels'] = generate_labels(df, mdm_list)
|
||||||
|
|
||||||
|
# pattern labels
|
||||||
|
patterns = df['pattern'].to_list()
|
||||||
|
mdm_pattern_list = sorted(list(set(patterns)))
|
||||||
|
|
||||||
|
def generate_pattern_labels(df, mdm_pattern_list):
    """Map each row's 'pattern' value to its position in ``mdm_pattern_list``.

    Arguments:
    - df: dataframe with a 'pattern' column
    - mdm_pattern_list: sequence of known pattern strings

    Returns a list with one integer per row of ``df``: the index of the first
    entry of ``mdm_pattern_list`` equal to the stringified pattern, or -1
    (after printing an error line) when the pattern is not in the list.
    """
    # Build the lookup table once instead of scanning the list for every row.
    # setdefault keeps the first occurrence, matching list.index(); only str
    # entries can ever equal the stringified pattern, so others are skipped.
    first_position = {}
    for position, candidate in enumerate(mdm_pattern_list):
        if isinstance(candidate, str):
            first_position.setdefault(candidate, position)

    output_list = []
    # Iterate the column directly: iterrows() builds a Series per row and may
    # upcast the value's dtype before it is turned into a string.
    for value in df['pattern']:
        index = first_position.get(f"{value}", -1)
        if index == -1:
            print("Error: value not found in MDM list")
        output_list.append(index)

    return output_list
|
||||||
|
|
||||||
|
df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)
|
||||||
|
|
||||||
|
|
||||||
# rank labels by counts
|
# rank labels by counts
|
||||||
top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
|
top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
|
||||||
|
|
||||||
|
@ -52,8 +72,6 @@ indices = df[df['labels'].isin(top_10_labels)].index.to_list()
|
||||||
|
|
||||||
input_df = df.iloc[indices].reset_index(drop=True)
|
input_df = df.iloc[indices].reset_index(drop=True)
|
||||||
|
|
||||||
# %%
|
|
||||||
input_df
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
def run(step):
|
def run(step):
|
||||||
|
@ -66,12 +84,13 @@ def run(step):
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
embeddings = (run(step=1200))
|
embeddings = (run(step=1200))
|
||||||
labels = input_df['labels']
|
|
||||||
|
|
||||||
# Reducing dimensions with t-SNE
|
# Reducing dimensions with t-SNE
|
||||||
tsne = TSNE(n_components=2, random_state=0, perplexity=5)
|
tsne = TSNE(n_components=2, random_state=0, perplexity=5)
|
||||||
embeddings_2d = tsne.fit_transform(embeddings)
|
embeddings_2d = tsne.fit_transform(embeddings)
|
||||||
|
|
||||||
|
# t-sne plot with complete labels
|
||||||
|
labels = input_df['labels']
|
||||||
|
|
||||||
# Create a color map from labels to colors
|
# Create a color map from labels to colors
|
||||||
unique_labels = np.unique(labels)
|
unique_labels = np.unique(labels)
|
||||||
colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
||||||
|
@ -83,7 +102,28 @@ for label in unique_labels:
|
||||||
idx = (labels == label)
|
idx = (labels == label)
|
||||||
plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
||||||
|
|
||||||
plt.title('2D t-SNE Visualization of Embeddings')
|
plt.title('2D t-SNE Visualization of Embeddings of fine-grained labels')
|
||||||
|
plt.xlabel('Component 1')
|
||||||
|
plt.ylabel('Component 2')
|
||||||
|
plt.legend(title='Group')
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# t-sne plot with pattern labels
|
||||||
|
labels = input_df['pattern_labels']
|
||||||
|
|
||||||
|
# Create a color map from labels to colors
|
||||||
|
unique_labels = np.unique(labels)
|
||||||
|
colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
||||||
|
label_to_color = dict(zip(unique_labels, colors))
|
||||||
|
|
||||||
|
# Plotting
|
||||||
|
plt.figure(figsize=(8, 6))
|
||||||
|
for label in unique_labels:
|
||||||
|
idx = (labels == label)
|
||||||
|
plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
||||||
|
|
||||||
|
plt.title('2D t-SNE Visualization of Embeddings of coarse-grained labels')
|
||||||
plt.xlabel('Component 1')
|
plt.xlabel('Component 1')
|
||||||
plt.ylabel('Component 2')
|
plt.ylabel('Component 2')
|
||||||
plt.legend(title='Group')
|
plt.legend(title='Group')
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
# %%
|
# %%
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import os
|
import os
|
||||||
from inference import Embedder_t5
|
from inference import Inference, Embedder_t5_encoder, Embedder_t5_decoder
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.manifold import TSNE
|
from sklearn.manifold import TSNE
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
@ -19,6 +19,7 @@ data_path = f"../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
|
||||||
df = pd.read_csv(data_path, skipinitialspace=True)
|
df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
df = df[df['MDM']].reset_index(drop=True)
|
df = df[df['MDM']].reset_index(drop=True)
|
||||||
|
|
||||||
|
# %%
|
||||||
# get target data
|
# get target data
|
||||||
data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
|
data_path = f"../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
|
||||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||||
|
@ -45,6 +46,25 @@ def generate_labels(df, mdm_list):
|
||||||
|
|
||||||
df['labels'] = generate_labels(df, mdm_list)
|
df['labels'] = generate_labels(df, mdm_list)
|
||||||
|
|
||||||
|
# pattern labels
|
||||||
|
patterns = df['pattern'].to_list()
|
||||||
|
mdm_pattern_list = sorted(list(set(patterns)))
|
||||||
|
|
||||||
|
def generate_pattern_labels(df, mdm_pattern_list):
    """Map each row's 'pattern' value to its position in ``mdm_pattern_list``.

    Arguments:
    - df: dataframe with a 'pattern' column
    - mdm_pattern_list: sequence of known pattern strings

    Returns a list with one integer per row of ``df``: the index of the first
    entry of ``mdm_pattern_list`` equal to the stringified pattern, or -1
    (after printing an error line) when the pattern is not in the list.
    """
    # Build the lookup table once instead of scanning the list for every row.
    # setdefault keeps the first occurrence, matching list.index(); only str
    # entries can ever equal the stringified pattern, so others are skipped.
    first_position = {}
    for position, candidate in enumerate(mdm_pattern_list):
        if isinstance(candidate, str):
            first_position.setdefault(candidate, position)

    output_list = []
    # Iterate the column directly: iterrows() builds a Series per row and may
    # upcast the value's dtype before it is turned into a string.
    for value in df['pattern']:
        index = first_position.get(f"{value}", -1)
        if index == -1:
            print("Error: value not found in MDM list")
        output_list.append(index)

    return output_list
|
||||||
|
|
||||||
|
df['pattern_labels'] = generate_pattern_labels(df, mdm_pattern_list)
|
||||||
|
|
||||||
# rank labels by counts
|
# rank labels by counts
|
||||||
top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
|
top_10_labels = df['labels'].value_counts()[0:10].index.to_list()
|
||||||
|
|
||||||
|
@ -52,10 +72,11 @@ indices = df[df['labels'].isin(top_10_labels)].index.to_list()
|
||||||
|
|
||||||
input_df = df.iloc[indices].reset_index(drop=True)
|
input_df = df.iloc[indices].reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
def run(step):
|
def run(step):
|
||||||
checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
|
checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
|
||||||
embedder = Embedder_t5(checkpoint_path)
|
embedder = Embedder_t5_encoder(checkpoint_path)
|
||||||
embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
|
embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
|
||||||
embedder.create_embedding()
|
embedder.create_embedding()
|
||||||
embeddings = embedder.embeddings
|
embeddings = embedder.embeddings
|
||||||
|
@ -63,12 +84,13 @@ def run(step):
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
embeddings = (run(step=1200))
|
embeddings = (run(step=1200))
|
||||||
labels = input_df['labels']
|
|
||||||
|
|
||||||
# Reducing dimensions with t-SNE
|
# Reducing dimensions with t-SNE
|
||||||
tsne = TSNE(n_components=2, random_state=0, perplexity=5)
|
tsne = TSNE(n_components=2, random_state=0, perplexity=5)
|
||||||
embeddings_2d = tsne.fit_transform(embeddings)
|
embeddings_2d = tsne.fit_transform(embeddings)
|
||||||
|
|
||||||
|
# t-sne plot with complete labels
|
||||||
|
labels = input_df['labels']
|
||||||
|
|
||||||
# Create a color map from labels to colors
|
# Create a color map from labels to colors
|
||||||
unique_labels = np.unique(labels)
|
unique_labels = np.unique(labels)
|
||||||
colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
||||||
|
@ -80,10 +102,118 @@ for label in unique_labels:
|
||||||
idx = (labels == label)
|
idx = (labels == label)
|
||||||
plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
||||||
|
|
||||||
plt.title('2D t-SNE Visualization of Embeddings')
|
plt.title('2D t-SNE Visualization of Embeddings of fine-grained labels')
|
||||||
plt.xlabel('Component 1')
|
plt.xlabel('Component 1')
|
||||||
plt.ylabel('Component 2')
|
plt.ylabel('Component 2')
|
||||||
plt.legend(title='Group')
|
plt.legend(title='Group')
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# t-sne plot with pattern labels
|
||||||
|
labels = input_df['pattern_labels']
|
||||||
|
|
||||||
|
# Create a color map from labels to colors
|
||||||
|
unique_labels = np.unique(labels)
|
||||||
|
colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
||||||
|
label_to_color = dict(zip(unique_labels, colors))
|
||||||
|
|
||||||
|
# Plotting
|
||||||
|
plt.figure(figsize=(8, 6))
|
||||||
|
for label in unique_labels:
|
||||||
|
idx = (labels == label)
|
||||||
|
plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
||||||
|
|
||||||
|
plt.title('2D t-SNE Visualization of Embeddings of coarse-grained labels')
|
||||||
|
plt.xlabel('Component 1')
|
||||||
|
plt.ylabel('Component 2')
|
||||||
|
plt.legend(title='Group')
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
##############################################
|
||||||
|
# %%
|
||||||
|
# demonstrate decoding to correct output
step = 1200
checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
infer = Inference(checkpoint_path)
infer.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
thing_prediction_list, property_prediction_list = infer.generate()

# Collect the decoded predictions in their own frame
df_out = pd.DataFrame({
    'p_thing': thing_prediction_list,
    'p_property': property_prediction_list
})

# Drop prediction columns left over from an earlier run of this cell before
# attaching the new ones; otherwise the frame ends up with duplicate
# 'p_thing'/'p_property' columns and the comparisons below stop being
# column-vs-column.
input_df = pd.concat(
    [input_df.drop(columns=df_out.columns, errors='ignore'), df_out],
    axis=1,
)

# A row counts as correct only when both thing and property match
condition_correct_thing = input_df['p_thing'] == input_df['thing']
condition_correct_property = input_df['p_property'] == input_df['property']
prediction_mdm_correct = int((condition_correct_thing & condition_correct_property).sum())
# Guard the division so an empty selection reports nan instead of raising
pred_correct_proportion = (
    prediction_mdm_correct / len(input_df) if len(input_df) else float('nan')
)
print(pred_correct_proportion)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
input_df[['thing', 'p_thing', 'property', 'p_property']]
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# def run(step):
|
||||||
|
# checkpoint_path = os.path.join(checkpoint_directory, f'checkpoint_{step}')
|
||||||
|
# embedder = Embedder_t5_decoder(checkpoint_path)
|
||||||
|
# embedder.prepare_dataloader(input_df, batch_size=BATCH_SIZE, max_length=128)
|
||||||
|
# embedder.create_embedding()
|
||||||
|
# embeddings = embedder.embeddings
|
||||||
|
# return embeddings
|
||||||
|
#
|
||||||
|
# # %%
|
||||||
|
# embeddings = (run(step=1200))
|
||||||
|
# # Reducing dimensions with t-SNE
|
||||||
|
# tsne = TSNE(n_components=2, random_state=0, perplexity=5)
|
||||||
|
# embeddings_2d = tsne.fit_transform(embeddings)
|
||||||
|
#
|
||||||
|
# # t-sne plot with complete labels
|
||||||
|
# labels = input_df['labels']
|
||||||
|
#
|
||||||
|
# # Create a color map from labels to colors
|
||||||
|
# unique_labels = np.unique(labels)
|
||||||
|
# colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
||||||
|
# label_to_color = dict(zip(unique_labels, colors))
|
||||||
|
#
|
||||||
|
# # Plotting
|
||||||
|
# plt.figure(figsize=(8, 6))
|
||||||
|
# for label in unique_labels:
|
||||||
|
# idx = (labels == label)
|
||||||
|
# plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
||||||
|
#
|
||||||
|
# plt.title('2D t-SNE Visualization of Embeddings of fine-grained labels')
|
||||||
|
# plt.xlabel('Component 1')
|
||||||
|
# plt.ylabel('Component 2')
|
||||||
|
# plt.legend(title='Group')
|
||||||
|
# plt.show()
|
||||||
|
#
|
||||||
|
# # %%
|
||||||
|
# # t-sne plot with pattern labels
|
||||||
|
# labels = input_df['pattern_labels']
|
||||||
|
#
|
||||||
|
# # Create a color map from labels to colors
|
||||||
|
# unique_labels = np.unique(labels)
|
||||||
|
# colors = plt.cm.jet(np.linspace(0, 1, len(unique_labels)))
|
||||||
|
# label_to_color = dict(zip(unique_labels, colors))
|
||||||
|
#
|
||||||
|
# # Plotting
|
||||||
|
# plt.figure(figsize=(8, 6))
|
||||||
|
# for label in unique_labels:
|
||||||
|
# idx = (labels == label)
|
||||||
|
# plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], color=label_to_color[label], label=label, alpha=0.7)
|
||||||
|
#
|
||||||
|
# plt.title('2D t-SNE Visualization of Embeddings of coarse-grained labels')
|
||||||
|
# plt.xlabel('Component 1')
|
||||||
|
# plt.ylabel('Component 2')
|
||||||
|
# plt.legend(title='Group')
|
||||||
|
# plt.show()
|
||||||
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
|
|
@ -14,6 +14,7 @@ import numpy as np
|
||||||
|
|
||||||
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
||||||
|
|
||||||
|
torch.set_float32_matmul_precision('high')
|
||||||
|
|
||||||
class Inference():
|
class Inference():
|
||||||
tokenizer: T5TokenizerFast
|
tokenizer: T5TokenizerFast
|
||||||
|
@ -169,8 +170,136 @@ class Inference():
|
||||||
thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
|
thing_prediction_list, property_prediction_list = decode_preds(pred_generations)
|
||||||
return thing_prediction_list, property_prediction_list
|
return thing_prediction_list, property_prediction_list
|
||||||
|
|
||||||
|
class Embedder_t5_decoder():
    """Extract one decoder-side embedding per input row from a T5 checkpoint.

    Typical use: construct with a checkpoint path, call ``prepare_dataloader``
    on a dataframe, then ``create_embedding``; the result is stored in
    ``self.embeddings``.
    """
    tokenizer: T5TokenizerFast
    model: torch.nn.Module
    dataloader: DataLoader
    # Starts as an empty list; create_embedding() replaces it with a single
    # tensor holding the concatenated per-batch results.
    embeddings: list

    def __init__(self, checkpoint_path):
        """Build the tokenizer and load the model stored at ``checkpoint_path``."""
        self._create_tokenizer()
        self._load_model(checkpoint_path)
        self.embeddings = []

    def _create_tokenizer(self):
        """Load the t5-small tokenizer and register the project's special tokens."""
        self.tokenizer = T5TokenizerFast.from_pretrained("t5-small", return_tensors="pt", clean_up_tokenization_spaces=True)
        # NOTE(review): "SIG", "UNIT" and "DATA_TYPE" are registered without
        # angle brackets while prepare_dataloader() emits "<UNIT>" -- left
        # untouched because the list presumably has to match the one used when
        # the checkpoint was trained; confirm against the training script.
        additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "SIG", "UNIT", "DATA_TYPE"]
        self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    def _load_model(self, checkpoint_path: str):
        """Load the seq2seq model from ``checkpoint_path``, compile it and set eval mode."""
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
        model = torch.compile(model)
        self.model = model.eval()

    def prepare_dataloader(self, input_df, batch_size, max_length):
        """
        Tokenize ``input_df`` and store a DataLoader over it in ``self.dataloader``.

        *arguments*
        - input_df: input dataframe containing fields 'tag_description', 'unit', 'thing', 'property'
        - batch_size: the batch size of dataloader output
        - max_length: length of tokenizer output
        """
        print("preparing dataloader")

        def _process_df(df):
            # convert each dataframe row into an {'input', 'output'} dictionary
            records = []
            for _, row in df.iterrows():
                desc = f"<DESC>{row['tag_description']}<DESC>"
                unit = f"<UNIT>{row['unit']}<UNIT>"
                records.append({
                    'input' : f"{desc}{unit}",
                    'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
                })
            return records

        def _preprocess_function(example):
            # text_target makes the tokenizer emit 'labels' alongside the
            # inputs, so no separate label tokenization is needed
            source_text = example['input']
            target_text = example['output']
            return self.tokenizer(
                source_text,
                text_target=target_text,
                max_length=max_length,
                return_tensors="pt",
                padding='max_length',
                truncation=True,
            )

        test_dataset = Dataset.from_list(_process_df(input_df))

        # map applies the function to batches of rows and drops the raw text
        # columns, leaving only the tokenizer output
        datasets = test_dataset.map(
            _preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=test_dataset.column_names,
        )
        datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

        self.dataloader = DataLoader(datasets, batch_size=batch_size)

    def create_embedding(self):
        """Run the model over ``self.dataloader`` and store the embeddings.

        For every input the decoder is fed a single start token and the last
        decoder layer's hidden state at that position is taken as the
        embedding. The per-batch results are concatenated along dim 0 and
        assigned to ``self.embeddings``.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Moving the model is loop-invariant; do it once up front
        self.model.to(device)
        start_token_id = self.model.config.decoder_start_token_id

        # Accumulate locally so the method can be called again after
        # self.embeddings has already been replaced by a tensor
        collected = []

        for batch in tqdm(self.dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # One decoder start token per sequence: shape (batch, 1)
            decoder_input_ids = torch.full(
                (input_ids.size(0), 1),
                start_token_id,
                dtype=torch.long,
                device=input_ids.device,
            )

            with torch.no_grad():
                # Inputs are padded to max_length, so the attention mask is
                # required to keep the model from attending to padding
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    decoder_input_ids=decoder_input_ids,
                    output_hidden_states=True,
                )
                # Last decoder layer, last (only) decoding position
                first_step_hidden = outputs.decoder_hidden_states[-1][:, -1, :]
                collected.append(first_step_hidden.to('cpu'))

        self.embeddings = torch.cat(collected, dim=0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Embedder_t5_encoder():
|
||||||
tokenizer: T5TokenizerFast
|
tokenizer: T5TokenizerFast
|
||||||
model: torch.nn.Module
|
model: torch.nn.Module
|
||||||
dataloader: DataLoader
|
dataloader: DataLoader
|
||||||
|
|
Loading…
Reference in New Issue