# Notebook-style script (# %% cells): embed TCA entity names and cluster them.
# %%
|
||
|
import json
|
||
|
import pandas as pd
|
||
|
from utils import Retriever, cosine_similarity_chunked
|
||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||
|
|
||
|
##########################################
|
||
|
# %%
|
||
|
|
||
|
# Load the TCA entities JSON and flatten it into a DataFrame.
# Observed shape: {"data": {<key>: {"entity_id": ..., "entity_name": ...}, ...}}
data_path = '../esAppMod/tca_entities.json'
with open(data_path, 'r', encoding='utf-8') as file:  # JSON is UTF-8 by spec
    data = json.load(file)

# One row per entity. The dict keys were never used by the original loop,
# so iterate over .values() directly and build the rows as a comprehension.
rows = [
    {"id": entity_data['entity_id'], "name": entity_data['entity_name']}
    for entity_data in data["data"].values()
]

# %%
# Create a DataFrame from the rows
df = pd.DataFrame(rows)
|
||
|
|
||
|
|
||
|
# %%
|
||
|
# df.to_csv('entity.csv', index=False)
|
||
|
|
||
|
|
||
|
# %%
|
||
|
# we want to automatically identify clusters
|
||
|
class Embedder():
    """Holds a DataFrame of entity names and produces embeddings for the
    'name' column via the project's Retriever helper."""

    # DataFrame with at least a 'name' column (see make_embedding).
    input_df: pd.DataFrame
    # NOTE(review): declared but never assigned or read in this file —
    # presumably set by external tooling; confirm before removing.
    fold: int

    def __init__(self, input_df):
        self.input_df = input_df

    def make_embedding(self, checkpoint_path):
        """Embed every value of input_df['name'] using the model checkpoint
        at *checkpoint_path* and return the embedding tensor on CPU.
        """
        # Prepare reference inputs: one string per row. The original built
        # this with an iterrows() append loop wrapped in a redundant list();
        # the column's tolist() is equivalent and order-preserving.
        train_data = self.input_df['name'].tolist()

        retriever_train = Retriever(train_data, checkpoint_path)
        retriever_train.make_embedding(batch_size=64)
        return retriever_train.embeddings.to('cpu')
|
||
|
|
||
|
# Baseline alternative: model_checkpoint = 'google-bert/bert-base-cased'
# Fine-tuned checkpoint used to produce the entity-name embeddings.
model_checkpoint = '../train/class_bert_simple/checkpoint/checkpoint-4500'

# Embed the 'name' column of df; result is a CPU tensor, one row per entity.
embedder = Embedder(input_df=df)
embeddings = embedder.make_embedding(model_checkpoint)
|
||
|
|
||
|
# %%
|
||
|
# Pairwise cosine similarity between all entity-name embeddings.
similarity_matrix = cosine_similarity(embeddings)

# %%
# Sanity check — expected to be square: (n_entities, n_entities).
similarity_matrix.shape
|
||
|
|
||
|
# %%
|
||
|
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering over precomputed cosine distances.
# NOTE(review): n_clusters is left at its default (2), so this always yields
# exactly two clusters — at odds with the stated goal of automatically
# identifying clusters (that would need n_clusters=None + distance_threshold).
# Behavior kept as-is; confirm intent.
clustering = AgglomerativeClustering(metric='precomputed', linkage='average')

# Convert similarity to distance: d = 1 - cosine_similarity.
distance_matrix = 1 - similarity_matrix
clustering.fit(distance_matrix)

print(clustering.labels_)  # cluster assignment per entity row
|
||
|
# %%
|