96 lines
2.2 KiB
Python
96 lines
2.2 KiB
Python
# %%
|
|
import json
|
|
import pandas as pd
|
|
|
|
##########################################
|
|
# %%
|
|
|
|
# Load the entity catalogue from disk.
# The encoding is given explicitly: JSON interchange files are UTF-8, and
# relying on the platform default can mis-decode non-ASCII entity names.
data_path = '../esAppMod/tca_entities.json'
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Accumulator for the per-entity rows that are later turned into a DataFrame
rows = []
|
|
|
|
# %%
|
|
# Flatten every entity record under data["data"] into one row per entity.
# Only the record values are needed, so the mapping keys are not iterated.
# `rows` is rebound here (not appended to) so that re-running this cell does
# not duplicate the rows gathered by a previous run.
rows = [
    {
        'id': entity_data['entity_id'],
        'name': entity_data['entity_name'],
        'type_id': entity_data['entity_type_id'],
        'type_name': entity_data['entity_type_name'],
    }
    for entity_data in data["data"].values()
]

# Create a DataFrame from the rows; naming the columns explicitly keeps the
# expected schema even when there are no entities at all.
df = pd.DataFrame(rows, columns=['id', 'name', 'type_id', 'type_name'])
|
|
|
|
# %%
|
|
# Optional export of the entity table (left disabled):
# df.to_csv('entity.csv', index=False)
df

# %%
# How many entities fall under each entity type name
df["type_name"].value_counts()

# %%
# The same breakdown keyed by entity type id
df["type_id"].value_counts()

# %%
# Plain Python list of every entity name, in DataFrame row order
name_list = df["name"].tolist()

# %%
name_list
|
|
|
|
# %%
|
|
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
|
|
import numpy as np
|
|
|
|
# %%
|
|
# The entity names are the items being clustered; `labels` is an alias of
# `name_list` (the same list object, not a copy).
labels = name_list
|
|
|
|
# Create a prefix-based distance matrix
|
|
def prefix_distance(label1, label2):
    """Return a distance in (0, 1] based on the shared leading words.

    Both labels are split on whitespace and compared word by word from the
    start. Counting stops at the first mismatch, so only a true common
    prefix contributes; words that merely coincide at the same position
    later in the labels are ignored.

    Args:
        label1: First label (whitespace-separated words).
        label2: Second label (whitespace-separated words).

    Returns:
        ``1.0 / (k + 1)`` where ``k`` is the number of leading words the two
        labels share. Labels with no common first word are at distance 1.0.
    """
    common_prefix_length = 0
    for word1, word2 in zip(label1.split(), label2.split()):
        if word1 != word2:
            # The prefix ends at the first differing word.
            break
        common_prefix_length += 1
    # Distance is inversely proportional to common prefix length
    return 1.0 / (common_prefix_length + 1)
|
|
|
|
# Build the pairwise distance matrix, cluster it hierarchically, and plot it.
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

# Create a pairwise distance matrix.
# prefix_distance is symmetric, so each unordered pair is evaluated once and
# mirrored. The diagonal is left at 0: an item is at distance 0 from itself,
# which is also what squareform() requires of a valid distance matrix.
n = len(labels)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        pair_distance = prefix_distance(labels[i], labels[j])
        distance_matrix[i, j] = pair_distance
        distance_matrix[j, i] = pair_distance

# Perform hierarchical clustering.
# linkage() interprets a 2-D array as raw observation vectors, not as
# distances, so the square matrix must first be converted to the condensed
# (1-D upper-triangle) form for the precomputed distances to be used.
condensed_distances = squareform(distance_matrix)
linkage_matrix = linkage(condensed_distances, method='average')

# Visualize as a dendrogram
dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2)
plt.title("Prefix-Based Clustering")
plt.show()
|
|
|
|
# %%
|
|
linkage_matrix
|
|
# %%
|
|
# Cut the hierarchy into flat clusters: items whose merge distance is at most
# `threshold` end up sharing a cluster id.
threshold = 0.5
clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')

# Report the cluster assigned to each label, in label order
for label, cluster_id in zip(labels, clusters):
    print(f"Label: {label}, Cluster ID: {cluster_id}")
|
|
|
|
# %%
|