# domain_mapping/analysis/entity_hierarchy.py
#
# 96 lines
# 2.2 KiB
# Python

# %%
import json
import pandas as pd
##########################################
# %%
# Load the JSON file
data_path = '../esAppMod/tca_entities.json'
# JSON text is UTF-8; state it explicitly instead of relying on the
# platform's locale default encoding.
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)
# %%
# Build one row per entity found under data["data"]. Only the values of that
# mapping are used; its keys are ignored. The list is rebuilt from scratch
# here so that re-running this cell cannot append duplicate rows.
rows = [
    {
        'id': entity_data['entity_id'],
        'name': entity_data['entity_name'],
        'type_id': entity_data['entity_type_id'],
        'type_name': entity_data['entity_type_name'],
    }
    for entity_data in data["data"].values()
]
# Create a DataFrame from the rows. Naming the columns explicitly keeps them
# present (and in this order) even when there are no entities at all.
df = pd.DataFrame(rows, columns=['id', 'name', 'type_id', 'type_name'])
# %%
# Optional export of the entity table (left disabled):
# df.to_csv('entity.csv', index=False)
# Display the full entity table
df
# %%
# Number of entities per type name
df["type_name"].value_counts()
# %%
# Number of entities per type id
df["type_id"].value_counts()
# %%
# Plain Python list of the entity names, in DataFrame row order
name_list = df["name"].tolist()
# %%
# Display the names
name_list
# %%
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import numpy as np
# %%
# Define labels
# `labels` is bound to the same list object as `name_list` (no copy is made).
# It supplies the strings compared by the pairwise distance computation, the
# dendrogram leaf labels, and the names printed next to each cluster id.
labels = name_list
# Create a prefix-based distance matrix
def prefix_distance(label1: str, label2: str) -> float:
    """Return a distance based on the shared leading words of two labels.

    Both labels are split on whitespace and compared word by word from the
    start. Counting stops at the first position where the words differ, so
    only a true common prefix contributes (matching words after a mismatch
    are ignored).

    Args:
        label1: First label.
        label2: Second label.

    Returns:
        ``1 / (k + 1)`` where ``k`` is the number of leading words the labels
        share: 1.0 when they share none, approaching 0 as the shared prefix
        grows. Identical labels of ``k`` words therefore yield
        ``1 / (k + 1)``, not 0.
    """
    common_prefix_length = 0
    for word1, word2 in zip(label1.split(), label2.split()):
        if word1 != word2:
            # The prefix ends at the first mismatch; later coincidental
            # matches must not be counted.
            break
        common_prefix_length += 1
    # Distance is inversely proportional to common prefix length
    return 1.0 / (common_prefix_length + 1)
# Create a pairwise distance matrix
from itertools import combinations
from scipy.spatial.distance import squareform

n = len(labels)
# Symmetric matrix with a zero diagonal: each unordered pair is evaluated
# once and mirrored, and a label is at distance 0 from itself.
distance_matrix = np.zeros((n, n))
for (i, label_i), (j, label_j) in combinations(enumerate(labels), 2):
    distance_matrix[i, j] = distance_matrix[j, i] = prefix_distance(label_i, label_j)
# Perform hierarchical clustering
# `linkage` interprets a 2-D array as raw observation vectors, NOT as a
# distance matrix, so the precomputed distances must be passed in condensed
# (flattened upper-triangle) form. `squareform` performs that conversion and
# checks that the matrix is symmetric with a zero diagonal.
condensed_distances = squareform(distance_matrix)
linkage_matrix = linkage(condensed_distances, method='average')
# Visualize as a dendrogram
import matplotlib.pyplot as plt
dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2)
plt.title("Prefix-Based Clustering")
plt.show()
# %%
# Display the linkage matrix returned by the clustering step
linkage_matrix
# %%
# Cut the hierarchy into flat clusters using a distance cutoff
threshold = 0.5
clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')
# Print every label next to the cluster id it was assigned
for label, cluster_id in zip(labels, clusters):
    print(f"Label: {label}, Cluster ID: {cluster_id}")
# %%