# domain_mapping/analysis/entity_hierarchy.py

# %%
import json
import pandas as pd

##########################################
# %%

# Load the JSON file.
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly:
# without it open() falls back to the platform's locale encoding and any
# non-ASCII text in the file can be mis-decoded (or fail to decode) on
# systems whose default is not UTF-8.
# NOTE: the path is relative, so it is resolved against the current working
# directory of the running kernel/script.
data_path = '../esAppMod/tca_entities.json'
with open(data_path, encoding='utf-8') as file:
    data = json.load(file)

# Initialize an empty list to store the rows
rows = []

# %%
# Build one row per entity found under data["data"].
# The list is rebuilt from scratch in this cell (instead of appending to a
# list created in an earlier cell) so that re-running the cell cannot
# duplicate rows. Only the mapping's values are iterated because its keys
# are never used.
rows = [
    {
        'id': entity_data['entity_id'],
        'name': entity_data['entity_name'],
        'type_id': entity_data['entity_type_id'],
        'type_name': entity_data['entity_type_name'],
    }
    for entity_data in data["data"].values()
]

# Create a DataFrame from the rows. The explicit column list keeps the four
# expected columns present even when there are no entities, so the later
# df['name'] / df['type_name'] lookups do not fail on an empty input.
df = pd.DataFrame(rows, columns=['id', 'name', 'type_id', 'type_name'])

# %%
# Optional export, left disabled:
# df.to_csv('entity.csv', index=False)
df

# %%
# How many entities fall under each type name
df['type_name'].value_counts()
# %%
# The same tally, keyed by type id
df['type_id'].value_counts()

# %%
# Pull the entity names out of the frame as a plain Python list
name_list = df['name'].tolist()
# %%
name_list

# %%
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import numpy as np

# %%
# Define labels: the entity names to be clustered.
# Note this binds a second name to the same list object as name_list; it is
# an alias, not a copy.
labels = name_list

# Prefix-based distance between two labels
def prefix_distance(label1: str, label2: str) -> float:
    """Return a distance that shrinks as two labels share a longer word prefix.

    Both labels are split on whitespace and compared word by word from the
    start. Counting stops at the first differing word, so only the leading
    run of equal words contributes; words that happen to match at the same
    position after a mismatch are ignored.

    Args:
        label1: First label.
        label2: Second label.

    Returns:
        ``1.0 / (k + 1)`` where ``k`` is the number of leading words the two
        labels have in common. The result lies in ``(0, 1]``: ``1.0`` when
        the first words differ (or either label has no words). Identical
        labels do not yield ``0``; they yield ``1 / (word_count + 1)``.
    """
    common_prefix_length = 0
    for word1, word2 in zip(label1.split(), label2.split()):
        if word1 != word2:
            # First mismatch ends the common prefix; later matches don't count.
            break
        common_prefix_length += 1
    # Distance is inversely proportional to common prefix length
    return 1.0 / (common_prefix_length + 1)

# Create a pairwise distance matrix.
# prefix_distance compares the two labels position by position, so it is
# symmetric: each unordered pair is evaluated once and mirrored. The diagonal
# is left at 0 (a label is at distance 0 from itself), which is also what
# squareform() requires of a valid distance matrix.
n = len(labels)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        pair_distance = prefix_distance(labels[i], labels[j])
        distance_matrix[i, j] = pair_distance
        distance_matrix[j, i] = pair_distance

# Perform hierarchical clustering.
# linkage() treats a 2-D array as a set of observation vectors and would
# compute Euclidean distances between the matrix rows. To cluster on the
# prefix distances themselves, it must be given the condensed (1-D,
# upper-triangle) form of the matrix.
from scipy.spatial.distance import squareform
linkage_matrix = linkage(squareform(distance_matrix), method='average')

# Visualize as a dendrogram
import matplotlib.pyplot as plt
dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2)
plt.title("Prefix-Based Clustering")
plt.show()

# %%
linkage_matrix
# %%
# Cut the hierarchy into flat clusters using a fixed distance cutoff
threshold = 0.5
clusters = fcluster(linkage_matrix, threshold, criterion='distance')

# Report the cluster assigned to each label
for label, cluster_id in zip(labels, clusters):
    print(f"Label: {label}, Cluster ID: {cluster_id}")

# %%