# %%
import json
import pandas as pd

##########################################
# %%
# Load the JSON file
data_path = '../esAppMod/tca_entities.json'
# JSON text is UTF-8 by specification, so do not rely on the platform default.
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# %%
# Build one row per entity stored under the top-level "data" mapping.
# Only the mapping's values are read; its keys are not used.
# The list is built in a single expression so that re-running this cell
# never appends duplicate rows to a list created in another cell.
rows = [
    {
        'id': entity_data['entity_id'],
        'name': entity_data['entity_name'],
        'type_id': entity_data['entity_type_id'],
        'type_name': entity_data['entity_type_name'],
    }
    for entity_data in data["data"].values()
]

# Create a DataFrame from the rows
df = pd.DataFrame(rows)

# %%
# df.to_csv('entity.csv', index=False)
df

# %%
df['type_name'].value_counts()

# %%
df['type_id'].value_counts()

# %%
name_list = df['name'].to_list()

# %%
name_list

# %%
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import squareform
import numpy as np
import matplotlib.pyplot as plt

# %%
# Define labels
labels = name_list


def prefix_distance(label1: str, label2: str) -> float:
    """Return a distance based on the number of shared leading words.

    Both labels are split on whitespace and compared word by word from the
    start; counting stops at the first position where the words differ.

    Args:
        label1: First label.
        label2: Second label.

    Returns:
        ``1.0 / (k + 1)`` where ``k`` is the number of identical leading
        words: 1.0 when the labels share no leading word, 0.5 for one
        shared word, and so on.
    """
    common_prefix_length = 0
    for word1, word2 in zip(label1.split(), label2.split()):
        if word1 != word2:
            # A prefix ends at the first mismatch; later matches do not count.
            break
        common_prefix_length += 1

    # Distance is inversely proportional to common prefix length
    return 1.0 / (common_prefix_length + 1)


# Create a pairwise distance matrix.
# prefix_distance is symmetric, so only the upper triangle is computed and
# then mirrored. The diagonal stays 0: an item is at distance 0 from itself.
n = len(labels)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        pair_distance = prefix_distance(labels[i], labels[j])
        distance_matrix[i, j] = pair_distance
        distance_matrix[j, i] = pair_distance

# linkage() treats a 2-D array as observation vectors, not as distances.
# Convert the square matrix to the condensed (upper-triangle) form so the
# precomputed prefix distances are what actually gets clustered.
condensed_distances = squareform(distance_matrix)

# Perform hierarchical clustering
linkage_matrix = linkage(condensed_distances, method='average')

# Visualize as a dendrogram
dendrogram(linkage_matrix, labels=labels, leaf_rotation=90, leaf_font_size=2)
plt.title("Prefix-Based Clustering")
plt.show()

# %%
linkage_matrix

# %%
# Extract flat clusters with a distance threshold.
# A pairwise distance of 0.5 corresponds to exactly one shared leading word.
threshold = 0.5
clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')

# Display clusters
for label, cluster_id in zip(labels, clusters):
    print(f"Label: {label}, Cluster ID: {cluster_id}")

# %%