# %%
from itertools import combinations

import Levenshtein
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# A class is identified by the concatenation of its 'thing' and 'property'.
# NOTE(review): the two parts are joined without a separator, so distinct
# pairs such as ('ab', 'c') and ('a', 'bc') would collapse into one class --
# confirm this cannot happen in the data.
df['thing_property'] = df['thing'] + df['property']
mdm_list = sorted(set(df['thing_property']))


# %%
def compute_norm_leven(string1: str, string2: str) -> float:
    """Return the Levenshtein distance of two strings scaled by the longer one.

    The raw edit distance is divided by ``max(len(string1), len(string2))``.

    Args:
        string1: First string.
        string2: Second string.

    Returns:
        The normalized distance; 0.0 when both strings are empty.
    """
    max_distance = max(len(string1), len(string2))
    # Two empty strings are identical; returning early also avoids 0 / 0.
    if max_distance == 0:
        return 0.0
    return Levenshtein.distance(string1, string2) / max_distance


def compute_avg_score(strings):
    """Return the mean normalized Levenshtein distance over all string pairs.

    Args:
        strings: Indexable, sized collection of strings (the script passes a
            NumPy array of tag descriptions).

    Returns:
        The mean of ``compute_norm_leven`` over every unordered pair of
        entries, or 0.0 when there are fewer than two strings (no pair to
        compare; a single string is fully similar to itself).
    """
    if len(strings) < 2:
        return 0.0

    # combinations() yields each unordered pair once, in the same order as the
    # upper triangle of a distance matrix, without allocating the n x n matrix.
    pair_distances = [
        compute_norm_leven(first, second)
        for first, second in combinations(strings, 2)
    ]
    return np.mean(pair_distances)


# %%
# Compute one score per class, in the order of mdm_list.
# NOTE(review): assumes every 'tag_description' is a string (no missing
# values) -- Levenshtein.distance is called on the raw column values.
n = len(mdm_list)
score_list = np.zeros(n, dtype=float)
for i, class_name in enumerate(mdm_list):
    df_subset = df[df['thing_property'] == class_name]
    strings = df_subset['tag_description'].to_numpy()
    score_list[i] = compute_avg_score(strings)

# %%
score_list

# %%
# Histogram of the per-class scores.
plt.figure(figsize=(8, 6))
plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Platform Domain Class Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)


# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):
    """Return a dict of descriptive statistics for a numeric array.

    Args:
        arr: Array-like of numbers accepted by the NumPy reductions below.

    Returns:
        A dict mapping a human-readable statistic name to its value.
    """
    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }


stats = summary_stats(score_list)
for key, value in stats.items():
    print(f"{key}: {value}")

# %%