# hipom_data_mapping/analysis/string_levenshtein/within_same_class.py
#
# (file listing metadata: 89 lines, 2.3 KiB, Python)
# %%
import pandas as pd
import Levenshtein
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
# %%
# Load the MDM mapping export and derive the class key for grouping.
data_path = '../../data_import/exports/data_mapping_mdm.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# Each (thing, property) pair identifies one platform-domain class.
df['thing_property'] = df['thing'] + df['property']
# sorted() already returns a list, so the inner list() wrapper was redundant.
mdm_list = sorted(set(df['thing_property']))
# %%
def compute_norm_leven(string1, string2):
    """Return the Levenshtein distance between two strings, normalized
    by the length of the longer string (range [0.0, 1.0]).

    Fix: the original divided by ``max(len(a), len(b))`` unconditionally,
    raising ZeroDivisionError when both strings are empty. Two empty
    strings are identical, so that case now returns 0.0.
    """
    max_distance = max(len(string1), len(string2))
    if max_distance == 0:
        return 0.0
    leven_distance = Levenshtein.distance(string1, string2)
    return leven_distance / max_distance
def compute_avg_score(strings):
    """Return the mean normalized Levenshtein distance over all
    unordered pairs in ``strings``.

    Fixes / improvements over the original:
    - returns 0.0 (float, consistent with the pairwise scores) instead of
      int 0 for a single-element group;
    - handles an empty group, which previously produced ``np.mean([])``
      (nan plus a RuntimeWarning);
    - accumulates the upper-triangular distances directly instead of
      allocating a full n x n zero matrix only to read its upper triangle.
    """
    n = len(strings)
    # A group with zero or one string is fully similar to itself.
    if n <= 1:
        return 0.0
    # Only i < j pairs are needed; the distance is symmetric.
    pairwise = [
        compute_norm_leven(strings[i], strings[j])
        for i in range(n)
        for j in range(i + 1, n)
    ]
    return float(np.mean(pairwise))
# %%
# Score every platform-domain class: subset the frame to one class at a
# time and average the pairwise distances of its tag descriptions.
n_classes = len(mdm_list)
score_list = np.zeros(n_classes, dtype=float)
for idx, class_key in enumerate(mdm_list):
    strings = df.loc[df['thing_property'] == class_key, 'tag_description'].to_numpy()
    score_list[idx] = compute_avg_score(strings)
# %%
score_list
# %%
# Histogram of the per-class average distances, saved to disk.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
ax.set_xlabel("Normalized Levenshtein Distance")
ax.set_ylabel("Platform Domain Class Count")
fig.tight_layout()
fig.savefig("histogram.png", dpi=300)
# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):
    """Return a dict of descriptive statistics for the array ``arr``."""
    # Map each label to its statistic; a comprehension applies them all.
    stat_fns = {
        "Mean": np.mean,
        "Median": np.median,
        "Standard Deviation": np.std,
        "Variance": np.var,
        "Min": np.min,
        "Max": np.max,
        "Range": np.ptp,
        "25th Percentile": lambda a: np.percentile(a, 25),
        "75th Percentile": lambda a: np.percentile(a, 75),
        "Sum": np.sum,
    }
    return {label: fn(arr) for label, fn in stat_fns.items()}
# Print each statistic on its own "label: value" line.
stats = summary_stats(score_list)
for label, stat in stats.items():
    print(f"{label}: {stat}")
# %%