# %%
import pandas as pd
import Levenshtein
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
df = pd.read_csv(data_path, skipinitialspace=True)

# each MDM class is identified by the concatenation of its thing and property
df['thing_property'] = df['thing'] + df['property']
mdm_list = sorted(set(df['thing_property']))

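# %%
# optional sanity check (illustrative only): peek at the columns used below
# and confirm how many distinct thing_property classes were found
print(df[['thing', 'property', 'tag_description']].head())
print(f"number of classes: {len(mdm_list)}")
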
# %%
def compute_norm_leven(string1, string2):
    # normalize the Levenshtein distance by the length of the longer string
    # so that the score lies in [0, 1]
    max_distance = max(len(string1), len(string2))
    leven_distance = Levenshtein.distance(string1, string2)
    norm_leven = leven_distance / max_distance
    return norm_leven


def compute_avg_score(strings):
    n = len(strings)

    # if the group only has 1 string, it is fully similar to itself
    if n == 1:
        return 0

    # create an empty pairwise distance matrix
    distance_matrix = np.zeros((n, n), dtype=float)

    # fill only the upper triangular part; the matrix is symmetric and the
    # diagonal is zero, so the remaining entries are redundant
    for i in range(n):
        for j in range(i + 1, n):
            dist = compute_norm_leven(strings[i], strings[j])
            distance_matrix[i, j] = dist

    # average over the unique pairs in the upper triangle
    upper_triangular_distances = distance_matrix[np.triu_indices(n, k=1)]
    mean_distance = np.mean(upper_triangular_distances)
    return mean_distance

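# %%
# quick illustrative check of the scoring helpers on made-up strings
# (example values only, not taken from the dataset)
print(compute_norm_leven("boiler inlet temp", "boiler outlet temp"))
print(compute_avg_score(["pump a", "pump b", "generator"]))
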
# %%
# subset to each thing_property class and compute its average intra-class distance
n = len(mdm_list)
score_list = np.zeros(n, dtype=float)

for i in tqdm(range(n)):
    df_subset = df[df['thing_property'] == mdm_list[i]]
    strings = df_subset['tag_description'].to_numpy()
    score_list[i] = compute_avg_score(strings)

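# %%
# optional check (illustrative): show the classes with the largest average
# intra-class distance, i.e. the least internally consistent tag descriptions
for idx in np.argsort(score_list)[::-1][:10]:
    print(f"{mdm_list[idx]}: {score_list[idx]:.3f}")
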
# %%
score_list

# %%
# distribution of the average intra-class distances across classes
plt.figure(figsize=(8, 6))
plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Platform Domain Class Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)

# %%
# summary statistics of the computed normalized Levenshtein distances
def summary_stats(arr):
    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }


stats = summary_stats(score_list)

for key, value in stats.items():
    print(f"{key}: {value}")

# %%