# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# %%
# Load the preprocessed export. The raw export path is kept as a quick toggle.
# data_path = '../../data_import/exports/raw_data.csv'
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
df = pd.read_csv(data_path)

# %%
# Keep only the rows selected by the 'MDM' column and renumber the index.
# NOTE(review): this relies on 'MDM' being parsed as a boolean column - confirm.
df = df[df['MDM']].reset_index(drop=True)


# %%
# Helpers for summarising the string lengths of concatenated text columns.
def summary_stats(arr):
    """Return descriptive statistics for a numeric array.

    Args:
        arr: Array-like of numbers (here: string lengths).

    Returns:
        A dict mapping a human-readable statistic name to its value.
        "Standard Deviation" and "Variance" use NumPy's defaults (ddof=0),
        and "Range" is max - min (``np.ptp``).
    """
    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }


def print_stats(stats):
    """Print every entry of a ``summary_stats`` result as ``name: value``."""
    for key, value in stats.items():
        print(f"{key}: {value}")


# %%
# Ship domain: length of tag_description concatenated with unit.
# Every operand is filled with '' so a missing value counts as zero characters
# instead of turning the whole concatenation into NaN (which has no length).
ship_domain_data = df['tag_description'].fillna('') + df['unit'].fillna('')
ship_domain_array = ship_domain_data.str.len().to_numpy()
stats = summary_stats(ship_domain_array)
print_stats(stats)

# %%
plt.hist(ship_domain_array, bins=50)

# %%

# %%
# Platform domain: length of thing concatenated with property
# (missing values handled the same way as for the ship domain).
platform_domain_data = df['thing'].fillna('') + df['property'].fillna('')
platform_domain_array = platform_domain_data.str.len().to_numpy()
stats = summary_stats(platform_domain_array)
print_stats(stats)

# %%