# hipom_data_mapping/analysis/data_properties/character_count.py
#
# 59 lines
# 1.3 KiB
# Python

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %%
# Input CSV for this analysis. The raw export is kept below as an alternative;
# swap the two assignments to analyse the data before preprocessing.
# data_path = '../../data_import/exports/raw_data.csv'
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
df = pd.read_csv(data_path)
# %%
# Keep only the rows whose 'MDM' entry is true and renumber them from 0.
# NOTE(review): assumes 'MDM' is a boolean column -- confirm against the export.
df = df.loc[df['MDM']].reset_index(drop=True)
# %%
# Summarise the string lengths (character counts) of the text fields.
# Helper that computes the summary statistics for an array of lengths.
def summary_stats(arr):
    """Return descriptive statistics for a collection of numbers.

    Args:
        arr: Array-like of numeric values. The callers in this file pass a
            1-D NumPy array of string lengths.

    Returns:
        A dict mapping a display label to the corresponding NumPy result, in
        this order: "Mean", "Median", "Standard Deviation", "Variance",
        "Min", "Max", "Range", "25th Percentile", "75th Percentile", "Sum".
        Standard deviation and variance use NumPy's defaults (``ddof=0``).

    Raises:
        ValueError: If ``arr`` is empty; NumPy's min/max reductions have no
            identity for a zero-size array.
    """
    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        # Peak-to-peak, i.e. max - min.
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }
# %%
# Ship-domain text: the tag description immediately followed by the unit.
# Missing values on either side are replaced by '' so every row is a string;
# a NaN operand would make the concatenation NaN and len() would then raise
# TypeError.
ship_domain_data = df['tag_description'].fillna('') + df['unit'].fillna('')
# Character count of each concatenated string.
ship_domain_array = np.array([len(item) for item in ship_domain_data])
stats = summary_stats(ship_domain_array)
for key, value in stats.items():
    print(f"{key}: {value}")
# %%
# Histogram of the ship-domain character counts.
plt.hist(ship_domain_array, bins=50)
# %%
# %%
# Platform-domain text: 'thing' immediately followed by 'property'.
# Missing values are replaced by '' (same treatment as the ship-domain text)
# so that len() never receives a NaN.
# NOTE(review): rows kept by the MDM filter presumably always have both
# fields populated, in which case fillna is a no-op -- confirm.
platform_domain_data = df['thing'].fillna('') + df['property'].fillna('')
# Character count of each concatenated string.
platform_domain_array = np.array([len(item) for item in platform_domain_data])
stats = summary_stats(platform_domain_array)
for key, value in stats.items():
    print(f"{key}: {value}")
# %%