59 lines
1.3 KiB
Python
59 lines
1.3 KiB
Python
|
# %%
|
||
|
import pandas as pd
|
||
|
import numpy as np
|
||
|
import matplotlib.pyplot as plt
|
||
|
|
||
|
# %%
|
||
|
# Input selection: the preprocessed export is active; the raw export path is
# kept below as a commented-out alternative.
# data_path = '../../data_import/exports/raw_data.csv'
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
df = pd.read_csv(data_path)

# %%
# Use the 'MDM' column as a row mask, then renumber the surviving rows from 0.
# NOTE(review): assumes 'MDM' was parsed as a boolean column -- confirm.
df = df[df['MDM']]
df = df.reset_index(drop=True)
|
||
|
|
||
|
# %%
|
||
|
# Goal: report length statistics for the concatenated text columns.
|
||
|
|
||
|
# Compute summary stats (returned as a dict; the caller prints them).
|
||
|
def summary_stats(arr):
    """Return descriptive statistics for a non-empty numeric array-like.

    Args:
        arr: Array-like of numbers (for example a list or a NumPy array).

    Returns:
        dict mapping a label to the corresponding statistic, in this order:
        "Mean", "Median", "Standard Deviation", "Variance", "Min", "Max",
        "Range", "25th Percentile", "75th Percentile", "Sum".

    Raises:
        ValueError: If ``arr`` has no elements.
    """
    # Fail early with a clear message; without this guard NumPy first emits
    # NaN warnings for the mean/median and then raises an opaque ValueError
    # from the min reduction.
    if np.size(arr) == 0:
        raise ValueError("summary_stats() requires at least one value")

    # Both quartiles in a single pass over the data.
    q25, q75 = np.percentile(arr, [25, 75])

    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        # np.std / np.var use their defaults here (ddof=0).
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        "Range": np.ptp(arr),
        "25th Percentile": q25,
        "75th Percentile": q75,
        "Sum": np.sum(arr),
    }
|
||
|
|
||
|
# %%
|
||
|
# Ship-side text: 'tag_description' with 'unit' appended; a missing unit is
# treated as an empty string.
# NOTE(review): 'tag_description' itself is not filled -- a missing value
# there would make len() fail below; presumably it is never missing, confirm.
ship_domain_data = df['tag_description'] + df['unit'].fillna('')

# Character count of every combined string.
ship_domain_array = np.array(list(map(len, ship_domain_data)))

stats = summary_stats(ship_domain_array)

# One "label: value" line per statistic.
for key, value in stats.items():
    print(f"{key}: {value}")

# %%
# Distribution of the combined string lengths, bucketed into 50 bins.
plt.hist(ship_domain_array, bins=50)
|
||
|
# %%
|
||
|
|
||
|
# %%
|
||
|
# Platform-side text: 'thing' and 'property' joined into one string per row.
# NOTE(review): no fillna here, unlike the ship-side cell -- presumably both
# columns are always populated for the rows kept by the MDM filter; confirm.
platform_domain_data = df['thing'] + df['property']

# Character count of every combined string.
platform_domain_array = np.array(list(map(len, platform_domain_data)))

stats = summary_stats(platform_domain_array)

# One "label: value" line per statistic.
for key, value in stats.items():
    print(f"{key}: {value}")
|
||
|
|
||
|
|
||
|
# %%
|