59 lines
		
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			59 lines
		
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
	
| # %%
 | |
| import pandas as pd
 | |
| import numpy as np
 | |
| import matplotlib.pyplot as plt
 | |
| 
 | |
| # %%
 | |
# Alternative input kept for quick switching: the raw export.
# data_path = '../../data_import/exports/raw_data.csv'
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
df = pd.read_csv(data_path)

# %%
# Select with the 'MDM' column and renumber the index from 0.
# NOTE(review): presumably 'MDM' is a boolean column, making this a row
# filter that keeps only rows where it is True — confirm against the data.
df = df[df['MDM']].reset_index(drop=True)
 | |
| 
 | |
| # %%
 | |
| # we want to print the string length
 | |
| 
 | |
| # print summary stats
 | |
def summary_stats(arr):
    """Return descriptive statistics for a collection of numbers.

    Args:
        arr: Array-like of numeric values (for example a NumPy array or a
            list). Must contain at least one element.

    Returns:
        A dict mapping a human-readable label to the computed statistic, in
        this order: "Mean", "Median", "Standard Deviation", "Variance",
        "Min", "Max", "Range", "25th Percentile", "75th Percentile", "Sum".
        Standard deviation and variance are computed with NumPy's default
        arguments.

    Raises:
        ValueError: If ``arr`` is empty.
    """
    # Fail up front with a clear message; without this check the error only
    # surfaces later, from np.min, after several statistics were computed.
    if np.size(arr) == 0:
        raise ValueError("summary_stats() requires at least one value")

    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        # Peak-to-peak: max minus min.
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }
 | |
| 
 | |
| # %%
 | |
# Ship-domain text: 'tag_description' concatenated with 'unit'.
# Missing units are replaced by '' so they contribute no characters.
ship_domain_data = df['tag_description'] + df['unit'].fillna('')

# Length of each concatenated string.
ship_domain_array = np.array([len(item) for item in ship_domain_data])

stats = summary_stats(ship_domain_array)

# Print one "label: value" line per statistic.
for key, value in stats.items():
    print(f"{key}: {value}")


# %%
# Histogram of the ship-domain string lengths, using 50 bins.
plt.hist(ship_domain_array, bins=50)
 | |
| # %%
 | |
| 
 | |
| # %%
 | |
# Platform-domain text: 'thing' concatenated with 'property'.
# NOTE(review): unlike the ship-domain cell, no fillna('') is applied here;
# presumably both columns are always populated at this point — confirm.
platform_domain_data = df['thing'] + df['property']

# Length of each concatenated string.
platform_domain_array = np.array([len(item) for item in platform_domain_data])

stats = summary_stats(platform_domain_array)

# Print one "label: value" line per statistic.
for key, value in stats.items():
    print(f"{key}: {value}")
 | |
| 
 | |
| 
 | |
| # %%
 |