# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# %%
# Load the training file.
data_path = '../data_import/train.csv'
train_df = pd.read_csv(data_path, skipinitialspace=True)

# %%
# Number of rows per entity_id, most frequent first.
id_counts = train_df['entity_id'].value_counts()

# %%

# %%
# Inspect the 50 most frequent ids (explicit positional slice).
id_counts.iloc[:50]

# %%
# Distribution of per-id row counts.
plt.hist(id_counts, bins=50)


# %%
def compute_normalized_class_weights(class_counts, max_resamples=10):
    """Compute integer resample counts inversely proportional to class size.

    Each class receives ``max_resamples * min(class_counts) / count``,
    rounded to the nearest integer, so the rarest class gets exactly
    ``max_resamples`` and more frequent classes get proportionally fewer.

    Note that the result is a set of resample counts, not weights that sum
    to 1. Classes that are roughly ``2 * max_resamples`` times (or more) as
    frequent as the rarest class round down to 0.

    Args:
        class_counts (array-like): Number of samples for each class. Every
            entry must be positive and finite.
        max_resamples (int): Resample count assigned to the rarest class.

    Returns:
        numpy.ndarray: Integer array with the same shape as ``class_counts``
        holding the resample count for each class.

    Raises:
        ValueError: If any count is zero, negative, NaN or infinite.
    """
    counts = np.asarray(class_counts, dtype=float)

    # Nothing to weigh; avoid calling min() on an empty array.
    if counts.size == 0:
        return np.zeros(counts.shape, dtype=int)

    # A non-positive or non-finite count would turn into inf/NaN below and
    # then into meaningless integers after the cast.
    if not np.all(np.isfinite(counts)) or np.any(counts <= 0):
        raise ValueError(
            "class_counts must contain only positive, finite values"
        )

    # Inverse-frequency weights scaled so that the largest weight is 1.
    # (total / count) / max(total / count) simplifies to min(count) / count.
    normalized_weights = counts.min() / counts

    # Scale so the highest weight corresponds to `max_resamples`.
    resample_counts = normalized_weights * max_resamples

    # np.round rounds exact halves to the nearest even integer.
    return np.round(resample_counts).astype(int)


# %%
id_weights = compute_normalized_class_weights(id_counts, max_resamples=10)

# %%
id_weights

# %%
# Inspect all rows belonging to a single entity.
id_mask = train_df['entity_id'] == 536
train_df[id_mask]

# %%
id_counts.index.to_list()

# %%