2025-01-13 19:05:13 +09:00
|
|
|
# %%
|
|
|
|
import pandas as pd
|
|
|
|
import matplotlib.pyplot as plt
|
2025-01-14 17:34:17 +09:00
|
|
|
import numpy as np
|
2025-01-13 19:05:13 +09:00
|
|
|
|
|
|
|
# %%
# Load the training table from disk.
# skipinitialspace=True makes the parser ignore spaces that directly follow a delimiter.
data_path = '../data_import/train.csv'
train_df = pd.read_csv(
    filepath_or_buffer=data_path,
    skipinitialspace=True,
)

# %%
# Number of rows observed for each distinct entity_id.
id_counts = train_df['entity_id'].value_counts()

# %%

# %%
# Preview the first 50 entries of the per-id counts.
id_counts[:50]

# %%
# Histogram of rows-per-id, to see how unevenly the ids are represented.
plt.hist(x=id_counts, bins=50)
|
2025-01-14 17:34:17 +09:00
|
|
|
|
|
|
|
# %%
|
|
|
|
def compute_normalized_class_weights(class_counts, max_resamples=10):
    """
    Compute integer resample counts inversely proportional to class counts.

    Each class first gets the inverse-frequency weight ``total / count``.
    The weights are divided by the largest weight, so the rarest class ends
    up at exactly 1, then multiplied by ``max_resamples`` and rounded to the
    nearest integer. The result does NOT sum to 1: it is a per-class count
    whose maximum is ``round(max_resamples)``.

    Note that rounding can yield 0: any class more than
    ``2 * max_resamples`` times as frequent as the rarest class receives a
    resample count of 0 (``np.round`` also rounds exact halves to the nearest
    even integer, so 0.5 becomes 0).

    Args:
        class_counts (array-like): An array or list where each element
            represents the count of samples for a class. Every count must be
            strictly positive.
        max_resamples (int or float): Resample count assigned to the rarest
            class before rounding. Defaults to 10.

    Returns:
        numpy.ndarray: Integer array with the same shape as ``class_counts``
        holding the resample count of each class. Empty input gives an empty
        integer array.

    Raises:
        ValueError: If any count is zero, negative or NaN. Such values would
            otherwise turn into inf/NaN and silently corrupt every entry of
            the result.
    """
    class_counts = np.asarray(class_counts)

    # Nothing to weight; np.max below would fail on a zero-size array.
    if class_counts.size == 0:
        return np.zeros(class_counts.shape, dtype=int)

    # `not all(> 0)` (rather than `any(<= 0)`) also rejects NaN entries.
    if not np.all(class_counts > 0):
        raise ValueError("class_counts must contain only strictly positive values")

    total_samples = np.sum(class_counts)
    class_weights = total_samples / class_counts

    # Rescale so that the highest weight (rarest class) is exactly 1.
    normalized_weights = class_weights / np.max(class_weights)

    # Scale so the highest weight corresponds to `max_resamples`, then round
    # to the nearest integer.
    resample_counts = np.round(normalized_weights * max_resamples).astype(int)
    return resample_counts
|
|
|
|
|
|
|
|
# %%
# Despite the name, id_weights holds integer resample counts per id
# (the rarest id gets max_resamples).
id_weights = compute_normalized_class_weights(
    class_counts=id_counts,
    max_resamples=10,
)

# %%
id_weights

# %%
# Show every training row that belongs to entity_id 536.
id_mask = train_df['entity_id'] == 536
train_df.loc[id_mask]

# %%
# The entity ids, in the same order as the entries of id_counts.
id_counts.index.to_list()
|
2025-01-13 19:05:13 +09:00
|
|
|
# %%
|