392 lines
15 KiB
Python
392 lines
15 KiB
Python
# ---
|
|
# jupyter:
|
|
# jupytext:
|
|
# formats: ipynb,py:percent
|
|
# text_representation:
|
|
# extension: .py
|
|
# format_name: percent
|
|
# format_version: '1.3'
|
|
# jupytext_version: 1.16.4
|
|
# kernelspec:
|
|
# display_name: torch
|
|
# language: python
|
|
# name: python3
|
|
# ---
|
|
|
|
# %%
|
|
import pandas as pd
|
|
from collections import defaultdict
|
|
|
|
# Function to calculate the number of unique combinations and total count for each ship
|
|
def calculate_ship_count(group):
    """Summarise `thing_property` per ship.

    Args:
        group: DataFrame with at least `ships_idx` and `thing_property`
            columns.

    Returns:
        DataFrame with one row per `ships_idx` and the columns
        `ships_idx`, `comb_count` (number of distinct `thing_property`
        values) and `total_count` (number of rows for that ship).
    """
    per_ship = group.groupby('ships_idx')['thing_property']
    # Named aggregation yields the final column names directly.
    summary = per_ship.agg(comb_count='nunique', total_count='size')
    return summary.reset_index()
|
|
|
|
# Function to calculate the combination count and total count for a group
|
|
def calculate_group_count(group):
    """Return `(comb_count, total_count)` for the rows in `group`.

    Args:
        group: DataFrame with a `thing_property` column.

    Returns:
        Tuple of the number of distinct `thing_property` values and the
        total number of entries in that column.
    """
    properties = group['thing_property']
    return properties.nunique(), properties.size
|
|
|
|
# Function to calculate the increase in combination count when a ship is added to a group
|
|
def calculate_comb_count_increase(groups, g, ship_idx, mdm):
    """Return how many new distinct `thing_property` values a ship would add.

    Args:
        groups: Mapping from group id to the list of ship ids already
            assigned to that group. It is not modified beyond the lookup
            of `groups[g]`.
        g: Id of the group being evaluated.
        ship_idx: Candidate ship id to add to group `g`.
        mdm: DataFrame with `ships_idx` and `thing_property` columns.

    Returns:
        Combination count of group `g` with the candidate ship included,
        minus its current combination count.
    """
    current_ships = groups[g]
    # Only the evaluated group matters, so build just that one candidate
    # list instead of copying every group's ship list on each call.
    candidate_ships = [*current_ships, ship_idx]

    candidate_data = mdm[mdm['ships_idx'].isin(candidate_ships)]
    new_comb_count, _ = calculate_group_count(candidate_data)

    current_data = mdm[mdm['ships_idx'].isin(current_ships)]
    current_comb_count, _ = calculate_group_count(current_data)

    return new_comb_count - current_comb_count
|
|
|
|
# Function to calculate the increase in total count when a ship is added to a group
|
|
def calculate_total_count_increase(groups, g, ship_idx, mdm):
    """Return how many rows a ship would add to a group's total count.

    Args:
        groups: Mapping from group id to the list of ship ids already
            assigned to that group. It is not modified beyond the lookup
            of `groups[g]`.
        g: Id of the group being evaluated.
        ship_idx: Candidate ship id to add to group `g`.
        mdm: DataFrame with `ships_idx` and `thing_property` columns.

    Returns:
        Total count of group `g` with the candidate ship included, minus
        its current total count.
    """
    current_ships = groups[g]
    # Only the evaluated group matters, so build just that one candidate
    # list instead of copying every group's ship list on each call.
    candidate_ships = [*current_ships, ship_idx]

    candidate_data = mdm[mdm['ships_idx'].isin(candidate_ships)]
    _, new_total_count = calculate_group_count(candidate_data)

    current_data = mdm[mdm['ships_idx'].isin(current_ships)]
    _, current_total_count = calculate_group_count(current_data)

    return new_total_count - current_total_count
|
|
|
|
# Function to find the ship that will bring the total count closest to the target
|
|
def find_closest_total_count_ship(groups, g, remaining_ships, mdm, target_total_count):
    """Pick the ship that brings group `g`'s total count closest to a target.

    Args:
        groups: Mapping from group id to the list of assigned ship ids.
        g: Id of the group to extend.
        remaining_ships: Iterable of unassigned ship ids to choose from.
        mdm: DataFrame with `ships_idx` and `thing_property` columns.
        target_total_count: Total count the group should approach.

    Returns:
        `(ship_idx, increase)` for the candidate whose resulting total
        count has the smallest absolute distance to the target; the first
        such candidate wins ties. `(None, 0)` when there are no candidates.
    """
    members = mdm[mdm['ships_idx'].isin(groups[g])]
    _, base_total = calculate_group_count(members)

    chosen_ship, chosen_increase = None, 0
    smallest_gap = None
    for ship_idx in remaining_ships:
        increase = calculate_total_count_increase(groups, g, ship_idx, mdm)
        gap = abs(target_total_count - (base_total + increase))
        # Strict comparison keeps the earliest candidate on ties.
        if smallest_gap is None or gap < smallest_gap:
            smallest_gap = gap
            chosen_ship, chosen_increase = ship_idx, increase

    return chosen_ship, chosen_increase
|
|
|
|
# Function to find the ship that gives the maximum increase in combination count
|
|
def find_max_increase_ship(groups, g, remaining_ships, mdm):
    """Pick the ship that adds the most new combinations to group `g`.

    Args:
        groups: Mapping from group id to the list of assigned ship ids.
        g: Id of the group to extend.
        remaining_ships: Iterable of unassigned ship ids to choose from.
        mdm: DataFrame with `ships_idx` and `thing_property` columns.

    Returns:
        `(ship_idx, increase)` for the candidate with the largest
        combination-count increase; the first such candidate wins ties.
        `(None, 0)` when there are no candidates, matching the other
        `find_closest_*` selectors instead of raising from `max()`.
    """
    comb_count_increase = [
        (ship_idx, calculate_comb_count_increase(groups, g, ship_idx, mdm))
        for ship_idx in remaining_ships
    ]

    # max() on an empty sequence raises ValueError; report "nothing to
    # pick" the same way the sibling selectors do.
    if not comb_count_increase:
        return None, 0

    selected_ship_idx, max_increase = max(comb_count_increase, key=lambda pair: pair[1])
    return selected_ship_idx, max_increase
|
|
|
|
# Function to find the ship that will bring the combination count closest to the target
|
|
def find_closest_comb_count_ship(groups, g, remaining_ships, mdm, target_comb_count):
    """Pick the ship that brings group `g`'s combination count closest to a target.

    Args:
        groups: Mapping from group id to the list of assigned ship ids.
        g: Id of the group to extend.
        remaining_ships: Iterable of unassigned ship ids to choose from.
        mdm: DataFrame with `ships_idx` and `thing_property` columns.
        target_comb_count: Combination count the group should approach.

    Returns:
        `(ship_idx, increase)` for the candidate whose resulting
        combination count has the smallest absolute distance to the
        target; the first such candidate wins ties. `(None, 0)` when
        there are no candidates.
    """
    members = mdm[mdm['ships_idx'].isin(groups[g])]
    base_comb, _ = calculate_group_count(members)

    chosen_ship, chosen_increase = None, 0
    smallest_gap = None
    for ship_idx in remaining_ships:
        increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)
        gap = abs(target_comb_count - (base_comb + increase))
        # Strict comparison keeps the earliest candidate on ties.
        if smallest_gap is None or gap < smallest_gap:
            smallest_gap = gap
            chosen_ship, chosen_increase = ship_idx, increase

    return chosen_ship, chosen_increase
|
|
|
|
# Function to find the group with the maximum combination count
|
|
def find_group_with_max_comb_count(groups, mdm):
    """Find the group with the largest combination count.

    Groups are looked up as `groups[0] .. groups[len(groups) - 1]`.

    Args:
        groups: Mapping (or sequence) from group position to ship ids.
        mdm: DataFrame with `ships_idx` and `thing_property` columns.

    Returns:
        `(group_idx, comb_count)` of the first group holding the maximum
        combination count, or `(-1, -1)` when `groups` is empty.
    """
    per_group = []
    for idx in range(len(groups)):
        rows = mdm[mdm['ships_idx'].isin(groups[idx])]
        per_group.append((idx, calculate_group_count(rows)[0]))

    # max() returns the first maximal element, so ties resolve to the
    # lowest group index.
    return max(per_group, key=lambda pair: pair[1], default=(-1, -1))
|
|
|
|
# Function to find the group with the maximum total count
|
|
def find_group_with_max_total_count(groups, mdm):
    """Find the group with the largest total count.

    Groups are looked up as `groups[0] .. groups[len(groups) - 1]`.

    Args:
        groups: Mapping (or sequence) from group position to ship ids.
        mdm: DataFrame with `ships_idx` and `thing_property` columns.

    Returns:
        `(group_idx, total_count)` of the first group holding the maximum
        total count, or `(-1, -1)` when `groups` is empty.
    """
    per_group = []
    for idx in range(len(groups)):
        rows = mdm[mdm['ships_idx'].isin(groups[idx])]
        per_group.append((idx, calculate_group_count(rows)[1]))

    # max() returns the first maximal element, so ties resolve to the
    # lowest group index.
    return max(per_group, key=lambda pair: pair[1], default=(-1, -1))
|
|
|
|
import pandas as pd
|
|
from collections import defaultdict
|
|
|
|
# Read the preprocessed export.
data_file_path = 'exports/preprocessed_data.csv'
data = pd.read_csv(data_file_path)

# Two working frames: only the rows where the MDM column is True, and
# every row. Both are explicit copies, so adding a column below does not
# write into `data`.
mdm_true = data[data['MDM']].copy()
mdm_all = data.copy()

# Add a combined "<thing>_<property>" key column to both frames.
for frame in (mdm_true, mdm_all):
    frame['thing_property'] = frame['thing'] + '_' + frame['property']
|
|
|
|
# Per-ship statistics over the MDM rows, and the empty group containers.
ship_count = calculate_ship_count(mdm_true)
num_groups = 5
groups = defaultdict(list)

# Rank ships from the most to the fewest distinct thing_property values.
sorted_ships = ship_count.sort_values(by='comb_count', ascending=False)
sorted_ship_ids = sorted_ships['ships_idx'].values

# Seed each group with one of the top `num_groups` ships, in rank order.
for i in range(num_groups):
    groups[i].append(sorted_ship_ids[i])

# Everything after the seeds still has to be allocated.
remaining_ships = sorted_ship_ids[num_groups:]
|
|
|
|
# Allocate the remaining ships to the groups, one round at a time.
while len(remaining_ships) > 0:
    # Re-compute every group's combination count and rank the groups from
    # the smallest to the largest count for this round.
    group_comb_counts = []
    for g in range(num_groups):
        group_ships = groups[g]
        group_data = mdm_true[mdm_true['ships_idx'].isin(group_ships)]
        comb_count, _ = calculate_group_count(group_data)
        group_comb_counts.append((g, comb_count))

    group_comb_counts.sort(key=lambda x: x[1])

    # Groups for which the chosen ship adds no new combination; they are
    # handled by total count after the main pass.
    remaining_group = []

    # `rank` is the position in the sorted list; `g` is the group id.
    # enumerate() replaces a per-iteration list.index() search.
    for rank, (g, _) in enumerate(group_comb_counts):
        if len(remaining_ships) == 0:
            break

        if rank == 0:
            # The group with the smallest count takes the ship that adds
            # the most new combinations.
            selected_ship_idx, comb_increase = find_max_increase_ship(
                groups, g, remaining_ships, mdm_true)
        else:
            # Every other group takes the ship that moves it closest to
            # the current largest combination count.
            _, max_comb_count = find_group_with_max_comb_count(groups, mdm_true)
            selected_ship_idx, comb_increase = find_closest_comb_count_ship(
                groups, g, remaining_ships, mdm_true, max_comb_count)

        if comb_increase == 0:
            # No gain in combinations: defer this group to the
            # total-count pass below.
            remaining_group.append(g)
        else:
            groups[g].append(selected_ship_idx)
            remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]

    # Deferred groups take the ship that moves their total count closest
    # to the current largest total count.
    for g in remaining_group:
        if len(remaining_ships) == 0:
            break
        _, max_total_count = find_group_with_max_total_count(groups, mdm_true)
        selected_ship_idx, count_increase = find_closest_total_count_ship(
            groups, g, remaining_ships, mdm_true, max_total_count)
        if selected_ship_idx is not None:
            groups[g].append(selected_ship_idx)
            remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]
|
|
|
|
# Collect (group id, comb_count over MDM rows, row count over all rows)
# for every group.
group_comb_counts = []
for g in range(num_groups):
    group_ships = groups[g]
    comb_count, total_count = calculate_group_count(
        mdm_true[mdm_true['ships_idx'].isin(group_ships)])
    # Row count including the rows where MDM is False.
    _, total_count_all = calculate_group_count(
        mdm_all[mdm_all['ships_idx'].isin(group_ships)])
    group_comb_counts.append((g, comb_count, total_count_all))

# Largest combination count first; ties keep their original order.
group_comb_counts = sorted(group_comb_counts, key=lambda entry: entry[1], reverse=True)

# Re-key the groups by their position in that ranking.
sorted_groups = defaultdict(
    list,
    {position: groups[g] for position, (g, _, _) in enumerate(group_comb_counts)},
)

# Final output of group allocation.
print("Final Group Allocation:")
for g in range(num_groups):
    group_ships = sorted_groups[g]
    in_true = mdm_true['ships_idx'].isin(group_ships)
    in_all = mdm_all['ships_idx'].isin(group_ships)

    comb_count, total_count = calculate_group_count(mdm_true[in_true])
    # Row count including the rows where MDM is False.
    _, total_count_all = calculate_group_count(mdm_all[in_all])

    print(f"Group {g + 1}: Ships_idx = {group_ships}, PD type = {comb_count}, PD = {total_count}, SD = {total_count_all}")
|
|
|
|
|
|
# %%
|
|
import pandas as pd
|
|
from sklearn.model_selection import GroupKFold
|
|
|
|
# Summarise the custom group allocation (labelled 'BGKF'): one record
# per group.
comb_counts = []
total_counts = []
ship_counts = []
custom_results = []

for g in range(num_groups):
    group_ships = groups[g]
    in_true = mdm_true['ships_idx'].isin(group_ships)
    in_all = mdm_all['ships_idx'].isin(group_ships)

    comb_count, total_count = calculate_group_count(mdm_true[in_true])
    # Row count including the rows where MDM is False.
    _, total_count_all = calculate_group_count(mdm_all[in_all])

    record = {
        'Group': g + 1,
        'Allocation': 'BGKF',
        'Comb_count': comb_count,
        'Total_count': total_count,
        'Total_count_all': total_count_all,
        'Ship_count': len(group_ships),
        'Ships_idx': list(group_ships),
    }
    custom_results.append(record)

# Largest combination count first; ties keep their original order.
custom_results = sorted(custom_results, key=lambda record: record['Comb_count'], reverse=True)

# Renumber the groups 1..N to follow the sorted order.
for position, record in enumerate(custom_results, start=1):
    record['Group'] = position
|
|
|
|
# Summarise a plain GroupKFold allocation (labelled 'GKF') for comparison.
# The number of folds follows `num_groups` so both allocations always
# describe the same number of groups.
gkf = GroupKFold(n_splits=num_groups)
gkf_results = []

for i, (train_idx, test_idx) in enumerate(gkf.split(mdm_true, groups=mdm_true['ships_idx'])):
    # Each fold's test split plays the role of one group.
    test_group = mdm_true.iloc[test_idx]
    comb_count, total_count = calculate_group_count(test_group)

    # Row count including the rows where MDM is False.
    test_group_ships = test_group['ships_idx'].unique()
    test_group_all = mdm_all[mdm_all['ships_idx'].isin(test_group_ships)]
    _, total_count_all = calculate_group_count(test_group_all)

    gkf_results.append({
        'Group': i + 1,
        'Allocation': 'GKF',
        'Comb_count': comb_count,
        'Total_count': total_count,
        'Total_count_all': total_count_all,
        'Ship_count': test_group['ships_idx'].nunique(),
        # Reuse the unique ship ids computed above.
        'Ships_idx': list(test_group_ships),
    })

# Sort the GKF allocation by comb_count in descending order.
gkf_results.sort(key=lambda x: x['Comb_count'], reverse=True)

# Renumber the groups 1..N to follow the sorted order.
for i, result in enumerate(gkf_results):
    result['Group'] = i + 1

# Stack the BGKF records on top of the GKF records in one DataFrame.
combined_results = custom_results + gkf_results
combined_df = pd.DataFrame(combined_results)

# Write the combined comparison table to a single CSV file.
combined_df.to_csv('exports/combined_group_allocation.csv', index=False)

print("CSV file has been generated: 'combined_group_allocation.csv'")
|
|
|
|
|
|
# %%
|
|
import os
|
|
import pandas as pd
|
|
from sklearn.model_selection import KFold
|
|
|
|
def save_datasets_for_group(groups, mdm, data, output_dir='exports/dataset', n_splits=4):
    """Write train/valid/test CSV files for each held-out group.

    For every group position `i` a folder `<output_dir>/group_<i + 1>` is
    created (if missing) and four files are written into it:

    * `train.csv` / `valid.csv`: the first shuffled KFold split of the
      `mdm` rows whose ship belongs to any other group.
    * `test_all.csv`: the `data` rows whose ship belongs to group `i`.
    * `train_all.csv`: `train.csv` rows followed by `valid.csv` rows.

    Args:
        groups: Mapping (or sequence) from group position to ship ids,
            indexed `0 .. len(groups) - 1`.
        mdm: DataFrame with a `ships_idx` column; source of the
            train/valid rows.
        data: DataFrame with a `ships_idx` column; source of the test rows.
        output_dir: Directory under which the group folders are created.
        n_splits: Number of KFold splits; only the first split is used.
    """
    for i in range(len(groups)):
        group_folder = os.path.join(output_dir, 'group' + '_' + str(i + 1))
        os.makedirs(group_folder, exist_ok=True)

        # Held-out rows come from `data`, not from `mdm`.
        held_out_ships = groups[i]
        test_all_data = data[data['ships_idx'].isin(held_out_ships)]

        # Training rows come from the ships of every other group.
        train_group_ships = [
            ship
            for g in range(len(groups))
            if g != i
            for ship in groups[g]
        ]
        train_data = mdm[mdm['ships_idx'].isin(train_group_ships)]

        # Take the first split of a seeded, shuffled KFold as train/valid.
        splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        train_positions, valid_positions = next(splitter.split(train_data))
        final_train_data = train_data.iloc[train_positions]
        valid_data = train_data.iloc[valid_positions]

        # NOTE(review): earlier comments described train_all.csv as "all
        # train set with non-mdm", but it is built only from `mdm` rows
        # (train followed by valid) - confirm which one is intended.
        train_all_data = pd.concat([final_train_data, valid_data])

        outputs = (
            ('train.csv', final_train_data),
            ('valid.csv', valid_data),
            ('test_all.csv', test_all_data),
            ('train_all.csv', train_all_data),
        )
        for file_name, frame in outputs:
            frame.to_csv(os.path.join(group_folder, file_name), index=False, encoding='utf-8-sig')

        print(f"Group {i + 1} datasets saved in {group_folder}")
|
|
|
|
# Example usage: write the per-group dataset folders under exports/dataset.
# Train/valid rows are drawn from `mdm_true`, test rows from the full `data`.
save_datasets_for_group(groups, mdm_true, data, output_dir='exports/dataset', n_splits=4)
|
|
|