# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---

# %%
import pandas as pd
from collections import defaultdict


# Function to calculate the number of unique combinations and total count for each ship
def calculate_ship_count(group):
    """Return per-ship unique and total ``thing_property`` counts.

    Args:
        group: DataFrame with at least the columns ``ships_idx`` and
            ``thing_property``.

    Returns:
        DataFrame with one row per ``ships_idx`` and the columns
        ``ships_idx``, ``comb_count`` (number of distinct ``thing_property``
        values) and ``total_count`` (number of rows).
    """
    ship_count = (
        group.groupby('ships_idx')['thing_property']
        .agg(['nunique', 'size'])
        .reset_index()
    )
    ship_count.columns = ['ships_idx', 'comb_count', 'total_count']
    return ship_count


# Function to calculate the combination count and total count for a group
def calculate_group_count(group):
    """Return ``(comb_count, total_count)`` for the rows in ``group``.

    ``comb_count`` is the number of distinct ``thing_property`` values and
    ``total_count`` is the number of rows in that column.
    """
    comb_count = group['thing_property'].nunique()
    total_count = group['thing_property'].size
    return comb_count, total_count


def _count_increase_for_ship(groups, g, ship_idx, mdm):
    """Return ``(comb_increase, total_increase)`` if ``ship_idx`` joined group ``g``.

    Only a candidate ship list for group ``g`` is built; ``groups[g]`` itself
    is never modified, and the other groups are not touched at all.
    """
    current_ships = groups[g]
    # New list, so the caller's group membership stays unchanged.
    candidate_ships = [*current_ships, ship_idx]

    current_data = mdm[mdm['ships_idx'].isin(current_ships)]
    candidate_data = mdm[mdm['ships_idx'].isin(candidate_ships)]

    current_comb, current_total = calculate_group_count(current_data)
    new_comb, new_total = calculate_group_count(candidate_data)
    return new_comb - current_comb, new_total - current_total


# Function to calculate the increase in combination count when a ship is added to a group
def calculate_comb_count_increase(groups, g, ship_idx, mdm):
    """Return how many new distinct ``thing_property`` values ``ship_idx`` adds to group ``g``.

    Args:
        groups: Mapping of group id to the list of ``ships_idx`` values in it.
        g: Id of the group being evaluated.
        ship_idx: Candidate ship to (hypothetically) add to the group.
        mdm: DataFrame with ``ships_idx`` and ``thing_property`` columns.

    Returns:
        Distinct-value count of the group with the ship minus the count
        without it. ``groups[g]`` is not modified.
    """
    comb_increase, _ = _count_increase_for_ship(groups, g, ship_idx, mdm)
    return comb_increase


# Function to calculate the increase in total count when a ship is added to a group
def calculate_total_count_increase(groups, g, ship_idx, mdm):
    """Return how many rows ``ship_idx`` adds to group ``g``.

    Args:
        groups: Mapping of group id to the list of ``ships_idx`` values in it.
        g: Id of the group being evaluated.
        ship_idx: Candidate ship to (hypothetically) add to the group.
        mdm: DataFrame with ``ships_idx`` and ``thing_property`` columns.

    Returns:
        Row count of the group with the ship minus the row count without it.
        ``groups[g]`` is not modified.
    """
    _, total_increase = _count_increase_for_ship(groups, g, ship_idx, mdm)
    return total_increase
# Function to find the ship that will bring the total count closest to the target
def find_closest_total_count_ship(groups, g, remaining_ships, mdm, target_total_count):
    """Pick the ship whose addition brings group ``g``'s row count closest to a target.

    Args:
        groups: Mapping of group id to the list of ``ships_idx`` values in it.
        g: Id of the group that will receive a ship.
        remaining_ships: Iterable of candidate ``ships_idx`` values.
        mdm: DataFrame with ``ships_idx`` and ``thing_property`` columns.
        target_total_count: Row count the group should approach.

    Returns:
        ``(ship_idx, total_increase)`` for the best candidate (the first one
        on ties), or ``(None, 0)`` when there are no candidates.
    """
    total_count_differences = []
    current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]
    _, current_total_count = calculate_group_count(current_group_data)

    for ship_idx in remaining_ships:
        increase = calculate_total_count_increase(groups, g, ship_idx, mdm)
        new_total_count = current_total_count + increase
        difference = abs(target_total_count - new_total_count)
        total_count_differences.append((ship_idx, difference, increase))

    if not total_count_differences:
        return None, 0

    closest_ship = min(total_count_differences, key=lambda x: x[1])
    selected_ship_idx, _, selected_increase = closest_ship
    return selected_ship_idx, selected_increase


# Function to find the ship that gives the maximum increase in combination count
def find_max_increase_ship(groups, g, remaining_ships, mdm):
    """Pick the ship that adds the most new distinct ``thing_property`` values to group ``g``.

    Args:
        groups: Mapping of group id to the list of ``ships_idx`` values in it.
        g: Id of the group that will receive a ship.
        remaining_ships: Iterable of candidate ``ships_idx`` values.
        mdm: DataFrame with ``ships_idx`` and ``thing_property`` columns.

    Returns:
        ``(ship_idx, comb_increase)`` for the best candidate (the first one
        on ties), or ``(None, 0)`` when there are no candidates.
    """
    comb_count_increase = []
    for ship_idx in remaining_ships:
        increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)
        comb_count_increase.append((ship_idx, increase))

    # max() raises ValueError on an empty sequence; return the same
    # "nothing to pick" result as the other find_* helpers instead.
    if not comb_count_increase:
        return None, 0

    max_increase_ship = max(comb_count_increase, key=lambda x: x[1])
    selected_ship_idx, max_increase = max_increase_ship
    return selected_ship_idx, max_increase


# Function to find the ship that will bring the combination count closest to the target
def find_closest_comb_count_ship(groups, g, remaining_ships, mdm, target_comb_count):
    """Pick the ship whose addition brings group ``g``'s distinct count closest to a target.

    Args:
        groups: Mapping of group id to the list of ``ships_idx`` values in it.
        g: Id of the group that will receive a ship.
        remaining_ships: Iterable of candidate ``ships_idx`` values.
        mdm: DataFrame with ``ships_idx`` and ``thing_property`` columns.
        target_comb_count: Distinct ``thing_property`` count to approach.

    Returns:
        ``(ship_idx, comb_increase)`` for the best candidate (the first one
        on ties), or ``(None, 0)`` when there are no candidates.
    """
    comb_count_differences = []
    current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]
    current_comb_count, _ = calculate_group_count(current_group_data)

    for ship_idx in remaining_ships:
        increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)
        new_comb_count = current_comb_count + increase
        difference = abs(target_comb_count - new_comb_count)
        comb_count_differences.append((ship_idx, difference, increase))

    if not comb_count_differences:
        return None, 0

    closest_ship = min(comb_count_differences, key=lambda x: x[1])
    selected_ship_idx, _, selected_increase = closest_ship
    return selected_ship_idx, selected_increase


# Function to find the group with the maximum combination count
def find_group_with_max_comb_count(groups, mdm):
    """Return ``(group_idx, comb_count)`` of the group with the most distinct values.

    Groups are looked up as ``groups[0] .. groups[len(groups) - 1]``; the
    first group wins ties. Returns ``(-1, -1)`` when ``groups`` is empty.
    """
    max_comb_count = -1
    max_group_idx = -1

    for g in range(len(groups)):
        group_ships = groups[g]
        group_data = mdm[mdm['ships_idx'].isin(group_ships)]
        comb_count, _ = calculate_group_count(group_data)
        if comb_count > max_comb_count:
            max_comb_count = comb_count
            max_group_idx = g

    return max_group_idx, max_comb_count


# Function to find the group with the maximum total count
def find_group_with_max_total_count(groups, mdm):
    """Return ``(group_idx, total_count)`` of the group with the most rows.

    Groups are looked up as ``groups[0] .. groups[len(groups) - 1]``; the
    first group wins ties. Returns ``(-1, -1)`` when ``groups`` is empty.
    """
    max_total_count = -1
    max_group_idx = -1

    for g in range(len(groups)):
        group_ships = groups[g]
        group_data = mdm[mdm['ships_idx'].isin(group_ships)]
        _, total_count = calculate_group_count(group_data)
        if total_count > max_total_count:
            max_total_count = total_count
            max_group_idx = g

    return max_group_idx, max_total_count


import pandas as pd
from collections import defaultdict

# Load the CSV file
data_file_path = 'exports/preprocessed_data.csv'
data = pd.read_csv(data_file_path)

# Filter the data where MDM is True
mdm_true = data[data['MDM']].copy()  # use .copy() to create an explicit copy
mdm_all = data.copy()

# Create a new column combining 'thing' and 'property'
mdm_true.loc[:, 'thing_property'] = mdm_true['thing'] + '_' + mdm_true['property']
mdm_all.loc[:, 'thing_property'] = mdm_all['thing'] + '_' + mdm_all['property']

# Initial setup for groups
ship_count = calculate_ship_count(mdm_true)
num_groups = 5
groups = defaultdict(list)

# Sort ships by combination count in descending order
sorted_ships = ship_count.sort_values(by='comb_count', ascending=False)

# Seed each group with one of the `num_groups` ships that have the highest comb_count
for i in range(num_groups):
    groups[i].append(sorted_ships.iloc[i]['ships_idx'])

remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values

# Allocate remaining ships to the groups.
# remaining_ships is a NumPy array, so emptiness is tested with len() rather
# than truthiness.
while len(remaining_ships) > 0:
    # re-compute the counts for each group
    group_comb_counts = []
    for g in range(num_groups):
        group_ships = groups[g]
        group_data = mdm_true[mdm_true['ships_idx'].isin(group_ships)]
        comb_count, _ = calculate_group_count(group_data)
        group_comb_counts.append((g, comb_count))

    # Ascending order: the group with the fewest combinations is served first
    group_comb_counts.sort(key=lambda x: x[1])

    # reset the remaining_group list
    remaining_group = []

    # g is the identifier for the group, rank its position in the sorted order
    for rank, (g, _) in enumerate(group_comb_counts):
        if len(remaining_ships) == 0:
            break

        # compute for each group, the selected ship, and the combined count increase
        if rank == 0:
            # The smallest group takes the ship that adds the most new combinations
            selected_ship_idx, comb_increase = find_max_increase_ship(
                groups, g, remaining_ships, mdm_true)
        else:
            # Every other group tries to approach the current largest comb count
            max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true)
            selected_ship_idx, comb_increase = find_closest_comb_count_ship(
                groups, g, remaining_ships, mdm_true, max_comb_count)

        # if the combined increase is 0, then we process it in a special manner
        if comb_increase == 0:
            remaining_group.append(g)
        else:
            groups[g].append(selected_ship_idx)
            remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]

    # Groups that could not gain a new combination are balanced by total count instead
    for g in remaining_group:
        if len(remaining_ships) == 0:
            break
        max_group_idx, max_total_count = find_group_with_max_total_count(groups, mdm_true)
        selected_ship_idx, count_increase = find_closest_total_count_ship(
            groups, g, remaining_ships, mdm_true, max_total_count)
        if selected_ship_idx is not None:
            groups[g].append(selected_ship_idx)
            remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]

# Calculate comb_count for each group and store it in a list
group_comb_counts = []
for g in range(num_groups):
    group_ships = groups[g]
    group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]
    comb_count, total_count = calculate_group_count(group_data_true)

    # Calculate total count including MDM=False
    group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]
    _, total_count_all = calculate_group_count(group_data_all)

    group_comb_counts.append((g, comb_count, total_count_all))

# Sort the groups by comb_count in descending order
group_comb_counts.sort(key=lambda x: x[1], reverse=True)

# Reorder the groups dictionary based on the sorted order
sorted_groups = defaultdict(list)
for i, (g, _, _) in enumerate(group_comb_counts):
    sorted_groups[i] = groups[g]

# Final output of group allocation
print("Final Group Allocation:")
for g in range(num_groups):
    group_ships = sorted_groups[g]
    group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]
    comb_count, total_count = calculate_group_count(group_data_true)

    # Calculate total count including MDM=False
    group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]
    _, total_count_all = calculate_group_count(group_data_all)

    print(f"Group {g + 1}: Ships_idx = {group_ships}, PD type = {comb_count}, PD = {total_count}, SD = {total_count_all}")

# %%
import pandas as pd
from sklearn.model_selection import GroupKFold

# Prepare data for custom group allocation (BGKF)
comb_counts = []
total_counts = []
ship_counts = []
custom_results = []

for g in range(num_groups):
    group_ships = groups[g]
    group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]
    comb_count, total_count = calculate_group_count(group_data_true)

    # Calculate total count including MDM=False
    group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]
    _, total_count_all = calculate_group_count(group_data_all)

    custom_results.append({
        'Group': g + 1,
        'Allocation': 'BGKF',
        'Comb_count': comb_count,
        'Total_count': total_count,
        'Total_count_all': total_count_all,
        'Ship_count': len(group_ships),
        'Ships_idx': list(group_ships)
    })

# Sort the custom group allocation by comb_count in descending order
custom_results.sort(key=lambda x: x['Comb_count'], reverse=True)

# Adjust group numbers after sorting
for i, result in enumerate(custom_results):
    result['Group'] = i + 1

# Prepare data for GroupKFold allocation (GKF).
# Use the same number of folds as the custom allocation so the two are comparable.
gkf = GroupKFold(n_splits=num_groups)
gkf_results = []

for i, (train_idx, test_idx) in enumerate(gkf.split(mdm_true, groups=mdm_true['ships_idx'])):
    test_group = mdm_true.iloc[test_idx]
    comb_count, total_count = calculate_group_count(test_group)

    # Calculate total count including MDM=False
    test_group_ships = test_group['ships_idx'].unique()
    test_group_all = mdm_all[mdm_all['ships_idx'].isin(test_group_ships)]
    _, total_count_all = calculate_group_count(test_group_all)

    gkf_results.append({
        'Group': i + 1,
        'Allocation': 'GKF',
        'Comb_count': comb_count,
        'Total_count': total_count,
        'Total_count_all': total_count_all,
        'Ship_count': test_group['ships_idx'].nunique(),
        'Ships_idx': list(test_group_ships)
    })

# Sort the GKF allocation by comb_count in descending order
gkf_results.sort(key=lambda x: x['Comb_count'], reverse=True)

# Adjust group numbers after sorting
for i, result in enumerate(gkf_results):
    result['Group'] = i + 1

# Combine BGKF and GKF results into one DataFrame
combined_results = custom_results + gkf_results
combined_df = pd.DataFrame(combined_results)

# Output the combined results to a single CSV file
combined_df.to_csv('exports/combined_group_allocation.csv', index=False)

print("CSV file has been generated: 'combined_group_allocation.csv'")

# %%
import os
import pandas as pd
from sklearn.model_selection import KFold


def save_datasets_for_group(groups, mdm, data, output_dir='exports/dataset', n_splits=4):
    """Write leave-one-group-out train/valid/test CSV files for every group.

    For each group ``i`` a folder ``<output_dir>/group_<i + 1>`` is created and
    filled with:

    * ``train.csv``     - training split of the ``mdm`` rows of all other groups
    * ``valid.csv``     - validation split of those same rows
    * ``train_all.csv`` - ``train.csv`` and ``valid.csv`` rows concatenated
    * ``test_all.csv``  - every row of ``data`` whose ship belongs to group ``i``

    Args:
        groups: Mapping with keys ``0 .. len(groups) - 1``, each holding the
            list of ``ships_idx`` values of that group.
        mdm: DataFrame (with a ``ships_idx`` column) used for train/valid.
        data: DataFrame (with a ``ships_idx`` column) used for ``test_all``.
        output_dir: Directory under which the group folders are created.
        n_splits: Number of KFold splits; only the first split is used, so
            roughly ``1 / n_splits`` of the rows become the validation set.
    """
    for i in range(len(groups)):
        group_folder = os.path.join(output_dir, 'group' + '_' + str(i + 1))
        os.makedirs(group_folder, exist_ok=True)

        # Create the test dataset by including only group i
        test_group_ships = groups[i]

        # Extract corresponding entries from the external test dataset
        test_all_data = data[data['ships_idx'].isin(test_group_ships)]

        # Create the train dataset by excluding group i
        train_group_ships = []
        for g in range(len(groups)):
            if g != i:
                train_group_ships.extend(groups[g])
        train_data = mdm[mdm['ships_idx'].isin(train_group_ships)]

        # Use KFold to split train_data into train and valid datasets
        # (fixed random_state keeps the split reproducible; only the first fold is used)
        kf_inner = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        train_idx_inner, valid_idx_inner = next(kf_inner.split(train_data))

        final_train_data = train_data.iloc[train_idx_inner]
        valid_data = train_data.iloc[valid_idx_inner]

        # Combine train and valid data to create train_all
        train_all_data = pd.concat([final_train_data, valid_data])

        # Save datasets to CSV files
        # train.csv: mdm training set
        # valid.csv: mdm validation set
        # test_all.csv: all test set with non-mdm
        # train_all.csv: train and valid rows combined (rows come from `mdm` only)
        # NOTE: a separate mdm-only test.csv is currently not written.
        train_file_path = os.path.join(group_folder, 'train.csv')
        valid_file_path = os.path.join(group_folder, 'valid.csv')
        test_all_file_path = os.path.join(group_folder, 'test_all.csv')
        train_all_file_path = os.path.join(group_folder, 'train_all.csv')

        final_train_data.to_csv(train_file_path, index=False, encoding='utf-8-sig')
        valid_data.to_csv(valid_file_path, index=False, encoding='utf-8-sig')
        test_all_data.to_csv(test_all_file_path, index=False, encoding='utf-8-sig')
        train_all_data.to_csv(train_all_file_path, index=False, encoding='utf-8-sig')

        print(f"Group {i + 1} datasets saved in {group_folder}")


# Example usage:
save_datasets_for_group(groups, mdm_true, data, output_dir='exports/dataset', n_splits=4)