Chore: changed ipynb to py files in the data_preprocess folder
Doc: added descriptions and instructions for the data_preprocess folder
This commit is contained in:
parent 67f3712ea6
commit 4715999005
@@ -1,3 +1 @@
 db_connection_info.txt
-exports/*
-outputs/*
@@ -0,0 +1,3 @@
*
!.gitignore
@@ -0,0 +1,3 @@
*
!.gitignore
@@ -0,0 +1,2 @@
*.ipynb
@@ -0,0 +1,27 @@
# Data Preprocess

## What is this folder?

This folder contains the files for pre-processing.

We divide each processing method into its own folder to modularize the
pre-processing methods. This makes it easier to test different methods and
reduces coupling between stages.

## Instructions

First, apply the pre-processing by running the code from the desired folder.

Using the `no_preprocess` directory as an example:

- `cd no_preprocess`
- Follow the instructions found in the sub-directory
- After the code finishes, the processed file will be placed in
  `exports/preprocessed_data.csv`

We then run the data split code to create our k-fold splits:

- `cd` back to the `data_preprocess` directory
- `python split_data.py`

You will now have the datasets in `exports/dataset/group_{1,2,3,4,5}`.
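The two steps described in this README can also be scripted. The following is a minimal sketch (not part of this commit), assuming it is run from the `data_preprocess` folder and that `python` on the `PATH` is the project environment:

```python
import subprocess

# Step 1: apply a pre-processing method (here the no_preprocess variant).
subprocess.run(['python', 'copy_raw_data.py'], cwd='no_preprocess', check=True)

# Step 2: build the k-fold splits from exports/preprocessed_data.csv.
subprocess.run(['python', 'split_data.py'], check=True)
```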
@ -0,0 +1,2 @@
|
||||||
|
*
|
||||||
|
!.gitignore
|
|
@@ -0,0 +1,7 @@
## Purpose:

Disables pre-processing.

## Instructions:

- `python copy_raw_data.py`
@@ -1,8 +1,8 @@
 import shutil

-source_file = 'data_import/raw_data.csv'
+source_file = '../../data_import/exports/raw_data.csv'

-destination_file = 'data_preprocess/preprocessed_data.csv'
+destination_file = '../exports/preprocessed_data.csv'

 shutil.copy(source_file, destination_file)
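Both paths in `copy_raw_data.py` are relative to the `no_preprocess` working directory, so the copy only works when the script is launched from there. A hedged variant (not part of the commit) that anchors the paths to the script's own location instead:

```python
import shutil
from pathlib import Path

here = Path(__file__).resolve().parent  # .../data_preprocess/no_preprocess

# Same source and destination as above, but independent of the caller's cwd.
source_file = here / '..' / '..' / 'data_import' / 'exports' / 'raw_data.csv'
destination_file = here / '..' / 'exports' / 'preprocessed_data.csv'

shutil.copy(source_file, destination_file)
```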
@@ -0,0 +1 @@
*.ipynb
@ -1,133 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Changes made in ships_idx 1000: 251\n",
|
|
||||||
"Changes made in ships_idx 1001: 54\n",
|
|
||||||
"Changes made in ships_idx 1002: 46\n",
|
|
||||||
"Changes made in ships_idx 1003: 162\n",
|
|
||||||
"Changes made in ships_idx 1004: 8\n",
|
|
||||||
"Changes made in ships_idx 1005: 18\n",
|
|
||||||
"Changes made in ships_idx 1008: 22\n",
|
|
||||||
"Changes made in ships_idx 1009: 5\n",
|
|
||||||
"Changes made in ships_idx 1010: 131\n",
|
|
||||||
"Changes made in ships_idx 1011: 46\n",
|
|
||||||
"Changes made in ships_idx 1012: 2\n",
|
|
||||||
"Changes made in ships_idx 1013: 130\n",
|
|
||||||
"Changes made in ships_idx 1014: 46\n",
|
|
||||||
"Changes made in ships_idx 1015: 145\n",
|
|
||||||
"Changes made in ships_idx 1016: 191\n",
|
|
||||||
"Changes made in ships_idx 1017: 111\n",
|
|
||||||
"Changes made in ships_idx 1018: 680\n",
|
|
||||||
"Changes made in ships_idx 1019: 2\n",
|
|
||||||
"Changes made in ships_idx 1020: 10\n",
|
|
||||||
"Changes made in ships_idx 1021: 2\n",
|
|
||||||
"Changes made in ships_idx 1022: 7\n",
|
|
||||||
"Changes made in ships_idx 1023: 7\n",
|
|
||||||
"Changes made in ships_idx 1024: 136\n",
|
|
||||||
"Changes made in ships_idx 1025: 10\n",
|
|
||||||
"Changes made in ships_idx 1026: 6\n",
|
|
||||||
"Changes made in ships_idx 1027: 6\n",
|
|
||||||
"Changes made in ships_idx 1028: 6\n",
|
|
||||||
"Changes made in ships_idx 1029: 132\n",
|
|
||||||
"Changes made in ships_idx 1030: 86\n",
|
|
||||||
"Changes made in ships_idx 1031: 55\n",
|
|
||||||
"Changes made in ships_idx 1032: 225\n",
|
|
||||||
"Changes made in ships_idx 1033: 147\n",
|
|
||||||
"Changes made in ships_idx 1035: 132\n",
|
|
||||||
"Changes made in ships_idx 1036: 5\n",
|
|
||||||
"Changes made in ships_idx 1037: 3\n",
|
|
||||||
"Changes made in ships_idx 1038: 6\n",
|
|
||||||
"Changes made in ships_idx 1039: 232\n",
|
|
||||||
"Changes made in ships_idx 1042: 20\n",
|
|
||||||
"Changes made in ships_idx 1043: 154\n",
|
|
||||||
"Changes made in ships_idx 1044: 117\n",
|
|
||||||
"Changes made in ships_idx 1045: 243\n",
|
|
||||||
"Changes made in ships_idx 1046: 6\n",
|
|
||||||
"Changes made in ships_idx 1047: 12\n",
|
|
||||||
"Changes made in ships_idx 1048: 82\n",
|
|
||||||
"Changes made in ships_idx 1049: 912\n",
|
|
||||||
"Changes made in ships_idx 1050: 46\n",
|
|
||||||
"Changes made in ships_idx 1051: 57\n",
|
|
||||||
"Total number of changes made: 4912\n",
|
|
||||||
"Updated data saved to raw_data_add_tag.csv\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"\n",
|
|
||||||
"# Load the preprocessed data CSV file\n",
|
|
||||||
"file_path = '../../data_import/raw_data.csv' # Adjust this path to your actual file location\n",
|
|
||||||
"data = pd.read_csv(file_path, dtype=str)\n",
|
|
||||||
"\n",
|
|
||||||
"# Initialize a counter for the total number of changes\n",
|
|
||||||
"total_changes = 0\n",
|
|
||||||
"\n",
|
|
||||||
"# Initialize a dictionary to count changes per ships_idx\n",
|
|
||||||
"ships_idx_changes = {}\n",
|
|
||||||
"\n",
|
|
||||||
"# Process each group by ships_idx\n",
|
|
||||||
"for ships_idx, group in data.groupby('ships_idx'):\n",
|
|
||||||
" # Find duplicated tag_descriptions within the group\n",
|
|
||||||
" duplicated_descriptions = group['tag_description'].duplicated(keep=False)\n",
|
|
||||||
" \n",
|
|
||||||
" # Count how many tag_descriptions are duplicated within this ships_idx\n",
|
|
||||||
" num_changes = duplicated_descriptions.sum()\n",
|
|
||||||
"\n",
|
|
||||||
" # If there are any duplicates\n",
|
|
||||||
" if num_changes > 0:\n",
|
|
||||||
" # Increment the total changes count\n",
|
|
||||||
" total_changes += num_changes\n",
|
|
||||||
" \n",
|
|
||||||
" # Record the number of changes for this ships_idx\n",
|
|
||||||
" ships_idx_changes[ships_idx] = num_changes\n",
|
|
||||||
"\n",
|
|
||||||
" # Apply the concatenation of tag_name to tag_description for duplicates\n",
|
|
||||||
" data.loc[duplicated_descriptions & (data['ships_idx'] == ships_idx), 'tag_description'] = \\\n",
|
|
||||||
" data['tag_name'] + ' ' + data['tag_description']\n",
|
|
||||||
"\n",
|
|
||||||
"# Output the changes per ships_idx\n",
|
|
||||||
"for ships_idx, count in ships_idx_changes.items():\n",
|
|
||||||
" print(f\"Changes made in ships_idx {ships_idx}: {count}\")\n",
|
|
||||||
"\n",
|
|
||||||
"# Output the total number of changes\n",
|
|
||||||
"print(f\"Total number of changes made: {total_changes}\")\n",
|
|
||||||
"\n",
|
|
||||||
"# Optionally, save the updated DataFrame back to a CSV\n",
|
|
||||||
"output_file_path = 'raw_data_add_tag.csv'\n",
|
|
||||||
"data.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n",
|
|
||||||
"\n",
|
|
||||||
"print(f\"Updated data saved to {output_file_path}\")\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "torch",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.14"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
|
@ -1,100 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Updated data saved to raw_data_s.csv\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import re\n",
|
|
||||||
"\n",
|
|
||||||
"# Load the data_mapping CSV file\n",
|
|
||||||
"data_mapping_file_path = '../../data_import/raw_data.csv' # Adjust this path to your actual file location\n",
|
|
||||||
"data_mapping_file_path = 'raw_data_add_tag.csv' # Adjust this path to your actual file location\n",
|
|
||||||
"data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n",
|
|
||||||
"\n",
|
|
||||||
"# Backup the original tag_description\n",
|
|
||||||
"data_mapping['org_tag_description'] = data_mapping['tag_description']\n",
|
|
||||||
"\n",
|
|
||||||
"# Ensure all values in the 'tag_description' column are strings\n",
|
|
||||||
"data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)\n",
|
|
||||||
"data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[()]', ' ', regex=True)\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to find tokens containing numbers\n",
|
|
||||||
"def find_tokens_with_numbers(description):\n",
|
|
||||||
" tokens = description.split() # Tokenize by spaces\n",
|
|
||||||
" number_tokens = [token for token in tokens if re.search(r'\\d', token)]\n",
|
|
||||||
" return number_tokens\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to process tokens\n",
|
|
||||||
"def process_token(token):\n",
|
|
||||||
" # Step 1: Replace '_' or '-' adjacent to numbers with spaces\n",
|
|
||||||
" token = re.sub(r'(_|-)(?=\\d)', ' ', token)\n",
|
|
||||||
" token = re.sub(r'(?<=\\d)(_|-)', ' ', token)\n",
|
|
||||||
"\n",
|
|
||||||
" # Step 2: Insert spaces between letters and numbers where no separator exists\n",
|
|
||||||
" token = re.sub(r'([A-Za-z])(\\d+)', r'\\1 \\2', token)\n",
|
|
||||||
" token = re.sub(r'(\\d+)([A-Za-z])', r'\\1 \\2', token)\n",
|
|
||||||
"\n",
|
|
||||||
" # Step 3: Handle cases like \"NO.1\" or \"No.1\" to become \"No. 1\"\n",
|
|
||||||
" token = re.sub(r'([A-Za-z]+)\\.(\\d+)', r'\\1. \\2', token)\n",
|
|
||||||
"\n",
|
|
||||||
" # Clean multiple spaces and strip\n",
|
|
||||||
" token = re.sub(r'\\s+', ' ', token).strip()\n",
|
|
||||||
" return token\n",
|
|
||||||
"\n",
|
|
||||||
"# Apply the process to each row in the 'tag_description' column\n",
|
|
||||||
"for index, row in data_mapping.iterrows():\n",
|
|
||||||
" original_description = row['tag_description']\n",
|
|
||||||
" number_tokens = find_tokens_with_numbers(original_description)\n",
|
|
||||||
"\n",
|
|
||||||
" # Process each token containing numbers\n",
|
|
||||||
" processed_tokens = [process_token(token) for token in number_tokens]\n",
|
|
||||||
"\n",
|
|
||||||
" # Replace the original tokens with processed tokens in the tag_description\n",
|
|
||||||
" new_description = original_description\n",
|
|
||||||
" for original_token, processed_token in zip(number_tokens, processed_tokens):\n",
|
|
||||||
" new_description = new_description.replace(original_token, processed_token)\n",
|
|
||||||
"\n",
|
|
||||||
" # Update the data_mapping with the modified description\n",
|
|
||||||
" data_mapping.at[index, 'tag_description'] = new_description\n",
|
|
||||||
"\n",
|
|
||||||
"# Save the updated data_mapping to a new CSV file\n",
|
|
||||||
"output_file_path = 'raw_data_s.csv'\n",
|
|
||||||
"data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n",
|
|
||||||
"\n",
|
|
||||||
"print(f\"Updated data saved to {output_file_path}\")\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "torch",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.14"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
|
@ -1,123 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Updated data saved to ../preprocessed_data.csv\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import re\n",
|
|
||||||
"\n",
|
|
||||||
"# Load the data_mapping CSV file\n",
|
|
||||||
"data_mapping_file_path = 'raw_data_s.csv' # Adjust this path to your actual file location\n",
|
|
||||||
"data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n",
|
|
||||||
" \n",
|
|
||||||
" # Ensure all values in the 'tag_description' column are strings\n",
|
|
||||||
"data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)\n",
|
|
||||||
"data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[-]', ' ', regex=True)\n",
|
|
||||||
"\n",
|
|
||||||
"# Initial replacement mapping\n",
|
|
||||||
"initial_replacements = {\n",
|
|
||||||
" \"MGE\": \"G/E\",\n",
|
|
||||||
" \"GEN.\": \"G/E\",\n",
|
|
||||||
" \"GEN\": \"G/E\",\n",
|
|
||||||
" \"GE\": \"G/E\",\n",
|
|
||||||
" \"G_E\": \"G/E\",\n",
|
|
||||||
" \"ME\": \"M/E\",\n",
|
|
||||||
" \"M_E\": \"M/E\",\n",
|
|
||||||
" \"S_G\": \"S/G\",\n",
|
|
||||||
" \"T_C\": \"T/C\",\n",
|
|
||||||
" \"TC\": \"T/C\",\n",
|
|
||||||
" \"L_O\": \"L.O\",\n",
|
|
||||||
" \"LO\": \"L.O\",\n",
|
|
||||||
" \"F_O\": \"F.O\",\n",
|
|
||||||
" \"FO\": \"F.O\",\n",
|
|
||||||
" \"D_G\": \"D/G\",\n",
|
|
||||||
" \"DG\": \"D/G\",\n",
|
|
||||||
" \"PP\": \"P/P\"\n",
|
|
||||||
"}\n",
|
|
||||||
"\n",
|
|
||||||
"# Second replacement mapping\n",
|
|
||||||
"second_replacements = {\n",
|
|
||||||
" \"_G/E\": \" G/E\",\n",
|
|
||||||
" \"G/E_\": \"G/E \",\n",
|
|
||||||
" \"_M/E\": \" M/E\",\n",
|
|
||||||
" \"M/E_\": \"M/E \",\n",
|
|
||||||
" \"_S/G\": \" S/G\",\n",
|
|
||||||
" \"S/G_\": \"S/G \",\n",
|
|
||||||
" \"_T/C\": \" T/C\",\n",
|
|
||||||
" \"T/C_\": \"T/C \",\n",
|
|
||||||
" \"_L.O\": \" L.O\",\n",
|
|
||||||
" \"L.O_\": \"L.O \",\n",
|
|
||||||
" \"_F.O\": \" F.O\",\n",
|
|
||||||
" \"F.O_\": \"F.O \",\n",
|
|
||||||
" \"_D/G\": \" D/G\",\n",
|
|
||||||
" \"D/G_\": \"D/G \",\n",
|
|
||||||
" \"DG_\": \"DG \"\n",
|
|
||||||
"}\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to separate numbers from text in a token\n",
|
|
||||||
"def separate_numbers_from_text(description):\n",
|
|
||||||
" # This regex pattern finds occurrences where text is followed by numbers or vice versa\n",
|
|
||||||
" return re.sub(r'(\\d+)(\\D)', r'\\1 \\2', re.sub(r'(\\D)(\\d+)', r'\\1 \\2', description))\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to perform replacements using tokens\n",
|
|
||||||
"def replace_tokens(description, replacements):\n",
|
|
||||||
" tokens = description.split() # Tokenize by spaces\n",
|
|
||||||
" tokens = [replacements.get(token, token) for token in tokens] # Replace based on the dictionary\n",
|
|
||||||
" return ' '.join(tokens)\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to perform replacements for substrings\n",
|
|
||||||
"def replace_substrings(description, replacements):\n",
|
|
||||||
" for old, new in replacements.items():\n",
|
|
||||||
" description = description.replace(old, new)\n",
|
|
||||||
" return description\n",
|
|
||||||
"\n",
|
|
||||||
"# Separate numbers from text before applying replacements\n",
|
|
||||||
"data_mapping['tag_description'] = data_mapping['tag_description'].apply(separate_numbers_from_text)\n",
|
|
||||||
"\n",
|
|
||||||
"# Apply initial replacements\n",
|
|
||||||
"data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_tokens, replacements=initial_replacements)\n",
|
|
||||||
"\n",
|
|
||||||
"# Apply second replacements as substrings\n",
|
|
||||||
"data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_substrings, replacements=second_replacements)\n",
|
|
||||||
"\n",
|
|
||||||
"# Save the updated data_mapping to a new CSV file\n",
|
|
||||||
"output_file_path = '../preprocessed_data.csv'\n",
|
|
||||||
"data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n",
|
|
||||||
"\n",
|
|
||||||
"print(f\"Updated data saved to {output_file_path}\")\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "torch",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.14"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
|
@@ -0,0 +1,9 @@
## Purpose:

Here is a collection of pre-processing methods used in the GRS paper.

## Instructions:

- `python add_tag_name.py`
- `python separate_number.py`
- `python replacement.py`
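The three scripts form a small pipeline over intermediate CSV files. A minimal sketch (an assumption about how one might invoke them, mirroring the list above) that runs them in order from this folder:

```python
import subprocess

steps = [
    'add_tag_name.py',     # writes outputs/raw_data_add_tag.csv
    'separate_number.py',  # reads it, writes outputs/raw_data_s.csv
    'replacement.py',      # reads it, writes ../exports/preprocessed_data.csv
]
for script in steps:
    subprocess.run(['python', script], check=True)
```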
@@ -0,0 +1,63 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---

# %%
import pandas as pd

# Load the preprocessed data CSV file
file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location
data = pd.read_csv(file_path, dtype=str)

# Initialize a counter for the total number of changes
total_changes = 0

# Initialize a dictionary to count changes per ships_idx
ships_idx_changes = {}

# Process each group by ships_idx
for ships_idx, group in data.groupby('ships_idx'):
    # Find duplicated tag_descriptions within the group
    duplicated_descriptions = group['tag_description'].duplicated(keep=False)

    # Count how many tag_descriptions are duplicated within this ships_idx
    num_changes = duplicated_descriptions.sum()

    # If there are any duplicates
    if num_changes > 0:
        # Increment the total changes count
        total_changes += num_changes

        # Record the number of changes for this ships_idx
        ships_idx_changes[ships_idx] = num_changes

        # Apply the concatenation of tag_name to tag_description for duplicates
        data.loc[duplicated_descriptions & (data['ships_idx'] == ships_idx), 'tag_description'] = \
            data['tag_name'] + ' ' + data['tag_description']

# Output the changes per ships_idx
for ships_idx, count in ships_idx_changes.items():
    print(f"Changes made in ships_idx {ships_idx}: {count}")

# Output the total number of changes
print(f"Total number of changes made: {total_changes}")

# Optionally, save the updated DataFrame back to a CSV
output_file_path = 'outputs/raw_data_add_tag.csv'
data.to_csv(output_file_path, index=False, encoding='utf-8-sig')

print(f"Updated data saved to {output_file_path}")

# %%
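To illustrate the disambiguation idea used in the script above (prepend `tag_name` when a `tag_description` collides within the same ship), here is a minimal, self-contained sketch on a hypothetical three-row DataFrame:

```python
import pandas as pd

# Toy example; the values are made up and not taken from the dataset.
df = pd.DataFrame({
    'ships_idx':       [1000, 1000, 1000],
    'tag_name':        ['TAG_A', 'TAG_B', 'TAG_C'],
    'tag_description': ['TEMP', 'TEMP', 'PRESS'],
})

# Mark descriptions that collide within the same ship, then prefix them with tag_name.
dup = df.groupby('ships_idx')['tag_description'].transform(
    lambda s: s.duplicated(keep=False))
df.loc[dup, 'tag_description'] = df.loc[dup, 'tag_name'] + ' ' + df.loc[dup, 'tag_description']

print(df['tag_description'].tolist())  # ['TAG_A TEMP', 'TAG_B TEMP', 'PRESS']
```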
@@ -0,0 +1,2 @@
*
!.gitignore
@@ -0,0 +1,103 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---

# %%
import pandas as pd
import re

# Load the data_mapping CSV file
data_mapping_file_path = 'outputs/raw_data_s.csv' # Adjust this path to your actual file location
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

# Ensure all values in the 'tag_description' column are strings
data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)
data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[-]', ' ', regex=True)

# Initial replacement mapping
initial_replacements = {
    "MGE": "G/E",
    "GEN.": "G/E",
    "GEN": "G/E",
    "GE": "G/E",
    "G_E": "G/E",
    "ME": "M/E",
    "M_E": "M/E",
    "S_G": "S/G",
    "T_C": "T/C",
    "TC": "T/C",
    "L_O": "L.O",
    "LO": "L.O",
    "F_O": "F.O",
    "FO": "F.O",
    "D_G": "D/G",
    "DG": "D/G",
    "PP": "P/P"
}

# Second replacement mapping
second_replacements = {
    "_G/E": " G/E",
    "G/E_": "G/E ",
    "_M/E": " M/E",
    "M/E_": "M/E ",
    "_S/G": " S/G",
    "S/G_": "S/G ",
    "_T/C": " T/C",
    "T/C_": "T/C ",
    "_L.O": " L.O",
    "L.O_": "L.O ",
    "_F.O": " F.O",
    "F.O_": "F.O ",
    "_D/G": " D/G",
    "D/G_": "D/G ",
    "DG_": "DG "
}

# Function to separate numbers from text in a token
def separate_numbers_from_text(description):
    # This regex pattern finds occurrences where text is followed by numbers or vice versa
    return re.sub(r'(\d+)(\D)', r'\1 \2', re.sub(r'(\D)(\d+)', r'\1 \2', description))

# Function to perform replacements using tokens
def replace_tokens(description, replacements):
    tokens = description.split() # Tokenize by spaces
    tokens = [replacements.get(token, token) for token in tokens] # Replace based on the dictionary
    return ' '.join(tokens)

# Function to perform replacements for substrings
def replace_substrings(description, replacements):
    for old, new in replacements.items():
        description = description.replace(old, new)
    return description

# Separate numbers from text before applying replacements
data_mapping['tag_description'] = data_mapping['tag_description'].apply(separate_numbers_from_text)

# Apply initial replacements
data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_tokens, replacements=initial_replacements)

# Apply second replacements as substrings
data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_substrings, replacements=second_replacements)

# Save the updated data_mapping to a new CSV file
output_file_path = '../exports/preprocessed_data.csv'
data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')

print(f"Updated data saved to {output_file_path}")

# %%
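To show what the two replacement passes do, here is a small self-contained sketch; the helpers are minimal copies of the functions above, and only a subset of the mappings is used:

```python
def replace_tokens(description, replacements):
    # Whole-token replacement, as in replacement.py above.
    return ' '.join(replacements.get(tok, tok) for tok in description.split())

def replace_substrings(description, replacements):
    # Plain substring replacement for the underscore-attached forms.
    for old, new in replacements.items():
        description = description.replace(old, new)
    return description

initial = {"GE": "G/E", "LO": "L.O"}        # subset of initial_replacements
second = {"_G/E": " G/E", "G/E_": "G/E "}   # subset of second_replacements

s = "NO. 1 GE LO PUMP"
s = replace_tokens(s, initial)
s = replace_substrings(s, second)
print(s)  # NO. 1 G/E L.O PUMP
```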
@@ -0,0 +1,78 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---

# %%
import pandas as pd
import re

# Load the data_mapping CSV file
data_mapping_file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location
data_mapping_file_path = 'outputs/raw_data_add_tag.csv' # Adjust this path to your actual file location
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

# Backup the original tag_description
data_mapping['org_tag_description'] = data_mapping['tag_description']

# Ensure all values in the 'tag_description' column are strings
data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)
data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[()]', ' ', regex=True)

# Function to find tokens containing numbers
def find_tokens_with_numbers(description):
    tokens = description.split() # Tokenize by spaces
    number_tokens = [token for token in tokens if re.search(r'\d', token)]
    return number_tokens

# Function to process tokens
def process_token(token):
    # Step 1: Replace '_' or '-' adjacent to numbers with spaces
    token = re.sub(r'(_|-)(?=\d)', ' ', token)
    token = re.sub(r'(?<=\d)(_|-)', ' ', token)

    # Step 2: Insert spaces between letters and numbers where no separator exists
    token = re.sub(r'([A-Za-z])(\d+)', r'\1 \2', token)
    token = re.sub(r'(\d+)([A-Za-z])', r'\1 \2', token)

    # Step 3: Handle cases like "NO.1" or "No.1" to become "No. 1"
    token = re.sub(r'([A-Za-z]+)\.(\d+)', r'\1. \2', token)

    # Clean multiple spaces and strip
    token = re.sub(r'\s+', ' ', token).strip()
    return token

# Apply the process to each row in the 'tag_description' column
for index, row in data_mapping.iterrows():
    original_description = row['tag_description']
    number_tokens = find_tokens_with_numbers(original_description)

    # Process each token containing numbers
    processed_tokens = [process_token(token) for token in number_tokens]

    # Replace the original tokens with processed tokens in the tag_description
    new_description = original_description
    for original_token, processed_token in zip(number_tokens, processed_tokens):
        new_description = new_description.replace(original_token, processed_token)

    # Update the data_mapping with the modified description
    data_mapping.at[index, 'tag_description'] = new_description

# Save the updated data_mapping to a new CSV file
output_file_path = 'outputs/raw_data_s.csv'
data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')

print(f"Updated data saved to {output_file_path}")

# %%
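A minimal sketch of how `process_token` normalizes number-bearing tokens, using a local copy of the function above and a few made-up sample tokens:

```python
import re

# Minimal copy of process_token from separate_number.py above, for illustration.
def process_token(token):
    token = re.sub(r'(_|-)(?=\d)', ' ', token)
    token = re.sub(r'(?<=\d)(_|-)', ' ', token)
    token = re.sub(r'([A-Za-z])(\d+)', r'\1 \2', token)
    token = re.sub(r'(\d+)([A-Za-z])', r'\1 \2', token)
    token = re.sub(r'([A-Za-z]+)\.(\d+)', r'\1. \2', token)
    return re.sub(r'\s+', ' ', token).strip()

for tok in ["NO.1", "GE2", "PUMP-3", "2_STROKE"]:
    print(tok, "->", process_token(tok))
# NO.1 -> NO. 1
# GE2 -> GE 2
# PUMP-3 -> PUMP 3
# 2_STROKE -> 2 STROKE
```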
@ -1,441 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Final Group Allocation:\n",
|
|
||||||
"Group 1: Ships_idx = [1025, 1032, 1042, 1046, 1023, 1037, 1024, 1014, 1019, 1008], PD type = 529, PD = 1992, SD = 9855\n",
|
|
||||||
"Group 2: Ships_idx = [1003, 1028, 1018, 1020, 1033, 1050, 1030, 1051, 1004, 1036], PD type = 528, PD = 2113, SD = 13074\n",
|
|
||||||
"Group 3: Ships_idx = [1016, 1026, 1043, 1031, 1012, 1021, 1000, 1011, 1006, 1005, 1038], PD type = 521, PD = 2140, SD = 10722\n",
|
|
||||||
"Group 4: Ships_idx = [1047, 1049, 1010, 1027, 1013, 1022, 1048, 1017, 1045, 1007], PD type = 521, PD = 2102, SD = 15451\n",
|
|
||||||
"Group 5: Ships_idx = [1039, 1035, 1044, 1009, 1015, 1040, 1001, 1034, 1041, 1002, 1029], PD type = 500, PD = 2183, SD = 12969\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"from collections import defaultdict\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to calculate the number of unique combinations and total count for each ship\n",
|
|
||||||
"def calculate_ship_count(group):\n",
|
|
||||||
" ship_count = group.groupby('ships_idx')['thing_property'].agg(['nunique', 'size']).reset_index()\n",
|
|
||||||
" ship_count.columns = ['ships_idx', 'comb_count', 'total_count']\n",
|
|
||||||
" return ship_count\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to calculate the combination count and total count for a group\n",
|
|
||||||
"def calculate_group_count(group):\n",
|
|
||||||
" comb_count = group['thing_property'].nunique()\n",
|
|
||||||
" total_count = group['thing_property'].size\n",
|
|
||||||
" return comb_count, total_count\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to calculate the increase in combination count when a ship is added to a group\n",
|
|
||||||
"def calculate_comb_count_increase(groups, g, ship_idx, mdm):\n",
|
|
||||||
" temp_groups = defaultdict(list, {k: v.copy() for k, v in groups.items()})\n",
|
|
||||||
" temp_groups[g].append(ship_idx)\n",
|
|
||||||
" \n",
|
|
||||||
" group_ships = temp_groups[g]\n",
|
|
||||||
" group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n",
|
|
||||||
" \n",
|
|
||||||
" new_comb_count, _ = calculate_group_count(group_data)\n",
|
|
||||||
" \n",
|
|
||||||
" current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n",
|
|
||||||
" current_comb_count, _ = calculate_group_count(current_group_data)\n",
|
|
||||||
" \n",
|
|
||||||
" increase = new_comb_count - current_comb_count\n",
|
|
||||||
" \n",
|
|
||||||
" return increase\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to calculate the increase in total count when a ship is added to a group\n",
|
|
||||||
"def calculate_total_count_increase(groups, g, ship_idx, mdm):\n",
|
|
||||||
" temp_groups = defaultdict(list, {k: v.copy() for k, v in groups.items()})\n",
|
|
||||||
" temp_groups[g].append(ship_idx)\n",
|
|
||||||
" \n",
|
|
||||||
" group_ships = temp_groups[g]\n",
|
|
||||||
" group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n",
|
|
||||||
" \n",
|
|
||||||
" _, new_total_count = calculate_group_count(group_data)\n",
|
|
||||||
" \n",
|
|
||||||
" current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n",
|
|
||||||
" _, current_total_count = calculate_group_count(current_group_data)\n",
|
|
||||||
" \n",
|
|
||||||
" increase = new_total_count - current_total_count\n",
|
|
||||||
" \n",
|
|
||||||
" return increase\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to find the ship that will bring the total count closest to the target\n",
|
|
||||||
"def find_closest_total_count_ship(groups, g, remaining_ships, mdm, target_total_count):\n",
|
|
||||||
" total_count_differences = []\n",
|
|
||||||
"\n",
|
|
||||||
" current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n",
|
|
||||||
" _, current_total_count = calculate_group_count(current_group_data)\n",
|
|
||||||
"\n",
|
|
||||||
" for ship_idx in remaining_ships:\n",
|
|
||||||
" increase = calculate_total_count_increase(groups, g, ship_idx, mdm)\n",
|
|
||||||
" new_total_count = current_total_count + increase\n",
|
|
||||||
" difference = abs(target_total_count - new_total_count)\n",
|
|
||||||
" total_count_differences.append((ship_idx, difference, increase))\n",
|
|
||||||
"\n",
|
|
||||||
" if not total_count_differences:\n",
|
|
||||||
" return None, 0\n",
|
|
||||||
" \n",
|
|
||||||
" closest_ship = min(total_count_differences, key=lambda x: x[1])\n",
|
|
||||||
" selected_ship_idx, _, selected_increase = closest_ship\n",
|
|
||||||
"\n",
|
|
||||||
" return selected_ship_idx, selected_increase\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to find the ship that gives the maximum increase in combination count\n",
|
|
||||||
"def find_max_increase_ship(groups, g, remaining_ships, mdm):\n",
|
|
||||||
" comb_count_increase = []\n",
|
|
||||||
"\n",
|
|
||||||
" for ship_idx in remaining_ships:\n",
|
|
||||||
" increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)\n",
|
|
||||||
" comb_count_increase.append((ship_idx, increase))\n",
|
|
||||||
"\n",
|
|
||||||
" max_increase_ship = max(comb_count_increase, key=lambda x: x[1])\n",
|
|
||||||
" selected_ship_idx, max_increase = max_increase_ship\n",
|
|
||||||
" \n",
|
|
||||||
" return selected_ship_idx, max_increase\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to find the ship that will bring the combination count closest to the target\n",
|
|
||||||
"def find_closest_comb_count_ship(groups, g, remaining_ships, mdm, target_comb_count):\n",
|
|
||||||
" comb_count_differences = []\n",
|
|
||||||
"\n",
|
|
||||||
" current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n",
|
|
||||||
" current_comb_count, _ = calculate_group_count(current_group_data)\n",
|
|
||||||
"\n",
|
|
||||||
" for ship_idx in remaining_ships:\n",
|
|
||||||
" increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)\n",
|
|
||||||
" new_comb_count = current_comb_count + increase\n",
|
|
||||||
" difference = abs(target_comb_count - new_comb_count)\n",
|
|
||||||
" comb_count_differences.append((ship_idx, difference, increase))\n",
|
|
||||||
"\n",
|
|
||||||
" if not comb_count_differences:\n",
|
|
||||||
" return None, 0\n",
|
|
||||||
"\n",
|
|
||||||
" closest_ship = min(comb_count_differences, key=lambda x: x[1])\n",
|
|
||||||
" selected_ship_idx, _, selected_increase = closest_ship\n",
|
|
||||||
"\n",
|
|
||||||
" return selected_ship_idx, selected_increase\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to find the group with the maximum combination count\n",
|
|
||||||
"def find_group_with_max_comb_count(groups, mdm):\n",
|
|
||||||
" max_comb_count = -1\n",
|
|
||||||
" max_group_idx = -1\n",
|
|
||||||
"\n",
|
|
||||||
" for g in range(len(groups)):\n",
|
|
||||||
" group_ships = groups[g]\n",
|
|
||||||
" group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n",
|
|
||||||
" comb_count, _ = calculate_group_count(group_data)\n",
|
|
||||||
" \n",
|
|
||||||
" if comb_count > max_comb_count:\n",
|
|
||||||
" max_comb_count = comb_count\n",
|
|
||||||
" max_group_idx = g\n",
|
|
||||||
"\n",
|
|
||||||
" return max_group_idx, max_comb_count\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to find the group with the maximum total count\n",
|
|
||||||
"def find_group_with_max_total_count(groups, mdm):\n",
|
|
||||||
" max_total_count = -1\n",
|
|
||||||
" max_group_idx = -1\n",
|
|
||||||
"\n",
|
|
||||||
" for g in range(len(groups)):\n",
|
|
||||||
" group_ships = groups[g]\n",
|
|
||||||
" group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n",
|
|
||||||
" _, total_count = calculate_group_count(group_data)\n",
|
|
||||||
" \n",
|
|
||||||
" if total_count > max_total_count:\n",
|
|
||||||
" max_total_count = total_count\n",
|
|
||||||
" max_group_idx = g\n",
|
|
||||||
"\n",
|
|
||||||
" return max_group_idx, max_total_count\n",
|
|
||||||
"\n",
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"from collections import defaultdict\n",
|
|
||||||
"\n",
|
|
||||||
"# Load the CSV file\n",
|
|
||||||
"data_file_path = 'preprocessed_data.csv'\n",
|
|
||||||
"data = pd.read_csv(data_file_path)\n",
|
|
||||||
"\n",
|
|
||||||
"# Filter the data where MDM is True\n",
|
|
||||||
"mdm_true = data[data['MDM'] == True].copy() # .copy()를 사용하여 명시적으로 복사본 생성\n",
|
|
||||||
"mdm_all = data.copy()\n",
|
|
||||||
"\n",
|
|
||||||
"# Create a new column combining 'thing' and 'property'\n",
|
|
||||||
"mdm_true.loc[:, 'thing_property'] = mdm_true['thing'] + '_' + mdm_true['property']\n",
|
|
||||||
"mdm_all.loc[:, 'thing_property'] = mdm_all['thing'] + '_' + mdm_all['property']\n",
|
|
||||||
"\n",
|
|
||||||
"# Initial setup for groups\n",
|
|
||||||
"ship_count = calculate_ship_count(mdm_true)\n",
|
|
||||||
"num_groups = 5\n",
|
|
||||||
"groups = defaultdict(list)\n",
|
|
||||||
"\n",
|
|
||||||
"# Sort ships by combination count in descending order\n",
|
|
||||||
"sorted_ships = ship_count.sort_values(by='comb_count', ascending=False)\n",
|
|
||||||
"\n",
|
|
||||||
"# Assign the first 5 ships to the groups\n",
|
|
||||||
"for i in range(num_groups):\n",
|
|
||||||
" groups[i].append(sorted_ships.iloc[i]['ships_idx'])\n",
|
|
||||||
"\n",
|
|
||||||
"remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values\n",
|
|
||||||
"\n",
|
|
||||||
"# Allocate remaining ships to the groups\n",
|
|
||||||
"while len(remaining_ships) > 0:\n",
|
|
||||||
" group_comb_counts = []\n",
|
|
||||||
" for g in range(num_groups):\n",
|
|
||||||
" group_ships = groups[g]\n",
|
|
||||||
" group_data = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n",
|
|
||||||
" comb_count, _ = calculate_group_count(group_data)\n",
|
|
||||||
" group_comb_counts.append((g, comb_count))\n",
|
|
||||||
"\n",
|
|
||||||
" group_comb_counts.sort(key=lambda x: x[1])\n",
|
|
||||||
" \n",
|
|
||||||
" remaining_group = []\n",
|
|
||||||
" for g, _ in group_comb_counts:\n",
|
|
||||||
" if len(remaining_ships) == 0:\n",
|
|
||||||
" break\n",
|
|
||||||
" \n",
|
|
||||||
" if group_comb_counts.index((g, _)) == 0:\n",
|
|
||||||
" selected_ship_idx, comb_increase = find_max_increase_ship(groups, g, remaining_ships, mdm_true)\n",
|
|
||||||
" \n",
|
|
||||||
" else:\n",
|
|
||||||
" max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true)\n",
|
|
||||||
" selected_ship_idx, comb_increase = find_closest_comb_count_ship(groups, g, remaining_ships, mdm_true, max_comb_count)\n",
|
|
||||||
"\n",
|
|
||||||
" if comb_increase == 0:\n",
|
|
||||||
" remaining_group.append(g)\n",
|
|
||||||
" else:\n",
|
|
||||||
" groups[g].append(selected_ship_idx)\n",
|
|
||||||
" remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]\n",
|
|
||||||
"\n",
|
|
||||||
" for g in remaining_group:\n",
|
|
||||||
" if len(remaining_ships) == 0:\n",
|
|
||||||
" break\n",
|
|
||||||
" max_group_idx, max_total_count = find_group_with_max_total_count(groups, mdm_true)\n",
|
|
||||||
" selected_ship_idx, count_increase = find_closest_total_count_ship(groups, g, remaining_ships, mdm_true, max_total_count)\n",
|
|
||||||
" if selected_ship_idx is not None:\n",
|
|
||||||
" groups[g].append(selected_ship_idx)\n",
|
|
||||||
" remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]\n",
|
|
||||||
"\n",
|
|
||||||
"# Calculate comb_count for each group and store it in a list\n",
|
|
||||||
"group_comb_counts = []\n",
|
|
||||||
"for g in range(num_groups):\n",
|
|
||||||
" group_ships = groups[g]\n",
|
|
||||||
" group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n",
|
|
||||||
" comb_count, total_count = calculate_group_count(group_data_true)\n",
|
|
||||||
"\n",
|
|
||||||
" # Calculate total count including MDM=False\n",
|
|
||||||
" group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]\n",
|
|
||||||
" _, total_count_all = calculate_group_count(group_data_all)\n",
|
|
||||||
" \n",
|
|
||||||
" group_comb_counts.append((g, comb_count, total_count_all))\n",
|
|
||||||
"\n",
|
|
||||||
"# Sort the groups by comb_count in descending order\n",
|
|
||||||
"group_comb_counts.sort(key=lambda x: x[1], reverse=True)\n",
|
|
||||||
"\n",
|
|
||||||
"# Reorder the groups dictionary based on the sorted order\n",
|
|
||||||
"sorted_groups = defaultdict(list)\n",
|
|
||||||
"for i, (g, _, _) in enumerate(group_comb_counts):\n",
|
|
||||||
" sorted_groups[i] = groups[g]\n",
|
|
||||||
"\n",
|
|
||||||
"# Final output of group allocation\n",
|
|
||||||
"print(\"Final Group Allocation:\")\n",
|
|
||||||
"for g in range(num_groups):\n",
|
|
||||||
" group_ships = sorted_groups[g]\n",
|
|
||||||
" group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n",
|
|
||||||
" comb_count, total_count = calculate_group_count(group_data_true)\n",
|
|
||||||
"\n",
|
|
||||||
" # Calculate total count including MDM=False\n",
|
|
||||||
" group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]\n",
|
|
||||||
" _, total_count_all = calculate_group_count(group_data_all)\n",
|
|
||||||
"\n",
|
|
||||||
" print(f\"Group {g + 1}: Ships_idx = {group_ships}, PD type = {comb_count}, PD = {total_count}, SD = {total_count_all}\")\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"CSV file has been generated: 'combined_group_allocation.csv'\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"from sklearn.model_selection import GroupKFold\n",
|
|
||||||
"\n",
|
|
||||||
"# Prepare data for custom group allocation (BGKF)\n",
|
|
||||||
"comb_counts = []\n",
|
|
||||||
"total_counts = []\n",
|
|
||||||
"ship_counts = []\n",
|
|
||||||
"custom_results = []\n",
|
|
||||||
"\n",
|
|
||||||
"for g in range(num_groups):\n",
|
|
||||||
" group_ships = groups[g]\n",
|
|
||||||
" group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n",
|
|
||||||
" comb_count, total_count = calculate_group_count(group_data_true)\n",
|
|
||||||
" \n",
|
|
||||||
" # Calculate total count including MDM=False\n",
|
|
||||||
" group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]\n",
|
|
||||||
" _, total_count_all = calculate_group_count(group_data_all)\n",
|
|
||||||
" \n",
|
|
||||||
" custom_results.append({\n",
|
|
||||||
" 'Group': g + 1,\n",
|
|
||||||
" 'Allocation': 'BGKF',\n",
|
|
||||||
" 'Comb_count': comb_count,\n",
|
|
||||||
" 'Total_count': total_count,\n",
|
|
||||||
" 'Total_count_all': total_count_all,\n",
|
|
||||||
" 'Ship_count': len(group_ships),\n",
|
|
||||||
" 'Ships_idx': list(group_ships)\n",
|
|
||||||
" })\n",
|
|
||||||
"\n",
|
|
||||||
"# Sort the custom group allocation by comb_count in descending order\n",
|
|
||||||
"custom_results.sort(key=lambda x: x['Comb_count'], reverse=True)\n",
|
|
||||||
"\n",
|
|
||||||
"# Adjust group numbers after sorting\n",
|
|
||||||
"for i, result in enumerate(custom_results):\n",
|
|
||||||
" result['Group'] = i + 1\n",
|
|
||||||
"\n",
|
|
||||||
"# Prepare data for GroupKFold allocation (GKF)\n",
|
|
||||||
"gkf = GroupKFold(n_splits=5)\n",
|
|
||||||
"gkf_results = []\n",
|
|
||||||
"\n",
|
|
||||||
"for i, (train_idx, test_idx) in enumerate(gkf.split(mdm_true, groups=mdm_true['ships_idx'])):\n",
|
|
||||||
" test_group = mdm_true.iloc[test_idx]\n",
|
|
||||||
" comb_count, total_count = calculate_group_count(test_group)\n",
|
|
||||||
" \n",
|
|
||||||
" # Calculate total count including MDM=False\n",
|
|
||||||
" test_group_ships = test_group['ships_idx'].unique()\n",
|
|
||||||
" test_group_all = mdm_all[mdm_all['ships_idx'].isin(test_group_ships)]\n",
|
|
||||||
" _, total_count_all = calculate_group_count(test_group_all)\n",
|
|
||||||
" \n",
|
|
||||||
" gkf_results.append({\n",
|
|
||||||
" 'Group': i + 1,\n",
|
|
||||||
" 'Allocation': 'GKF',\n",
|
|
||||||
" 'Comb_count': comb_count,\n",
|
|
||||||
" 'Total_count': total_count,\n",
|
|
||||||
" 'Total_count_all': total_count_all,\n",
|
|
||||||
" 'Ship_count': test_group['ships_idx'].nunique(),\n",
|
|
||||||
" 'Ships_idx': list(test_group['ships_idx'].unique())\n",
|
|
||||||
" })\n",
|
|
||||||
"\n",
|
|
||||||
"# Sort the GKF allocation by comb_count in descending order\n",
|
|
||||||
"gkf_results.sort(key=lambda x: x['Comb_count'], reverse=True)\n",
|
|
||||||
"\n",
|
|
||||||
"# Adjust group numbers after sorting\n",
|
|
||||||
"for i, result in enumerate(gkf_results):\n",
|
|
||||||
" result['Group'] = i + 1\n",
|
|
||||||
"\n",
|
|
||||||
"# Combine BGKF and GKF results into one DataFrame\n",
|
|
||||||
"combined_results = custom_results + gkf_results\n",
|
|
||||||
"combined_df = pd.DataFrame(combined_results)\n",
|
|
||||||
"\n",
|
|
||||||
"# Output the combined results to a single CSV file\n",
|
|
||||||
"combined_df.to_csv('combined_group_allocation.csv', index=False)\n",
|
|
||||||
"\n",
|
|
||||||
"print(\"CSV file has been generated: 'combined_group_allocation.csv'\")\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Group 1 datasets saved in dataset/1\n",
|
|
||||||
"Group 2 datasets saved in dataset/2\n",
|
|
||||||
"Group 3 datasets saved in dataset/3\n",
|
|
||||||
"Group 4 datasets saved in dataset/4\n",
|
|
||||||
"Group 5 datasets saved in dataset/5\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"from sklearn.model_selection import KFold\n",
|
|
||||||
"\n",
|
|
||||||
"def save_datasets_for_group(groups, mdm, data, output_dir='dataset', n_splits=4):\n",
|
|
||||||
" for i in range(len(groups)):\n",
|
|
||||||
" group_folder = os.path.join(output_dir, str(i + 1))\n",
|
|
||||||
" os.makedirs(group_folder, exist_ok=True)\n",
|
|
||||||
" \n",
|
|
||||||
" # Create the test dataset by including only group i\n",
|
|
||||||
" test_group_ships = groups[i]\n",
|
|
||||||
" test_data = mdm[mdm['ships_idx'].isin(test_group_ships)]\n",
|
|
||||||
" \n",
|
|
||||||
" # Extract corresponding entries from the external test dataset\n",
|
|
||||||
" test_all_data = data[data['ships_idx'].isin(test_group_ships)]\n",
|
|
||||||
" \n",
|
|
||||||
" # Create the train dataset by excluding group i\n",
|
|
||||||
" train_group_ships = []\n",
|
|
||||||
" for g in range(len(groups)):\n",
|
|
||||||
" if g != i:\n",
|
|
||||||
" train_group_ships.extend(groups[g])\n",
|
|
||||||
" train_data = mdm[mdm['ships_idx'].isin(train_group_ships)]\n",
|
|
||||||
" \n",
|
|
||||||
" # Use KFold to split train_data into train and valid datasets\n",
|
|
||||||
" kf_inner = KFold(n_splits=n_splits, shuffle=True, random_state=42)\n",
|
|
||||||
" train_idx_inner, valid_idx_inner = next(kf_inner.split(train_data))\n",
|
|
||||||
" \n",
|
|
||||||
" final_train_data = train_data.iloc[train_idx_inner]\n",
|
|
||||||
" valid_data = train_data.iloc[valid_idx_inner]\n",
|
|
||||||
" \n",
|
|
||||||
" # Combine train and valid data to create train_all\n",
|
|
||||||
" train_all_data = pd.concat([final_train_data, valid_data])\n",
|
|
||||||
" \n",
|
|
||||||
" # Save datasets to CSV files\n",
|
|
||||||
" train_file_path = os.path.join(group_folder, 'train.csv')\n",
|
|
||||||
" valid_file_path = os.path.join(group_folder, 'valid.csv')\n",
|
|
||||||
" test_file_path = os.path.join(group_folder, 'test.csv')\n",
|
|
||||||
" test_all_file_path = os.path.join(group_folder, 'test_all.csv')\n",
|
|
||||||
" train_all_file_path = os.path.join(group_folder, 'train_all.csv')\n",
|
|
||||||
" \n",
|
|
||||||
" final_train_data.to_csv(train_file_path, index=False, encoding='utf-8-sig')\n",
|
|
||||||
" valid_data.to_csv(valid_file_path, index=False, encoding='utf-8-sig')\n",
|
|
||||||
" # test_data.to_csv(test_file_path, index=False, encoding='utf-8-sig')\n",
|
|
||||||
" test_all_data.to_csv(test_file_path, index=False, encoding='utf-8-sig')\n",
|
|
||||||
" train_all_data.to_csv(train_all_file_path, index=False, encoding='utf-8-sig')\n",
|
|
||||||
" \n",
|
|
||||||
" print(f\"Group {i + 1} datasets saved in {group_folder}\")\n",
|
|
||||||
"\n",
|
|
||||||
"# Example usage:\n",
|
|
||||||
"save_datasets_for_group(groups, mdm_true, data, n_splits=4)\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "torch",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.14"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
|
@ -0,0 +1,382 @@
|
||||||
|
# ---
|
||||||
|
# jupyter:
|
||||||
|
# jupytext:
|
||||||
|
# formats: ipynb,py:percent
|
||||||
|
# text_representation:
|
||||||
|
# extension: .py
|
||||||
|
# format_name: percent
|
||||||
|
# format_version: '1.3'
|
||||||
|
# jupytext_version: 1.16.4
|
||||||
|
# kernelspec:
|
||||||
|
# display_name: torch
|
||||||
|
# language: python
|
||||||
|
# name: python3
|
||||||
|
# ---
|
||||||
|
|
||||||
|
# %%
|
||||||
|
import pandas as pd
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Function to calculate the number of unique combinations and total count for each ship
|
||||||
|
def calculate_ship_count(group):
|
||||||
|
ship_count = group.groupby('ships_idx')['thing_property'].agg(['nunique', 'size']).reset_index()
|
||||||
|
ship_count.columns = ['ships_idx', 'comb_count', 'total_count']
|
||||||
|
return ship_count
|
||||||
|
|
||||||
|
# Function to calculate the combination count and total count for a group
|
||||||
|
def calculate_group_count(group):
|
||||||
|
comb_count = group['thing_property'].nunique()
|
||||||
|
total_count = group['thing_property'].size
|
||||||
|
return comb_count, total_count
|
||||||
|
|
||||||
|
# Function to calculate the increase in combination count when a ship is added to a group
|
||||||
|
def calculate_comb_count_increase(groups, g, ship_idx, mdm):
|
||||||
|
temp_groups = defaultdict(list, {k: v.copy() for k, v in groups.items()})
|
||||||
|
temp_groups[g].append(ship_idx)
|
||||||
|
|
||||||
|
group_ships = temp_groups[g]
|
||||||
|
group_data = mdm[mdm['ships_idx'].isin(group_ships)]
|
||||||
|
|
||||||
|
new_comb_count, _ = calculate_group_count(group_data)
|
||||||
|
|
||||||
|
current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]
|
||||||
|
current_comb_count, _ = calculate_group_count(current_group_data)
|
||||||
|
|
||||||
|
increase = new_comb_count - current_comb_count
|
||||||
|
|
||||||
|
return increase
|
||||||
|
|
||||||
|
# Function to calculate the increase in total count when a ship is added to a group
|
||||||
|
def calculate_total_count_increase(groups, g, ship_idx, mdm):
|
||||||
|
temp_groups = defaultdict(list, {k: v.copy() for k, v in groups.items()})
|
||||||
|
temp_groups[g].append(ship_idx)
|
||||||
|
|
||||||
|
group_ships = temp_groups[g]
|
||||||
|
group_data = mdm[mdm['ships_idx'].isin(group_ships)]
|
||||||
|
|
||||||
|
_, new_total_count = calculate_group_count(group_data)
|
||||||
|
|
||||||
|
current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]
|
||||||
|
_, current_total_count = calculate_group_count(current_group_data)
|
||||||
|
|
||||||
|
increase = new_total_count - current_total_count
|
||||||
|
|
||||||
|
return increase
|
||||||
|
|
||||||
|
# Function to find the ship that will bring the total count closest to the target
|
||||||
|
def find_closest_total_count_ship(groups, g, remaining_ships, mdm, target_total_count):
|
||||||
|
total_count_differences = []
|
||||||
|
|
||||||
|
current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]
|
||||||
|
_, current_total_count = calculate_group_count(current_group_data)
|
||||||
|
|
||||||
|
for ship_idx in remaining_ships:
|
||||||
|
increase = calculate_total_count_increase(groups, g, ship_idx, mdm)
|
||||||
|
new_total_count = current_total_count + increase
|
||||||
|
difference = abs(target_total_count - new_total_count)
|
||||||
|
total_count_differences.append((ship_idx, difference, increase))
|
||||||
|
|
||||||
|
if not total_count_differences:
|
||||||
|
return None, 0
|
||||||
|
|
||||||
|
closest_ship = min(total_count_differences, key=lambda x: x[1])
|
||||||
|
selected_ship_idx, _, selected_increase = closest_ship
|
||||||
|
|
||||||
|
return selected_ship_idx, selected_increase
|
||||||
|
|
||||||
|
# Function to find the ship that gives the maximum increase in combination count
|
||||||
|
def find_max_increase_ship(groups, g, remaining_ships, mdm):
|
||||||
|
comb_count_increase = []
|
||||||
|
|
||||||
|
for ship_idx in remaining_ships:
|
||||||
|
increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)
|
||||||
|
comb_count_increase.append((ship_idx, increase))
|
||||||
|
|
||||||
|
max_increase_ship = max(comb_count_increase, key=lambda x: x[1])
|
||||||
|
selected_ship_idx, max_increase = max_increase_ship
|
||||||
|
|
||||||
|
return selected_ship_idx, max_increase
|
||||||
|
|
||||||
|
# Function to find the ship that will bring the combination count closest to the target
|
||||||
|
def find_closest_comb_count_ship(groups, g, remaining_ships, mdm, target_comb_count):
|
||||||
|
comb_count_differences = []
|
||||||
|
|
||||||
|
current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]
|
||||||
|
current_comb_count, _ = calculate_group_count(current_group_data)
|
||||||
|
|
||||||
|
for ship_idx in remaining_ships:
|
||||||
|
increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)
|
||||||
|
new_comb_count = current_comb_count + increase
|
||||||
|
difference = abs(target_comb_count - new_comb_count)
|
||||||
|
comb_count_differences.append((ship_idx, difference, increase))
|
||||||
|
|
||||||
|
if not comb_count_differences:
|
||||||
|
return None, 0
|
||||||
|
|
||||||
|
closest_ship = min(comb_count_differences, key=lambda x: x[1])
|
||||||
|
selected_ship_idx, _, selected_increase = closest_ship
|
||||||
|
|
||||||
|
return selected_ship_idx, selected_increase
|
||||||
|
|
||||||
|
# Function to find the group with the maximum combination count
|
||||||
|
def find_group_with_max_comb_count(groups, mdm):
|
||||||
|
max_comb_count = -1
|
||||||
|
max_group_idx = -1
|
||||||
|
|
||||||
|
for g in range(len(groups)):
|
||||||
|
group_ships = groups[g]
|
||||||
|
group_data = mdm[mdm['ships_idx'].isin(group_ships)]
|
||||||
|
comb_count, _ = calculate_group_count(group_data)
|
||||||
|
|
||||||
|
if comb_count > max_comb_count:
|
||||||
|
max_comb_count = comb_count
|
||||||
|
max_group_idx = g
|
||||||
|
|
||||||
|
return max_group_idx, max_comb_count
|
||||||
|
|
||||||
|
# Function to find the group with the maximum total count
|
||||||
|
def find_group_with_max_total_count(groups, mdm):
|
||||||
|
max_total_count = -1
|
||||||
|
max_group_idx = -1
|
||||||
|
|
||||||
|
for g in range(len(groups)):
|
||||||
|
group_ships = groups[g]
|
||||||
|
group_data = mdm[mdm['ships_idx'].isin(group_ships)]
|
||||||
|
_, total_count = calculate_group_count(group_data)
|
||||||
|
|
||||||
|
if total_count > max_total_count:
|
||||||
|
max_total_count = total_count
|
||||||
|
max_group_idx = g
|
||||||
|
|
||||||
|
return max_group_idx, max_total_count
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Load the CSV file
|
||||||
|
data_file_path = 'exports/preprocessed_data.csv'
|
||||||
|
data = pd.read_csv(data_file_path)
|
||||||
|
|
||||||
|
# Filter the data where MDM is True
|
||||||
|
mdm_true = data[data['MDM'] == True].copy() # .copy()를 사용하여 명시적으로 복사본 생성
|
||||||
|
mdm_all = data.copy()
|
||||||
|
|
||||||
|
# Create a new column combining 'thing' and 'property'
|
||||||
|
mdm_true.loc[:, 'thing_property'] = mdm_true['thing'] + '_' + mdm_true['property']
|
||||||
|
mdm_all.loc[:, 'thing_property'] = mdm_all['thing'] + '_' + mdm_all['property']
|
||||||
|
|
||||||
|
# Initial setup for groups
ship_count = calculate_ship_count(mdm_true)
num_groups = 5
groups = defaultdict(list)

# Sort ships by combination count in descending order
sorted_ships = ship_count.sort_values(by='comb_count', ascending=False)

# Assign the first 5 ships to the groups (one seed ship per group)
for i in range(num_groups):
    groups[i].append(sorted_ships.iloc[i]['ships_idx'])

remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values

# Allocate remaining ships to the groups
while len(remaining_ships) > 0:
    # Current combination count of every group, sorted ascending
    group_comb_counts = []
    for g in range(num_groups):
        group_ships = groups[g]
        group_data = mdm_true[mdm_true['ships_idx'].isin(group_ships)]
        comb_count, _ = calculate_group_count(group_data)
        group_comb_counts.append((g, comb_count))

    group_comb_counts.sort(key=lambda x: x[1])

    remaining_group = []
    for rank, (g, _) in enumerate(group_comb_counts):
        if len(remaining_ships) == 0:
            break

        if rank == 0:
            # The group with the fewest combinations takes the ship adding the most new ones
            selected_ship_idx, comb_increase = find_max_increase_ship(groups, g, remaining_ships, mdm_true)
        else:
            # Every other group takes the ship bringing it closest to the current maximum
            max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true)
            selected_ship_idx, comb_increase = find_closest_comb_count_ship(groups, g, remaining_ships, mdm_true, max_comb_count)

        if comb_increase == 0:
            remaining_group.append(g)
        else:
            groups[g].append(selected_ship_idx)
            remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]

    # Groups that could not gain new combinations are balanced by total count instead
    for g in remaining_group:
        if len(remaining_ships) == 0:
            break
        max_group_idx, max_total_count = find_group_with_max_total_count(groups, mdm_true)
        selected_ship_idx, count_increase = find_closest_total_count_ship(groups, g, remaining_ships, mdm_true, max_total_count)
        if selected_ship_idx is not None:
            groups[g].append(selected_ship_idx)
            remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]

# Calculate comb_count for each group and store it in a list
group_comb_counts = []
for g in range(num_groups):
    group_ships = groups[g]
    group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]
    comb_count, total_count = calculate_group_count(group_data_true)

    # Calculate total count including MDM=False
    group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]
    _, total_count_all = calculate_group_count(group_data_all)

    group_comb_counts.append((g, comb_count, total_count_all))

# Sort the groups by comb_count in descending order
group_comb_counts.sort(key=lambda x: x[1], reverse=True)

# Reorder the groups dictionary based on the sorted order
sorted_groups = defaultdict(list)
for i, (g, _, _) in enumerate(group_comb_counts):
    sorted_groups[i] = groups[g]

# Final output of group allocation
print("Final Group Allocation:")
for g in range(num_groups):
    group_ships = sorted_groups[g]
    group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]
    comb_count, total_count = calculate_group_count(group_data_true)

    # Calculate total count including MDM=False
    group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]
    _, total_count_all = calculate_group_count(group_data_all)

    print(f"Group {g + 1}: Ships_idx = {group_ships}, PD type = {comb_count}, PD = {total_count}, SD = {total_count_all}")

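# Reading the summary line above: "PD type" is the group's comb_count, "PD" its
# total_count over the MDM=True rows, and "SD" the total_count over all rows.
# This assumes calculate_group_count (defined earlier in this script) returns
# (combination count, row count).
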
# %%
import pandas as pd
from sklearn.model_selection import GroupKFold

# Prepare data for custom group allocation (BGKF)
comb_counts = []
total_counts = []
ship_counts = []
custom_results = []

for g in range(num_groups):
    group_ships = groups[g]
    group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]
    comb_count, total_count = calculate_group_count(group_data_true)

    # Calculate total count including MDM=False
    group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]
    _, total_count_all = calculate_group_count(group_data_all)

    custom_results.append({
        'Group': g + 1,
        'Allocation': 'BGKF',
        'Comb_count': comb_count,
        'Total_count': total_count,
        'Total_count_all': total_count_all,
        'Ship_count': len(group_ships),
        'Ships_idx': list(group_ships)
    })

# Sort the custom group allocation by comb_count in descending order
custom_results.sort(key=lambda x: x['Comb_count'], reverse=True)

# Adjust group numbers after sorting
for i, result in enumerate(custom_results):
    result['Group'] = i + 1

# Prepare data for GroupKFold allocation (GKF)
gkf = GroupKFold(n_splits=5)
gkf_results = []

for i, (train_idx, test_idx) in enumerate(gkf.split(mdm_true, groups=mdm_true['ships_idx'])):
    test_group = mdm_true.iloc[test_idx]
    comb_count, total_count = calculate_group_count(test_group)

    # Calculate total count including MDM=False
    test_group_ships = test_group['ships_idx'].unique()
    test_group_all = mdm_all[mdm_all['ships_idx'].isin(test_group_ships)]
    _, total_count_all = calculate_group_count(test_group_all)

    gkf_results.append({
        'Group': i + 1,
        'Allocation': 'GKF',
        'Comb_count': comb_count,
        'Total_count': total_count,
        'Total_count_all': total_count_all,
        'Ship_count': test_group['ships_idx'].nunique(),
        'Ships_idx': list(test_group['ships_idx'].unique())
    })

# Sort the GKF allocation by comb_count in descending order
gkf_results.sort(key=lambda x: x['Comb_count'], reverse=True)

# Adjust group numbers after sorting
for i, result in enumerate(gkf_results):
    result['Group'] = i + 1

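# Note: sklearn's GroupKFold keeps every ships_idx in exactly one test fold, so
# the "GKF" rows above serve as the plain grouped baseline that the balanced
# "BGKF" allocation is compared against.
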
# Combine BGKF and GKF results into one DataFrame
combined_results = custom_results + gkf_results
combined_df = pd.DataFrame(combined_results)

# Output the combined results to a single CSV file
combined_df.to_csv('exports/combined_group_allocation.csv', index=False)

print("CSV file has been generated: 'exports/combined_group_allocation.csv'")

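# Optional check (a sketch, not part of the original pipeline): reload the CSV
# and compare the two allocations side by side.
#
#   comparison = pd.read_csv('exports/combined_group_allocation.csv')
#   print(comparison.pivot(index='Group', columns='Allocation', values='Comb_count'))
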
# %%
import os
import pandas as pd
from sklearn.model_selection import KFold

def save_datasets_for_group(groups, mdm, data, output_dir='exports/dataset', n_splits=4):
    for i in range(len(groups)):
        group_folder = os.path.join(output_dir, 'group' + '_' + str(i + 1))
        os.makedirs(group_folder, exist_ok=True)

        # Create the test dataset by including only group i
        test_group_ships = groups[i]
        test_data = mdm[mdm['ships_idx'].isin(test_group_ships)]

        # Extract corresponding entries from the external test dataset
        test_all_data = data[data['ships_idx'].isin(test_group_ships)]

        # Create the train dataset by excluding group i
        train_group_ships = []
        for g in range(len(groups)):
            if g != i:
                train_group_ships.extend(groups[g])
        train_data = mdm[mdm['ships_idx'].isin(train_group_ships)]

        # Use KFold to split train_data into train and valid datasets
        kf_inner = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        train_idx_inner, valid_idx_inner = next(kf_inner.split(train_data))

        final_train_data = train_data.iloc[train_idx_inner]
        valid_data = train_data.iloc[valid_idx_inner]

        # Combine train and valid data to create train_all
        train_all_data = pd.concat([final_train_data, valid_data])

        # Save datasets to CSV files
        train_file_path = os.path.join(group_folder, 'train.csv')
        valid_file_path = os.path.join(group_folder, 'valid.csv')
        test_file_path = os.path.join(group_folder, 'test.csv')
        test_all_file_path = os.path.join(group_folder, 'test_all.csv')
        train_all_file_path = os.path.join(group_folder, 'train_all.csv')

        final_train_data.to_csv(train_file_path, index=False, encoding='utf-8-sig')
        valid_data.to_csv(valid_file_path, index=False, encoding='utf-8-sig')
        # test_data.to_csv(test_file_path, index=False, encoding='utf-8-sig')
        # test.csv is written from test_all_data (all rows for this group, MDM True and
        # False); the test_all.csv path above is defined but not used.
        test_all_data.to_csv(test_file_path, index=False, encoding='utf-8-sig')
        train_all_data.to_csv(train_all_file_path, index=False, encoding='utf-8-sig')

        print(f"Group {i + 1} datasets saved in {group_folder}")

# Example usage:
save_datasets_for_group(groups, mdm_true, data, output_dir='exports/dataset', n_splits=4)
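# Resulting layout, based on the code above: exports/dataset/group_1 ... group_5,
# each holding train.csv, valid.csv, test.csv and train_all.csv (test_all.csv is
# named inside save_datasets_for_group but never written).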