diff --git a/data_import/.gitignore b/data_import/.gitignore index e5538bb..c2166c4 100644 --- a/data_import/.gitignore +++ b/data_import/.gitignore @@ -1,3 +1 @@ db_connection_info.txt -exports/* -outputs/* diff --git a/data_import/exports/.gitignore b/data_import/exports/.gitignore new file mode 100644 index 0000000..a5baada --- /dev/null +++ b/data_import/exports/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore + diff --git a/data_import/outputs/.gitignore b/data_import/outputs/.gitignore new file mode 100644 index 0000000..a5baada --- /dev/null +++ b/data_import/outputs/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore + diff --git a/data_preprocess/.gitignore b/data_preprocess/.gitignore new file mode 100644 index 0000000..5841853 --- /dev/null +++ b/data_preprocess/.gitignore @@ -0,0 +1,2 @@ +*.ipynb + diff --git a/data_preprocess/README.md b/data_preprocess/README.md new file mode 100644 index 0000000..fa51cb6 --- /dev/null +++ b/data_preprocess/README.md @@ -0,0 +1,27 @@ +# Data Preprocess + +## What is this folder + +This folder contains the code for the pre-processing stage. + +Each pre-processing method lives in its own folder to keep the methods +modular. This makes it easier to test different methods and reduces +coupling between stages. + +## Instructions + +First, we apply the pre-processing by running code from the desired folder. + +Using the `no_preprocess` directory as an example: + +- `cd no_preprocess` +- Follow the instructions found in the sub-directory +- After code execution, the processed file will be written to +`exports/preprocessed_data.csv` + +We then run the data split code to create our k-fold splits. + +- `cd` back to the `data_preprocess` directory +- `python split_data.py` + +You will now have the datasets in `exports/dataset/group_{1,2,3,4,5}` \ No newline at end of file diff --git a/data_preprocess/exports/.gitignore b/data_preprocess/exports/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/data_preprocess/exports/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/data_preprocess/no_preprocess/README.md b/data_preprocess/no_preprocess/README.md new file mode 100644 index 0000000..6e488e0 --- /dev/null +++ b/data_preprocess/no_preprocess/README.md @@ -0,0 +1,7 @@ +## Purpose: + +Disables pre-processing; the raw data is copied through unchanged + +## Instructions: + +- `python copy_raw_data.py` \ No newline at end of file diff --git a/data_preprocess/no_preprocess/copy_raw_data.py b/data_preprocess/no_preprocess/copy_raw_data.py index c4d60da..2511dc6 100644 --- a/data_preprocess/no_preprocess/copy_raw_data.py +++ b/data_preprocess/no_preprocess/copy_raw_data.py @@ -1,8 +1,8 @@ import shutil -source_file = 'data_import/raw_data.csv' +source_file = '../../data_import/exports/raw_data.csv' -destination_file = 'data_preprocess/preprocessed_data.csv' +destination_file = '../exports/preprocessed_data.csv' shutil.copy(source_file, destination_file) diff --git a/data_preprocess/rule_base_replacement/.gitignore b/data_preprocess/rule_base_replacement/.gitignore new file mode 100644 index 0000000..fa65608 --- /dev/null +++ b/data_preprocess/rule_base_replacement/.gitignore @@ -0,0 +1 @@ +*.ipynb diff --git a/data_preprocess/rule_base_replacement/1.add_tag_name.ipynb b/data_preprocess/rule_base_replacement/1.add_tag_name.ipynb deleted file mode 100644 index 3d1baa9..0000000 --- a/data_preprocess/rule_base_replacement/1.add_tag_name.ipynb +++ /dev/null @@ -1,133 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ -
{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Changes made in ships_idx 1000: 251\n", - "Changes made in ships_idx 1001: 54\n", - "Changes made in ships_idx 1002: 46\n", - "Changes made in ships_idx 1003: 162\n", - "Changes made in ships_idx 1004: 8\n", - "Changes made in ships_idx 1005: 18\n", - "Changes made in ships_idx 1008: 22\n", - "Changes made in ships_idx 1009: 5\n", - "Changes made in ships_idx 1010: 131\n", - "Changes made in ships_idx 1011: 46\n", - "Changes made in ships_idx 1012: 2\n", - "Changes made in ships_idx 1013: 130\n", - "Changes made in ships_idx 1014: 46\n", - "Changes made in ships_idx 1015: 145\n", - "Changes made in ships_idx 1016: 191\n", - "Changes made in ships_idx 1017: 111\n", - "Changes made in ships_idx 1018: 680\n", - "Changes made in ships_idx 1019: 2\n", - "Changes made in ships_idx 1020: 10\n", - "Changes made in ships_idx 1021: 2\n", - "Changes made in ships_idx 1022: 7\n", - "Changes made in ships_idx 1023: 7\n", - "Changes made in ships_idx 1024: 136\n", - "Changes made in ships_idx 1025: 10\n", - "Changes made in ships_idx 1026: 6\n", - "Changes made in ships_idx 1027: 6\n", - "Changes made in ships_idx 1028: 6\n", - "Changes made in ships_idx 1029: 132\n", - "Changes made in ships_idx 1030: 86\n", - "Changes made in ships_idx 1031: 55\n", - "Changes made in ships_idx 1032: 225\n", - "Changes made in ships_idx 1033: 147\n", - "Changes made in ships_idx 1035: 132\n", - "Changes made in ships_idx 1036: 5\n", - "Changes made in ships_idx 1037: 3\n", - "Changes made in ships_idx 1038: 6\n", - "Changes made in ships_idx 1039: 232\n", - "Changes made in ships_idx 1042: 20\n", - "Changes made in ships_idx 1043: 154\n", - "Changes made in ships_idx 1044: 117\n", - "Changes made in ships_idx 1045: 243\n", - "Changes made in ships_idx 1046: 6\n", - "Changes made in ships_idx 1047: 12\n", - "Changes made in ships_idx 1048: 82\n", - "Changes made in ships_idx 1049: 912\n", - "Changes made in ships_idx 1050: 46\n", - "Changes made in ships_idx 1051: 57\n", - "Total number of changes made: 4912\n", - "Updated data saved to raw_data_add_tag.csv\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "# Load the preprocessed data CSV file\n", - "file_path = '../../data_import/raw_data.csv' # Adjust this path to your actual file location\n", - "data = pd.read_csv(file_path, dtype=str)\n", - "\n", - "# Initialize a counter for the total number of changes\n", - "total_changes = 0\n", - "\n", - "# Initialize a dictionary to count changes per ships_idx\n", - "ships_idx_changes = {}\n", - "\n", - "# Process each group by ships_idx\n", - "for ships_idx, group in data.groupby('ships_idx'):\n", - " # Find duplicated tag_descriptions within the group\n", - " duplicated_descriptions = group['tag_description'].duplicated(keep=False)\n", - " \n", - " # Count how many tag_descriptions are duplicated within this ships_idx\n", - " num_changes = duplicated_descriptions.sum()\n", - "\n", - " # If there are any duplicates\n", - " if num_changes > 0:\n", - " # Increment the total changes count\n", - " total_changes += num_changes\n", - " \n", - " # Record the number of changes for this ships_idx\n", - " ships_idx_changes[ships_idx] = num_changes\n", - "\n", - " # Apply the concatenation of tag_name to tag_description for duplicates\n", - " data.loc[duplicated_descriptions & (data['ships_idx'] == ships_idx), 'tag_description'] = \\\n", - " data['tag_name'] + ' ' + data['tag_description']\n", - "\n", - "# Output the changes per ships_idx\n", - "for 
ships_idx, count in ships_idx_changes.items():\n", - " print(f\"Changes made in ships_idx {ships_idx}: {count}\")\n", - "\n", - "# Output the total number of changes\n", - "print(f\"Total number of changes made: {total_changes}\")\n", - "\n", - "# Optionally, save the updated DataFrame back to a CSV\n", - "output_file_path = 'raw_data_add_tag.csv'\n", - "data.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n", - "\n", - "print(f\"Updated data saved to {output_file_path}\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "torch", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/data_preprocess/rule_base_replacement/2.seperate_number.ipynb b/data_preprocess/rule_base_replacement/2.seperate_number.ipynb deleted file mode 100644 index 325113c..0000000 --- a/data_preprocess/rule_base_replacement/2.seperate_number.ipynb +++ /dev/null @@ -1,100 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Updated data saved to raw_data_s.csv\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import re\n", - "\n", - "# Load the data_mapping CSV file\n", - "data_mapping_file_path = '../../data_import/raw_data.csv' # Adjust this path to your actual file location\n", - "data_mapping_file_path = 'raw_data_add_tag.csv' # Adjust this path to your actual file location\n", - "data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n", - "\n", - "# Backup the original tag_description\n", - "data_mapping['org_tag_description'] = data_mapping['tag_description']\n", - "\n", - "# Ensure all values in the 'tag_description' column are strings\n", - "data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)\n", - "data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[()]', ' ', regex=True)\n", - "\n", - "# Function to find tokens containing numbers\n", - "def find_tokens_with_numbers(description):\n", - " tokens = description.split() # Tokenize by spaces\n", - " number_tokens = [token for token in tokens if re.search(r'\\d', token)]\n", - " return number_tokens\n", - "\n", - "# Function to process tokens\n", - "def process_token(token):\n", - " # Step 1: Replace '_' or '-' adjacent to numbers with spaces\n", - " token = re.sub(r'(_|-)(?=\\d)', ' ', token)\n", - " token = re.sub(r'(?<=\\d)(_|-)', ' ', token)\n", - "\n", - " # Step 2: Insert spaces between letters and numbers where no separator exists\n", - " token = re.sub(r'([A-Za-z])(\\d+)', r'\\1 \\2', token)\n", - " token = re.sub(r'(\\d+)([A-Za-z])', r'\\1 \\2', token)\n", - "\n", - " # Step 3: Handle cases like \"NO.1\" or \"No.1\" to become \"No. 1\"\n", - " token = re.sub(r'([A-Za-z]+)\\.(\\d+)', r'\\1. 
\\2', token)\n", - "\n", - " # Clean multiple spaces and strip\n", - " token = re.sub(r'\\s+', ' ', token).strip()\n", - " return token\n", - "\n", - "# Apply the process to each row in the 'tag_description' column\n", - "for index, row in data_mapping.iterrows():\n", - " original_description = row['tag_description']\n", - " number_tokens = find_tokens_with_numbers(original_description)\n", - "\n", - " # Process each token containing numbers\n", - " processed_tokens = [process_token(token) for token in number_tokens]\n", - "\n", - " # Replace the original tokens with processed tokens in the tag_description\n", - " new_description = original_description\n", - " for original_token, processed_token in zip(number_tokens, processed_tokens):\n", - " new_description = new_description.replace(original_token, processed_token)\n", - "\n", - " # Update the data_mapping with the modified description\n", - " data_mapping.at[index, 'tag_description'] = new_description\n", - "\n", - "# Save the updated data_mapping to a new CSV file\n", - "output_file_path = 'raw_data_s.csv'\n", - "data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n", - "\n", - "print(f\"Updated data saved to {output_file_path}\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "torch", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/data_preprocess/rule_base_replacement/3.replacement.ipynb b/data_preprocess/rule_base_replacement/3.replacement.ipynb deleted file mode 100644 index 8aa43bf..0000000 --- a/data_preprocess/rule_base_replacement/3.replacement.ipynb +++ /dev/null @@ -1,123 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Updated data saved to ../preprocessed_data.csv\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import re\n", - "\n", - "# Load the data_mapping CSV file\n", - "data_mapping_file_path = 'raw_data_s.csv' # Adjust this path to your actual file location\n", - "data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n", - " \n", - " # Ensure all values in the 'tag_description' column are strings\n", - "data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)\n", - "data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[-]', ' ', regex=True)\n", - "\n", - "# Initial replacement mapping\n", - "initial_replacements = {\n", - " \"MGE\": \"G/E\",\n", - " \"GEN.\": \"G/E\",\n", - " \"GEN\": \"G/E\",\n", - " \"GE\": \"G/E\",\n", - " \"G_E\": \"G/E\",\n", - " \"ME\": \"M/E\",\n", - " \"M_E\": \"M/E\",\n", - " \"S_G\": \"S/G\",\n", - " \"T_C\": \"T/C\",\n", - " \"TC\": \"T/C\",\n", - " \"L_O\": \"L.O\",\n", - " \"LO\": \"L.O\",\n", - " \"F_O\": \"F.O\",\n", - " \"FO\": \"F.O\",\n", - " \"D_G\": \"D/G\",\n", - " \"DG\": \"D/G\",\n", - " \"PP\": \"P/P\"\n", - "}\n", - "\n", - "# Second replacement mapping\n", - "second_replacements = {\n", - " \"_G/E\": \" G/E\",\n", - " \"G/E_\": \"G/E \",\n", - " \"_M/E\": \" M/E\",\n", - " \"M/E_\": \"M/E \",\n", - " \"_S/G\": \" S/G\",\n", - " \"S/G_\": \"S/G \",\n", - " \"_T/C\": \" T/C\",\n", - " \"T/C_\": \"T/C \",\n", - " \"_L.O\": \" 
L.O\",\n", - " \"L.O_\": \"L.O \",\n", - " \"_F.O\": \" F.O\",\n", - " \"F.O_\": \"F.O \",\n", - " \"_D/G\": \" D/G\",\n", - " \"D/G_\": \"D/G \",\n", - " \"DG_\": \"DG \"\n", - "}\n", - "\n", - "# Function to separate numbers from text in a token\n", - "def separate_numbers_from_text(description):\n", - " # This regex pattern finds occurrences where text is followed by numbers or vice versa\n", - " return re.sub(r'(\\d+)(\\D)', r'\\1 \\2', re.sub(r'(\\D)(\\d+)', r'\\1 \\2', description))\n", - "\n", - "# Function to perform replacements using tokens\n", - "def replace_tokens(description, replacements):\n", - " tokens = description.split() # Tokenize by spaces\n", - " tokens = [replacements.get(token, token) for token in tokens] # Replace based on the dictionary\n", - " return ' '.join(tokens)\n", - "\n", - "# Function to perform replacements for substrings\n", - "def replace_substrings(description, replacements):\n", - " for old, new in replacements.items():\n", - " description = description.replace(old, new)\n", - " return description\n", - "\n", - "# Separate numbers from text before applying replacements\n", - "data_mapping['tag_description'] = data_mapping['tag_description'].apply(separate_numbers_from_text)\n", - "\n", - "# Apply initial replacements\n", - "data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_tokens, replacements=initial_replacements)\n", - "\n", - "# Apply second replacements as substrings\n", - "data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_substrings, replacements=second_replacements)\n", - "\n", - "# Save the updated data_mapping to a new CSV file\n", - "output_file_path = '../preprocessed_data.csv'\n", - "data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n", - "\n", - "print(f\"Updated data saved to {output_file_path}\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "torch", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/data_preprocess/rule_base_replacement/README.md b/data_preprocess/rule_base_replacement/README.md new file mode 100644 index 0000000..ebb0f65 --- /dev/null +++ b/data_preprocess/rule_base_replacement/README.md @@ -0,0 +1,9 @@ +## Purpose: + +Here is a collection of pre-processing methods used in the GRS paper. 
+ +## Instructions: + +- `python add_tag_name.py` +- `python separate_number.py` +- `python replacement.py` \ No newline at end of file diff --git a/data_preprocess/rule_base_replacement/add_tag_name.py b/data_preprocess/rule_base_replacement/add_tag_name.py new file mode 100644 index 0000000..ed45368 --- /dev/null +++ b/data_preprocess/rule_base_replacement/add_tag_name.py @@ -0,0 +1,63 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.16.4 +# kernelspec: +# display_name: torch +# language: python +# name: python3 +# --- + +# %% +import pandas as pd + +# Load the preprocessed data CSV file +file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location +data = pd.read_csv(file_path, dtype=str) + +# Initialize a counter for the total number of changes +total_changes = 0 + +# Initialize a dictionary to count changes per ships_idx +ships_idx_changes = {} + +# Process each group by ships_idx +for ships_idx, group in data.groupby('ships_idx'): + # Find duplicated tag_descriptions within the group + duplicated_descriptions = group['tag_description'].duplicated(keep=False) + + # Count how many tag_descriptions are duplicated within this ships_idx + num_changes = duplicated_descriptions.sum() + + # If there are any duplicates + if num_changes > 0: + # Increment the total changes count + total_changes += num_changes + + # Record the number of changes for this ships_idx + ships_idx_changes[ships_idx] = num_changes + + # Apply the concatenation of tag_name to tag_description for duplicates + data.loc[duplicated_descriptions & (data['ships_idx'] == ships_idx), 'tag_description'] = \ + data['tag_name'] + ' ' + data['tag_description'] + +# Output the changes per ships_idx +for ships_idx, count in ships_idx_changes.items(): + print(f"Changes made in ships_idx {ships_idx}: {count}") + +# Output the total number of changes +print(f"Total number of changes made: {total_changes}") + +# Optionally, save the updated DataFrame back to a CSV +output_file_path = 'outputs/raw_data_add_tag.csv' +data.to_csv(output_file_path, index=False, encoding='utf-8-sig') + +print(f"Updated data saved to {output_file_path}") + + +# %% diff --git a/data_preprocess/rule_base_replacement/outputs/.gitignore b/data_preprocess/rule_base_replacement/outputs/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/data_preprocess/rule_base_replacement/outputs/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/data_preprocess/rule_base_replacement/replacement.py b/data_preprocess/rule_base_replacement/replacement.py new file mode 100644 index 0000000..25adb27 --- /dev/null +++ b/data_preprocess/rule_base_replacement/replacement.py @@ -0,0 +1,103 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.16.4 +# kernelspec: +# display_name: torch +# language: python +# name: python3 +# --- + +# %% +import pandas as pd +import re + +# Load the data_mapping CSV file + + +data_mapping_file_path = 'outputs/raw_data_s.csv' # Adjust this path to your actual file location +data_mapping = pd.read_csv(data_mapping_file_path, dtype=str) + + # Ensure all values in the 'tag_description' column are strings +data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str) +data_mapping['tag_description'] = 
data_mapping['tag_description'].str.replace(r'[-]', ' ', regex=True) + +# Initial replacement mapping +initial_replacements = { + "MGE": "G/E", + "GEN.": "G/E", + "GEN": "G/E", + "GE": "G/E", + "G_E": "G/E", + "ME": "M/E", + "M_E": "M/E", + "S_G": "S/G", + "T_C": "T/C", + "TC": "T/C", + "L_O": "L.O", + "LO": "L.O", + "F_O": "F.O", + "FO": "F.O", + "D_G": "D/G", + "DG": "D/G", + "PP": "P/P" +} + +# Second replacement mapping +second_replacements = { + "_G/E": " G/E", + "G/E_": "G/E ", + "_M/E": " M/E", + "M/E_": "M/E ", + "_S/G": " S/G", + "S/G_": "S/G ", + "_T/C": " T/C", + "T/C_": "T/C ", + "_L.O": " L.O", + "L.O_": "L.O ", + "_F.O": " F.O", + "F.O_": "F.O ", + "_D/G": " D/G", + "D/G_": "D/G ", + "DG_": "DG " +} + +# Function to separate numbers from text in a token +def separate_numbers_from_text(description): + # This regex pattern finds occurrences where text is followed by numbers or vice versa + return re.sub(r'(\d+)(\D)', r'\1 \2', re.sub(r'(\D)(\d+)', r'\1 \2', description)) + +# Function to perform replacements using tokens +def replace_tokens(description, replacements): + tokens = description.split() # Tokenize by spaces + tokens = [replacements.get(token, token) for token in tokens] # Replace based on the dictionary + return ' '.join(tokens) + +# Function to perform replacements for substrings +def replace_substrings(description, replacements): + for old, new in replacements.items(): + description = description.replace(old, new) + return description + +# Separate numbers from text before applying replacements +data_mapping['tag_description'] = data_mapping['tag_description'].apply(separate_numbers_from_text) + +# Apply initial replacements +data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_tokens, replacements=initial_replacements) + +# Apply second replacements as substrings +data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_substrings, replacements=second_replacements) + +# Save the updated data_mapping to a new CSV file +output_file_path = '../exports/preprocessed_data.csv' +data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig') + +print(f"Updated data saved to {output_file_path}") + + +# %% diff --git a/data_preprocess/rule_base_replacement/separate_number.py b/data_preprocess/rule_base_replacement/separate_number.py new file mode 100644 index 0000000..b76a45f --- /dev/null +++ b/data_preprocess/rule_base_replacement/separate_number.py @@ -0,0 +1,78 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.16.4 +# kernelspec: +# display_name: torch +# language: python +# name: python3 +# --- + +# %% +import pandas as pd +import re + +# Load the data_mapping CSV file +data_mapping_file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location +data_mapping_file_path = 'outputs/raw_data_add_tag.csv' # Adjust this path to your actual file location +data_mapping = pd.read_csv(data_mapping_file_path, dtype=str) + +# Backup the original tag_description +data_mapping['org_tag_description'] = data_mapping['tag_description'] + +# Ensure all values in the 'tag_description' column are strings +data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str) +data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[()]', ' ', regex=True) + +# Function to find tokens containing numbers +def 
find_tokens_with_numbers(description): + tokens = description.split() # Tokenize by spaces + number_tokens = [token for token in tokens if re.search(r'\d', token)] + return number_tokens + +# Function to process tokens +def process_token(token): + # Step 1: Replace '_' or '-' adjacent to numbers with spaces + token = re.sub(r'(_|-)(?=\d)', ' ', token) + token = re.sub(r'(?<=\d)(_|-)', ' ', token) + + # Step 2: Insert spaces between letters and numbers where no separator exists + token = re.sub(r'([A-Za-z])(\d+)', r'\1 \2', token) + token = re.sub(r'(\d+)([A-Za-z])', r'\1 \2', token) + + # Step 3: Handle cases like "NO.1" or "No.1" to become "No. 1" + token = re.sub(r'([A-Za-z]+)\.(\d+)', r'\1. \2', token) + + # Clean multiple spaces and strip + token = re.sub(r'\s+', ' ', token).strip() + return token + +# Apply the process to each row in the 'tag_description' column +for index, row in data_mapping.iterrows(): + original_description = row['tag_description'] + number_tokens = find_tokens_with_numbers(original_description) + + # Process each token containing numbers + processed_tokens = [process_token(token) for token in number_tokens] + + # Replace the original tokens with processed tokens in the tag_description + new_description = original_description + for original_token, processed_token in zip(number_tokens, processed_tokens): + new_description = new_description.replace(original_token, processed_token) + + # Update the data_mapping with the modified description + data_mapping.at[index, 'tag_description'] = new_description + +# Save the updated data_mapping to a new CSV file +output_file_path = 'outputs/raw_data_s.csv' +data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig') + +print(f"Updated data saved to {output_file_path}") + + +# %% diff --git a/data_preprocess/split_data.ipynb b/data_preprocess/split_data.ipynb deleted file mode 100644 index 4957656..0000000 --- a/data_preprocess/split_data.ipynb +++ /dev/null @@ -1,441 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Final Group Allocation:\n", - "Group 1: Ships_idx = [1025, 1032, 1042, 1046, 1023, 1037, 1024, 1014, 1019, 1008], PD type = 529, PD = 1992, SD = 9855\n", - "Group 2: Ships_idx = [1003, 1028, 1018, 1020, 1033, 1050, 1030, 1051, 1004, 1036], PD type = 528, PD = 2113, SD = 13074\n", - "Group 3: Ships_idx = [1016, 1026, 1043, 1031, 1012, 1021, 1000, 1011, 1006, 1005, 1038], PD type = 521, PD = 2140, SD = 10722\n", - "Group 4: Ships_idx = [1047, 1049, 1010, 1027, 1013, 1022, 1048, 1017, 1045, 1007], PD type = 521, PD = 2102, SD = 15451\n", - "Group 5: Ships_idx = [1039, 1035, 1044, 1009, 1015, 1040, 1001, 1034, 1041, 1002, 1029], PD type = 500, PD = 2183, SD = 12969\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "from collections import defaultdict\n", - "\n", - "# Function to calculate the number of unique combinations and total count for each ship\n", - "def calculate_ship_count(group):\n", - " ship_count = group.groupby('ships_idx')['thing_property'].agg(['nunique', 'size']).reset_index()\n", - " ship_count.columns = ['ships_idx', 'comb_count', 'total_count']\n", - " return ship_count\n", - "\n", - "# Function to calculate the combination count and total count for a group\n", - "def calculate_group_count(group):\n", - " comb_count = group['thing_property'].nunique()\n", - " total_count = group['thing_property'].size\n", - " return comb_count, total_count\n", - "\n", - 
"# Function to calculate the increase in combination count when a ship is added to a group\n", - "def calculate_comb_count_increase(groups, g, ship_idx, mdm):\n", - " temp_groups = defaultdict(list, {k: v.copy() for k, v in groups.items()})\n", - " temp_groups[g].append(ship_idx)\n", - " \n", - " group_ships = temp_groups[g]\n", - " group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n", - " \n", - " new_comb_count, _ = calculate_group_count(group_data)\n", - " \n", - " current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n", - " current_comb_count, _ = calculate_group_count(current_group_data)\n", - " \n", - " increase = new_comb_count - current_comb_count\n", - " \n", - " return increase\n", - "\n", - "# Function to calculate the increase in total count when a ship is added to a group\n", - "def calculate_total_count_increase(groups, g, ship_idx, mdm):\n", - " temp_groups = defaultdict(list, {k: v.copy() for k, v in groups.items()})\n", - " temp_groups[g].append(ship_idx)\n", - " \n", - " group_ships = temp_groups[g]\n", - " group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n", - " \n", - " _, new_total_count = calculate_group_count(group_data)\n", - " \n", - " current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n", - " _, current_total_count = calculate_group_count(current_group_data)\n", - " \n", - " increase = new_total_count - current_total_count\n", - " \n", - " return increase\n", - "\n", - "# Function to find the ship that will bring the total count closest to the target\n", - "def find_closest_total_count_ship(groups, g, remaining_ships, mdm, target_total_count):\n", - " total_count_differences = []\n", - "\n", - " current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n", - " _, current_total_count = calculate_group_count(current_group_data)\n", - "\n", - " for ship_idx in remaining_ships:\n", - " increase = calculate_total_count_increase(groups, g, ship_idx, mdm)\n", - " new_total_count = current_total_count + increase\n", - " difference = abs(target_total_count - new_total_count)\n", - " total_count_differences.append((ship_idx, difference, increase))\n", - "\n", - " if not total_count_differences:\n", - " return None, 0\n", - " \n", - " closest_ship = min(total_count_differences, key=lambda x: x[1])\n", - " selected_ship_idx, _, selected_increase = closest_ship\n", - "\n", - " return selected_ship_idx, selected_increase\n", - "\n", - "# Function to find the ship that gives the maximum increase in combination count\n", - "def find_max_increase_ship(groups, g, remaining_ships, mdm):\n", - " comb_count_increase = []\n", - "\n", - " for ship_idx in remaining_ships:\n", - " increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)\n", - " comb_count_increase.append((ship_idx, increase))\n", - "\n", - " max_increase_ship = max(comb_count_increase, key=lambda x: x[1])\n", - " selected_ship_idx, max_increase = max_increase_ship\n", - " \n", - " return selected_ship_idx, max_increase\n", - "\n", - "# Function to find the ship that will bring the combination count closest to the target\n", - "def find_closest_comb_count_ship(groups, g, remaining_ships, mdm, target_comb_count):\n", - " comb_count_differences = []\n", - "\n", - " current_group_data = mdm[mdm['ships_idx'].isin(groups[g])]\n", - " current_comb_count, _ = calculate_group_count(current_group_data)\n", - "\n", - " for ship_idx in remaining_ships:\n", - " increase = calculate_comb_count_increase(groups, g, ship_idx, mdm)\n", - " new_comb_count = current_comb_count + increase\n", - " difference = 
abs(target_comb_count - new_comb_count)\n", - " comb_count_differences.append((ship_idx, difference, increase))\n", - "\n", - " if not comb_count_differences:\n", - " return None, 0\n", - "\n", - " closest_ship = min(comb_count_differences, key=lambda x: x[1])\n", - " selected_ship_idx, _, selected_increase = closest_ship\n", - "\n", - " return selected_ship_idx, selected_increase\n", - "\n", - "# Function to find the group with the maximum combination count\n", - "def find_group_with_max_comb_count(groups, mdm):\n", - " max_comb_count = -1\n", - " max_group_idx = -1\n", - "\n", - " for g in range(len(groups)):\n", - " group_ships = groups[g]\n", - " group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n", - " comb_count, _ = calculate_group_count(group_data)\n", - " \n", - " if comb_count > max_comb_count:\n", - " max_comb_count = comb_count\n", - " max_group_idx = g\n", - "\n", - " return max_group_idx, max_comb_count\n", - "\n", - "# Function to find the group with the maximum total count\n", - "def find_group_with_max_total_count(groups, mdm):\n", - " max_total_count = -1\n", - " max_group_idx = -1\n", - "\n", - " for g in range(len(groups)):\n", - " group_ships = groups[g]\n", - " group_data = mdm[mdm['ships_idx'].isin(group_ships)]\n", - " _, total_count = calculate_group_count(group_data)\n", - " \n", - " if total_count > max_total_count:\n", - " max_total_count = total_count\n", - " max_group_idx = g\n", - "\n", - " return max_group_idx, max_total_count\n", - "\n", - "import pandas as pd\n", - "from collections import defaultdict\n", - "\n", - "# Load the CSV file\n", - "data_file_path = 'preprocessed_data.csv'\n", - "data = pd.read_csv(data_file_path)\n", - "\n", - "# Filter the data where MDM is True\n", - "mdm_true = data[data['MDM'] == True].copy() # .copy()를 사용하여 명시적으로 복사본 생성\n", - "mdm_all = data.copy()\n", - "\n", - "# Create a new column combining 'thing' and 'property'\n", - "mdm_true.loc[:, 'thing_property'] = mdm_true['thing'] + '_' + mdm_true['property']\n", - "mdm_all.loc[:, 'thing_property'] = mdm_all['thing'] + '_' + mdm_all['property']\n", - "\n", - "# Initial setup for groups\n", - "ship_count = calculate_ship_count(mdm_true)\n", - "num_groups = 5\n", - "groups = defaultdict(list)\n", - "\n", - "# Sort ships by combination count in descending order\n", - "sorted_ships = ship_count.sort_values(by='comb_count', ascending=False)\n", - "\n", - "# Assign the first 5 ships to the groups\n", - "for i in range(num_groups):\n", - " groups[i].append(sorted_ships.iloc[i]['ships_idx'])\n", - "\n", - "remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values\n", - "\n", - "# Allocate remaining ships to the groups\n", - "while len(remaining_ships) > 0:\n", - " group_comb_counts = []\n", - " for g in range(num_groups):\n", - " group_ships = groups[g]\n", - " group_data = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n", - " comb_count, _ = calculate_group_count(group_data)\n", - " group_comb_counts.append((g, comb_count))\n", - "\n", - " group_comb_counts.sort(key=lambda x: x[1])\n", - " \n", - " remaining_group = []\n", - " for g, _ in group_comb_counts:\n", - " if len(remaining_ships) == 0:\n", - " break\n", - " \n", - " if group_comb_counts.index((g, _)) == 0:\n", - " selected_ship_idx, comb_increase = find_max_increase_ship(groups, g, remaining_ships, mdm_true)\n", - " \n", - " else:\n", - " max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true)\n", - " selected_ship_idx, comb_increase = find_closest_comb_count_ship(groups, g, 
remaining_ships, mdm_true, max_comb_count)\n", - "\n", - " if comb_increase == 0:\n", - " remaining_group.append(g)\n", - " else:\n", - " groups[g].append(selected_ship_idx)\n", - " remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]\n", - "\n", - " for g in remaining_group:\n", - " if len(remaining_ships) == 0:\n", - " break\n", - " max_group_idx, max_total_count = find_group_with_max_total_count(groups, mdm_true)\n", - " selected_ship_idx, count_increase = find_closest_total_count_ship(groups, g, remaining_ships, mdm_true, max_total_count)\n", - " if selected_ship_idx is not None:\n", - " groups[g].append(selected_ship_idx)\n", - " remaining_ships = remaining_ships[remaining_ships != selected_ship_idx]\n", - "\n", - "# Calculate comb_count for each group and store it in a list\n", - "group_comb_counts = []\n", - "for g in range(num_groups):\n", - " group_ships = groups[g]\n", - " group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n", - " comb_count, total_count = calculate_group_count(group_data_true)\n", - "\n", - " # Calculate total count including MDM=False\n", - " group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]\n", - " _, total_count_all = calculate_group_count(group_data_all)\n", - " \n", - " group_comb_counts.append((g, comb_count, total_count_all))\n", - "\n", - "# Sort the groups by comb_count in descending order\n", - "group_comb_counts.sort(key=lambda x: x[1], reverse=True)\n", - "\n", - "# Reorder the groups dictionary based on the sorted order\n", - "sorted_groups = defaultdict(list)\n", - "for i, (g, _, _) in enumerate(group_comb_counts):\n", - " sorted_groups[i] = groups[g]\n", - "\n", - "# Final output of group allocation\n", - "print(\"Final Group Allocation:\")\n", - "for g in range(num_groups):\n", - " group_ships = sorted_groups[g]\n", - " group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n", - " comb_count, total_count = calculate_group_count(group_data_true)\n", - "\n", - " # Calculate total count including MDM=False\n", - " group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]\n", - " _, total_count_all = calculate_group_count(group_data_all)\n", - "\n", - " print(f\"Group {g + 1}: Ships_idx = {group_ships}, PD type = {comb_count}, PD = {total_count}, SD = {total_count_all}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CSV file has been generated: 'combined_group_allocation.csv'\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "from sklearn.model_selection import GroupKFold\n", - "\n", - "# Prepare data for custom group allocation (BGKF)\n", - "comb_counts = []\n", - "total_counts = []\n", - "ship_counts = []\n", - "custom_results = []\n", - "\n", - "for g in range(num_groups):\n", - " group_ships = groups[g]\n", - " group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)]\n", - " comb_count, total_count = calculate_group_count(group_data_true)\n", - " \n", - " # Calculate total count including MDM=False\n", - " group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)]\n", - " _, total_count_all = calculate_group_count(group_data_all)\n", - " \n", - " custom_results.append({\n", - " 'Group': g + 1,\n", - " 'Allocation': 'BGKF',\n", - " 'Comb_count': comb_count,\n", - " 'Total_count': total_count,\n", - " 'Total_count_all': total_count_all,\n", - " 'Ship_count': len(group_ships),\n", - " 'Ships_idx': list(group_ships)\n", - " })\n", - "\n", 
- "# Sort the custom group allocation by comb_count in descending order\n", - "custom_results.sort(key=lambda x: x['Comb_count'], reverse=True)\n", - "\n", - "# Adjust group numbers after sorting\n", - "for i, result in enumerate(custom_results):\n", - " result['Group'] = i + 1\n", - "\n", - "# Prepare data for GroupKFold allocation (GKF)\n", - "gkf = GroupKFold(n_splits=5)\n", - "gkf_results = []\n", - "\n", - "for i, (train_idx, test_idx) in enumerate(gkf.split(mdm_true, groups=mdm_true['ships_idx'])):\n", - " test_group = mdm_true.iloc[test_idx]\n", - " comb_count, total_count = calculate_group_count(test_group)\n", - " \n", - " # Calculate total count including MDM=False\n", - " test_group_ships = test_group['ships_idx'].unique()\n", - " test_group_all = mdm_all[mdm_all['ships_idx'].isin(test_group_ships)]\n", - " _, total_count_all = calculate_group_count(test_group_all)\n", - " \n", - " gkf_results.append({\n", - " 'Group': i + 1,\n", - " 'Allocation': 'GKF',\n", - " 'Comb_count': comb_count,\n", - " 'Total_count': total_count,\n", - " 'Total_count_all': total_count_all,\n", - " 'Ship_count': test_group['ships_idx'].nunique(),\n", - " 'Ships_idx': list(test_group['ships_idx'].unique())\n", - " })\n", - "\n", - "# Sort the GKF allocation by comb_count in descending order\n", - "gkf_results.sort(key=lambda x: x['Comb_count'], reverse=True)\n", - "\n", - "# Adjust group numbers after sorting\n", - "for i, result in enumerate(gkf_results):\n", - " result['Group'] = i + 1\n", - "\n", - "# Combine BGKF and GKF results into one DataFrame\n", - "combined_results = custom_results + gkf_results\n", - "combined_df = pd.DataFrame(combined_results)\n", - "\n", - "# Output the combined results to a single CSV file\n", - "combined_df.to_csv('combined_group_allocation.csv', index=False)\n", - "\n", - "print(\"CSV file has been generated: 'combined_group_allocation.csv'\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Group 1 datasets saved in dataset/1\n", - "Group 2 datasets saved in dataset/2\n", - "Group 3 datasets saved in dataset/3\n", - "Group 4 datasets saved in dataset/4\n", - "Group 5 datasets saved in dataset/5\n" - ] - } - ], - "source": [ - "import os\n", - "import pandas as pd\n", - "from sklearn.model_selection import KFold\n", - "\n", - "def save_datasets_for_group(groups, mdm, data, output_dir='dataset', n_splits=4):\n", - " for i in range(len(groups)):\n", - " group_folder = os.path.join(output_dir, str(i + 1))\n", - " os.makedirs(group_folder, exist_ok=True)\n", - " \n", - " # Create the test dataset by including only group i\n", - " test_group_ships = groups[i]\n", - " test_data = mdm[mdm['ships_idx'].isin(test_group_ships)]\n", - " \n", - " # Extract corresponding entries from the external test dataset\n", - " test_all_data = data[data['ships_idx'].isin(test_group_ships)]\n", - " \n", - " # Create the train dataset by excluding group i\n", - " train_group_ships = []\n", - " for g in range(len(groups)):\n", - " if g != i:\n", - " train_group_ships.extend(groups[g])\n", - " train_data = mdm[mdm['ships_idx'].isin(train_group_ships)]\n", - " \n", - " # Use KFold to split train_data into train and valid datasets\n", - " kf_inner = KFold(n_splits=n_splits, shuffle=True, random_state=42)\n", - " train_idx_inner, valid_idx_inner = next(kf_inner.split(train_data))\n", - " \n", - " final_train_data = train_data.iloc[train_idx_inner]\n", - " valid_data = 
train_data.iloc[valid_idx_inner]\n", - " \n", - " # Combine train and valid data to create train_all\n", - " train_all_data = pd.concat([final_train_data, valid_data])\n", - " \n", - " # Save datasets to CSV files\n", - " train_file_path = os.path.join(group_folder, 'train.csv')\n", - " valid_file_path = os.path.join(group_folder, 'valid.csv')\n", - " test_file_path = os.path.join(group_folder, 'test.csv')\n", - " test_all_file_path = os.path.join(group_folder, 'test_all.csv')\n", - " train_all_file_path = os.path.join(group_folder, 'train_all.csv')\n", - " \n", - " final_train_data.to_csv(train_file_path, index=False, encoding='utf-8-sig')\n", - " valid_data.to_csv(valid_file_path, index=False, encoding='utf-8-sig')\n", - " # test_data.to_csv(test_file_path, index=False, encoding='utf-8-sig')\n", - " test_all_data.to_csv(test_file_path, index=False, encoding='utf-8-sig')\n", - " train_all_data.to_csv(train_all_file_path, index=False, encoding='utf-8-sig')\n", - " \n", - " print(f\"Group {i + 1} datasets saved in {group_folder}\")\n", - "\n", - "# Example usage:\n", - "save_datasets_for_group(groups, mdm_true, data, n_splits=4)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "torch", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/data_preprocess/split_data.py b/data_preprocess/split_data.py new file mode 100644 index 0000000..b9aa141 --- /dev/null +++ b/data_preprocess/split_data.py @@ -0,0 +1,382 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.16.4 +# kernelspec: +# display_name: torch +# language: python +# name: python3 +# --- + +# %% +import pandas as pd +from collections import defaultdict + +# Function to calculate the number of unique combinations and total count for each ship +def calculate_ship_count(group): + ship_count = group.groupby('ships_idx')['thing_property'].agg(['nunique', 'size']).reset_index() + ship_count.columns = ['ships_idx', 'comb_count', 'total_count'] + return ship_count + +# Function to calculate the combination count and total count for a group +def calculate_group_count(group): + comb_count = group['thing_property'].nunique() + total_count = group['thing_property'].size + return comb_count, total_count + +# Function to calculate the increase in combination count when a ship is added to a group +def calculate_comb_count_increase(groups, g, ship_idx, mdm): + temp_groups = defaultdict(list, {k: v.copy() for k, v in groups.items()}) + temp_groups[g].append(ship_idx) + + group_ships = temp_groups[g] + group_data = mdm[mdm['ships_idx'].isin(group_ships)] + + new_comb_count, _ = calculate_group_count(group_data) + + current_group_data = mdm[mdm['ships_idx'].isin(groups[g])] + current_comb_count, _ = calculate_group_count(current_group_data) + + increase = new_comb_count - current_comb_count + + return increase + +# Function to calculate the increase in total count when a ship is added to a group +def calculate_total_count_increase(groups, g, ship_idx, mdm): + temp_groups = defaultdict(list, {k: v.copy() for k, v in groups.items()}) + temp_groups[g].append(ship_idx) + + group_ships = temp_groups[g] + 
group_data = mdm[mdm['ships_idx'].isin(group_ships)] + + _, new_total_count = calculate_group_count(group_data) + + current_group_data = mdm[mdm['ships_idx'].isin(groups[g])] + _, current_total_count = calculate_group_count(current_group_data) + + increase = new_total_count - current_total_count + + return increase + +# Function to find the ship that will bring the total count closest to the target +def find_closest_total_count_ship(groups, g, remaining_ships, mdm, target_total_count): + total_count_differences = [] + + current_group_data = mdm[mdm['ships_idx'].isin(groups[g])] + _, current_total_count = calculate_group_count(current_group_data) + + for ship_idx in remaining_ships: + increase = calculate_total_count_increase(groups, g, ship_idx, mdm) + new_total_count = current_total_count + increase + difference = abs(target_total_count - new_total_count) + total_count_differences.append((ship_idx, difference, increase)) + + if not total_count_differences: + return None, 0 + + closest_ship = min(total_count_differences, key=lambda x: x[1]) + selected_ship_idx, _, selected_increase = closest_ship + + return selected_ship_idx, selected_increase + +# Function to find the ship that gives the maximum increase in combination count +def find_max_increase_ship(groups, g, remaining_ships, mdm): + comb_count_increase = [] + + for ship_idx in remaining_ships: + increase = calculate_comb_count_increase(groups, g, ship_idx, mdm) + comb_count_increase.append((ship_idx, increase)) + + max_increase_ship = max(comb_count_increase, key=lambda x: x[1]) + selected_ship_idx, max_increase = max_increase_ship + + return selected_ship_idx, max_increase + +# Function to find the ship that will bring the combination count closest to the target +def find_closest_comb_count_ship(groups, g, remaining_ships, mdm, target_comb_count): + comb_count_differences = [] + + current_group_data = mdm[mdm['ships_idx'].isin(groups[g])] + current_comb_count, _ = calculate_group_count(current_group_data) + + for ship_idx in remaining_ships: + increase = calculate_comb_count_increase(groups, g, ship_idx, mdm) + new_comb_count = current_comb_count + increase + difference = abs(target_comb_count - new_comb_count) + comb_count_differences.append((ship_idx, difference, increase)) + + if not comb_count_differences: + return None, 0 + + closest_ship = min(comb_count_differences, key=lambda x: x[1]) + selected_ship_idx, _, selected_increase = closest_ship + + return selected_ship_idx, selected_increase + +# Function to find the group with the maximum combination count +def find_group_with_max_comb_count(groups, mdm): + max_comb_count = -1 + max_group_idx = -1 + + for g in range(len(groups)): + group_ships = groups[g] + group_data = mdm[mdm['ships_idx'].isin(group_ships)] + comb_count, _ = calculate_group_count(group_data) + + if comb_count > max_comb_count: + max_comb_count = comb_count + max_group_idx = g + + return max_group_idx, max_comb_count + +# Function to find the group with the maximum total count +def find_group_with_max_total_count(groups, mdm): + max_total_count = -1 + max_group_idx = -1 + + for g in range(len(groups)): + group_ships = groups[g] + group_data = mdm[mdm['ships_idx'].isin(group_ships)] + _, total_count = calculate_group_count(group_data) + + if total_count > max_total_count: + max_total_count = total_count + max_group_idx = g + + return max_group_idx, max_total_count + +import pandas as pd +from collections import defaultdict + +# Load the CSV file +data_file_path = 'exports/preprocessed_data.csv' +data = 
pd.read_csv(data_file_path) + +# Filter the data where MDM is True +mdm_true = data[data['MDM'] == True].copy() # use .copy() to explicitly create a copy +mdm_all = data.copy() + +# Create a new column combining 'thing' and 'property' +mdm_true.loc[:, 'thing_property'] = mdm_true['thing'] + '_' + mdm_true['property'] +mdm_all.loc[:, 'thing_property'] = mdm_all['thing'] + '_' + mdm_all['property'] + +# Initial setup for groups +ship_count = calculate_ship_count(mdm_true) +num_groups = 5 +groups = defaultdict(list) + +# Sort ships by combination count in descending order +sorted_ships = ship_count.sort_values(by='comb_count', ascending=False) + +# Assign the first 5 ships to the groups +for i in range(num_groups): + groups[i].append(sorted_ships.iloc[i]['ships_idx']) + +remaining_ships = sorted_ships.iloc[num_groups:]['ships_idx'].values + +# Allocate remaining ships to the groups +while len(remaining_ships) > 0: + group_comb_counts = [] + for g in range(num_groups): + group_ships = groups[g] + group_data = mdm_true[mdm_true['ships_idx'].isin(group_ships)] + comb_count, _ = calculate_group_count(group_data) + group_comb_counts.append((g, comb_count)) + + group_comb_counts.sort(key=lambda x: x[1]) + + remaining_group = [] + for g, _ in group_comb_counts: + if len(remaining_ships) == 0: + break + + if group_comb_counts.index((g, _)) == 0: + selected_ship_idx, comb_increase = find_max_increase_ship(groups, g, remaining_ships, mdm_true) + + else: + max_group_idx, max_comb_count = find_group_with_max_comb_count(groups, mdm_true) + selected_ship_idx, comb_increase = find_closest_comb_count_ship(groups, g, remaining_ships, mdm_true, max_comb_count) + + if comb_increase == 0: + remaining_group.append(g) + else: + groups[g].append(selected_ship_idx) + remaining_ships = remaining_ships[remaining_ships != selected_ship_idx] + + for g in remaining_group: + if len(remaining_ships) == 0: + break + max_group_idx, max_total_count = find_group_with_max_total_count(groups, mdm_true) + selected_ship_idx, count_increase = find_closest_total_count_ship(groups, g, remaining_ships, mdm_true, max_total_count) + if selected_ship_idx is not None: + groups[g].append(selected_ship_idx) + remaining_ships = remaining_ships[remaining_ships != selected_ship_idx] + +# Calculate comb_count for each group and store it in a list +group_comb_counts = [] +for g in range(num_groups): + group_ships = groups[g] + group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)] + comb_count, total_count = calculate_group_count(group_data_true) + + # Calculate total count including MDM=False + group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)] + _, total_count_all = calculate_group_count(group_data_all) + + group_comb_counts.append((g, comb_count, total_count_all)) + +# Sort the groups by comb_count in descending order +group_comb_counts.sort(key=lambda x: x[1], reverse=True) + +# Reorder the groups dictionary based on the sorted order +sorted_groups = defaultdict(list) +for i, (g, _, _) in enumerate(group_comb_counts): + sorted_groups[i] = groups[g] + +# Final output of group allocation +print("Final Group Allocation:") +for g in range(num_groups): + group_ships = sorted_groups[g] + group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)] + comb_count, total_count = calculate_group_count(group_data_true) + + # Calculate total count including MDM=False + group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)] + _, total_count_all = calculate_group_count(group_data_all) + + print(f"Group {g + 1}: Ships_idx = 
{group_ships}, PD type = {comb_count}, PD = {total_count}, SD = {total_count_all}") + + +# %% +import pandas as pd +from sklearn.model_selection import GroupKFold + +# Prepare data for custom group allocation (BGKF) +comb_counts = [] +total_counts = [] +ship_counts = [] +custom_results = [] + +for g in range(num_groups): + group_ships = groups[g] + group_data_true = mdm_true[mdm_true['ships_idx'].isin(group_ships)] + comb_count, total_count = calculate_group_count(group_data_true) + + # Calculate total count including MDM=False + group_data_all = mdm_all[mdm_all['ships_idx'].isin(group_ships)] + _, total_count_all = calculate_group_count(group_data_all) + + custom_results.append({ + 'Group': g + 1, + 'Allocation': 'BGKF', + 'Comb_count': comb_count, + 'Total_count': total_count, + 'Total_count_all': total_count_all, + 'Ship_count': len(group_ships), + 'Ships_idx': list(group_ships) + }) + +# Sort the custom group allocation by comb_count in descending order +custom_results.sort(key=lambda x: x['Comb_count'], reverse=True) + +# Adjust group numbers after sorting +for i, result in enumerate(custom_results): + result['Group'] = i + 1 + +# Prepare data for GroupKFold allocation (GKF) +gkf = GroupKFold(n_splits=5) +gkf_results = [] + +for i, (train_idx, test_idx) in enumerate(gkf.split(mdm_true, groups=mdm_true['ships_idx'])): + test_group = mdm_true.iloc[test_idx] + comb_count, total_count = calculate_group_count(test_group) + + # Calculate total count including MDM=False + test_group_ships = test_group['ships_idx'].unique() + test_group_all = mdm_all[mdm_all['ships_idx'].isin(test_group_ships)] + _, total_count_all = calculate_group_count(test_group_all) + + gkf_results.append({ + 'Group': i + 1, + 'Allocation': 'GKF', + 'Comb_count': comb_count, + 'Total_count': total_count, + 'Total_count_all': total_count_all, + 'Ship_count': test_group['ships_idx'].nunique(), + 'Ships_idx': list(test_group['ships_idx'].unique()) + }) + +# Sort the GKF allocation by comb_count in descending order +gkf_results.sort(key=lambda x: x['Comb_count'], reverse=True) + +# Adjust group numbers after sorting +for i, result in enumerate(gkf_results): + result['Group'] = i + 1 + +# Combine BGKF and GKF results into one DataFrame +combined_results = custom_results + gkf_results +combined_df = pd.DataFrame(combined_results) + +# Output the combined results to a single CSV file +combined_df.to_csv('exports/combined_group_allocation.csv', index=False) + +print("CSV file has been generated: 'combined_group_allocation.csv'") + + +# %% +import os +import pandas as pd +from sklearn.model_selection import KFold + +def save_datasets_for_group(groups, mdm, data, output_dir='exports/dataset', n_splits=4): + for i in range(len(groups)): + group_folder = os.path.join(output_dir, 'group' + '_' + str(i + 1)) + os.makedirs(group_folder, exist_ok=True) + + # Create the test dataset by including only group i + test_group_ships = groups[i] + test_data = mdm[mdm['ships_idx'].isin(test_group_ships)] + + # Extract corresponding entries from the external test dataset + test_all_data = data[data['ships_idx'].isin(test_group_ships)] + + # Create the train dataset by excluding group i + train_group_ships = [] + for g in range(len(groups)): + if g != i: + train_group_ships.extend(groups[g]) + train_data = mdm[mdm['ships_idx'].isin(train_group_ships)] + + # Use KFold to split train_data into train and valid datasets + kf_inner = KFold(n_splits=n_splits, shuffle=True, random_state=42) + train_idx_inner, valid_idx_inner = 
next(kf_inner.split(train_data)) + + final_train_data = train_data.iloc[train_idx_inner] + valid_data = train_data.iloc[valid_idx_inner] + + # Combine train and valid data to create train_all + train_all_data = pd.concat([final_train_data, valid_data]) + + # Save datasets to CSV files + train_file_path = os.path.join(group_folder, 'train.csv') + valid_file_path = os.path.join(group_folder, 'valid.csv') + test_file_path = os.path.join(group_folder, 'test.csv') + test_all_file_path = os.path.join(group_folder, 'test_all.csv') + train_all_file_path = os.path.join(group_folder, 'train_all.csv') + + final_train_data.to_csv(train_file_path, index=False, encoding='utf-8-sig') + valid_data.to_csv(valid_file_path, index=False, encoding='utf-8-sig') + # test_data.to_csv(test_file_path, index=False, encoding='utf-8-sig') + test_all_data.to_csv(test_file_path, index=False, encoding='utf-8-sig') + train_all_data.to_csv(train_all_file_path, index=False, encoding='utf-8-sig') + + print(f"Group {i + 1} datasets saved in {group_folder}") + +# Example usage: +save_datasets_for_group(groups, mdm_true, data, output_dir='exports/dataset', n_splits=4) +
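For illustration, here is a minimal standalone sketch of the core transformation that the rule_base_replacement scripts above apply to a tag_description: the number-separation regexes mirror process_token() in separate_number.py, and the token-level abbreviation mapping mirrors replace_tokens() in replacement.py. The sample string and the trimmed replacement map are made up for this example and are not taken from the dataset.

import re

# Trimmed-down abbreviation map; the full map lives in replacement.py
initial_replacements = {"GE": "G/E", "FO": "F.O"}

def process_token(token):
    # Split '_' / '-' away from digits, then split letter-digit runs (as in separate_number.py)
    token = re.sub(r'(_|-)(?=\d)', ' ', token)
    token = re.sub(r'(?<=\d)(_|-)', ' ', token)
    token = re.sub(r'([A-Za-z])(\d+)', r'\1 \2', token)
    token = re.sub(r'(\d+)([A-Za-z])', r'\1 \2', token)
    return re.sub(r'\s+', ' ', token).strip()

def replace_tokens(description, replacements):
    # Token-level replacement, as in replacement.py
    return ' '.join(replacements.get(tok, tok) for tok in description.split())

sample = "NO1 GE FO INLET TEMP"  # made-up tag_description
separated = ' '.join(process_token(tok) for tok in sample.split())
print(replace_tokens(separated, initial_replacements))  # -> NO 1 G/E F.O INLET TEMP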