hipom_data_mapping/data_preprocess/rule_base_replacement/separate_number.py

79 lines
2.8 KiB
Python
Raw Normal View History

# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.16.4
# kernelspec:
# display_name: torch
# language: python
# name: python3
# ---
# %%
import pandas as pd
import re

# Input: tag list produced by the add_tag step.
# (An earlier draft read straight from the raw import; kept for reference.)
# data_mapping_file_path = '../../data_import/exports/raw_data.csv'
data_mapping_file_path = 'outputs/raw_data_add_tag.csv'  # Adjust this path to your actual file location

# dtype=str keeps identifiers/codes from being coerced to numbers.
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

# Backup the original tag_description before any normalization.
data_mapping['org_tag_description'] = data_mapping['tag_description']

# Ensure every value is a string (NaN -> ''), then treat parentheses as
# plain separators so tokenization by whitespace works downstream.
data_mapping['tag_description'] = (
    data_mapping['tag_description']
    .fillna('')
    .astype(str)
    .str.replace(r'[()]', ' ', regex=True)
)
def find_tokens_with_numbers(description):
    """Return the whitespace-separated tokens of *description* that contain
    at least one digit, in their original order."""
    has_digit = re.compile(r'\d')
    return [tok for tok in description.split() if has_digit.search(tok)]
def process_token(token):
    """Insert spaces between the numeric and alphabetic parts of *token*.

    Applies, in order:
      1. '_' or '-' touching a digit becomes a space;
      2. a space is inserted between a letter run and a digit run;
      3. "NO.1"-style tokens become "NO. 1";
    then collapses repeated whitespace and strips the ends.
    """
    rules = (
        (r'(_|-)(?=\d)', ' '),
        (r'(?<=\d)(_|-)', ' '),
        (r'([A-Za-z])(\d+)', r'\1 \2'),
        (r'(\d+)([A-Za-z])', r'\1 \2'),
        (r'([A-Za-z]+)\.(\d+)', r'\1. \2'),
    )
    for pattern, replacement in rules:
        token = re.sub(pattern, replacement, token)
    return re.sub(r'\s+', ' ', token).strip()
# Apply the separation to each row of 'tag_description'.
def _separate_numbers_in_description(description):
    """Rewrite every number-bearing token of *description* into its
    space-separated form produced by process_token()."""
    result = description
    for token in find_tokens_with_numbers(description):
        # NOTE(review): str.replace is substring-based, so a token that is a
        # prefix of another (e.g. 'NO.1' inside 'NO.10') is rewritten inside
        # the longer token too. For these separation rules the two rewrites
        # coincide, but confirm on new data if the rules change.
        result = result.replace(token, process_token(token))
    return result

# Series.apply replaces the original iterrows()/.at mutation loop —
# same result per row, without per-row DataFrame indexing overhead.
data_mapping['tag_description'] = data_mapping['tag_description'].apply(
    _separate_numbers_in_description
)

# Save the updated data_mapping to a new CSV file.
# utf-8-sig adds a BOM so Excel opens the file with the right encoding.
output_file_path = 'outputs/raw_data_s.csv'
data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')
print(f"Updated data saved to {output_file_path}")
# %%