hipom_data_mapping/data_preprocess/rule_base_replacement/separate_number.py

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---

# %%
import pandas as pd
import re

# Input CSV location. NOTE: the second assignment overrides the first, so the
# file actually read is 'outputs/raw_data_add_tag.csv'; the first path is kept
# only as an alternative source to switch back to.
data_mapping_file_path = '../../data_import/exports/raw_data.csv'  # Adjust this path to your actual file location
data_mapping_file_path = 'outputs/raw_data_add_tag.csv'  # Adjust this path to your actual file location

# Read every column as text so nothing is coerced to a numeric dtype.
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

# Keep an untouched copy of the description before it is rewritten below.
data_mapping['org_tag_description'] = data_mapping['tag_description']

# Normalise the working column in one pass: missing values become empty
# strings, everything is cast to str, and parentheses are turned into spaces.
data_mapping['tag_description'] = (
    data_mapping['tag_description']
    .fillna('')
    .astype(str)
    .str.replace(r'[()]', ' ', regex=True)
)

# Function to find tokens containing numbers
def find_tokens_with_numbers(description):
    """Return the whitespace-separated tokens of *description* that contain a digit.

    Tokens are returned in their original order; duplicates are kept. An empty
    or digit-free description yields an empty list.
    """
    contains_digit = re.compile(r'\d').search
    # A successful search returns a (truthy) match object, None otherwise.
    return list(filter(contains_digit, description.split()))

# Function to process tokens
def process_token(token):
    """Separate the numeric parts of *token* from the surrounding text.

    The rewrite rules run in a fixed order, each on the output of the
    previous one:

    1. '_' or '-' directly before a digit becomes a space.
    2. '_' or '-' directly after a digit becomes a space.
    3. A letter immediately followed by digits gets a space in between.
    4. Digits immediately followed by a letter get a space in between.
    5. "<letters>.<digits>" (e.g. "NO.1") becomes "<letters>. <digits>".
    6. Runs of whitespace collapse to a single space.

    The result is stripped of leading/trailing whitespace before returning.
    """
    ordered_rules = (
        (r'(_|-)(?=\d)', ' '),
        (r'(?<=\d)(_|-)', ' '),
        (r'([A-Za-z])(\d+)', r'\1 \2'),
        (r'(\d+)([A-Za-z])', r'\1 \2'),
        (r'([A-Za-z]+)\.(\d+)', r'\1. \2'),
        (r'\s+', ' '),
    )
    text = token
    for pattern, replacement in ordered_rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Apply the process to each row in the 'tag_description' column.
#
# Descriptions are rewritten token by token rather than with str.replace on the
# whole description. str.replace substitutes *every* occurrence of a token,
# including occurrences inside longer tokens, which then no longer match their
# own rewrite: "A1 A1B" used to become "A 1 A 1B" instead of "A 1 A 1 B".
_TOKEN_PATTERN = re.compile(r'\S+')  # one whitespace-delimited token
_DIGIT_PATTERN = re.compile(r'\d')


def _separate_numbers_in_description(description):
    """Return *description* with each digit-bearing token passed through process_token.

    Tokens are maximal runs of non-whitespace characters. Tokens without a
    digit and all whitespace between tokens are left exactly as they were.
    """
    def _rewrite(match):
        token = match.group()
        if _DIGIT_PATTERN.search(token):
            return process_token(token)
        return token

    return _TOKEN_PATTERN.sub(_rewrite, description)


# Element-wise rewrite of the whole column; values are already plain strings
# at this point (missing values were replaced by '' when the data was loaded).
data_mapping['tag_description'] = data_mapping['tag_description'].map(
    _separate_numbers_in_description
)

# Save the updated data_mapping to a new CSV file
output_file_path = 'outputs/raw_data_s.csv'
data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')

print(f"Updated data saved to {output_file_path}")


# %%