hipom_data_mapping/data_preprocess/rule_base_replacement/separate_number.py

79 lines
2.8 KiB
Python

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---
# %%
import pandas as pd
import re
# Load the data_mapping CSV file. Every column is read as a string (dtype=str).
# Alternative input (previously assigned here and then immediately overwritten,
# so it never took effect): '../../data_import/exports/raw_data.csv'
data_mapping_file_path = 'outputs/raw_data_add_tag.csv'  # Adjust this path to your actual file location
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

# Backup the original tag_description before any normalisation is applied,
# so the untouched value (including missing entries) stays available.
data_mapping['org_tag_description'] = data_mapping['tag_description']

# Normalise the working column: missing values become '', every value is a
# string, and parentheses are turned into spaces so they act as token breaks.
data_mapping['tag_description'] = (
    data_mapping['tag_description']
    .fillna('')
    .astype(str)
    .str.replace(r'[()]', ' ', regex=True)
)
def find_tokens_with_numbers(description):
    """Return the tokens of *description* that contain at least one digit.

    Args:
        description: Text to scan. It is split on runs of whitespace, so
            leading, trailing and repeated whitespace produce no empty tokens.

    Returns:
        A list of the digit-bearing tokens in their original order.
        Repeated tokens are kept; an empty or digit-free description
        yields an empty list.
    """
    return [token for token in description.split() if re.search(r'\d', token)]
# Patterns used by process_token, compiled once at import time because the
# function is called for every digit-bearing token in the data set.
# '_' or '-' directly before or directly after a digit.
_SEPARATOR_BESIDE_DIGIT = re.compile(r'[_-](?=\d)|(?<=\d)[_-]')
# Zero-width boundary between an ASCII letter and a digit (either order).
_LETTER_DIGIT_BOUNDARY = re.compile(r'(?<=[A-Za-z])(?=\d)|(?<=\d)(?=[A-Za-z])')
# A '.' that sits between an ASCII letter and a digit, as in "NO.1".
_DOT_BETWEEN_LETTER_AND_DIGIT = re.compile(r'(?<=[A-Za-z])\.(?=\d)')
_WHITESPACE_RUN = re.compile(r'\s+')


def process_token(token):
    """Separate the numeric parts of a single token with spaces.

    The steps are applied in this order:

    1. '_' or '-' adjacent to a digit becomes a space ("P-101" -> "P 101").
    2. A space is inserted wherever an ASCII letter touches a digit
       ("A1B" -> "A 1 B").
    3. A '.' between a letter and a digit gets a trailing space
       ("NO.1" -> "NO. 1").
    4. Runs of whitespace collapse to one space and the ends are stripped.

    Args:
        token: A single whitespace-free token.

    Returns:
        The rewritten token. A token without digits is returned unchanged.
        Note that a leading '-' before a digit is dropped ("-1" -> "1")
        because step 1 turns it into a space that step 4 strips.
    """
    token = _SEPARATOR_BESIDE_DIGIT.sub(' ', token)
    token = _LETTER_DIGIT_BOUNDARY.sub(' ', token)
    token = _DOT_BETWEEN_LETTER_AND_DIGIT.sub('. ', token)
    return _WHITESPACE_RUN.sub(' ', token).strip()
def _separate_numbers_in_description(description):
    """Apply process_token to every digit-bearing token of *description*.

    Each token is rewritten at its own position, so the whitespace between
    tokens is preserved and one token's rewrite can never touch another.
    A whole-string ``str.replace(token, processed)`` does not have that
    property: in "P-1 UP-1A" replacing "P-1" also rewrites the inside of
    "UP-1A", which is then no longer found and is left as "UP 1A".

    Args:
        description: The (string) tag description to rewrite.

    Returns:
        The description with every token that contains a digit replaced by
        its process_token() result; all other text is unchanged.
    """
    def _rewrite(match):
        token = match.group(0)
        # Only tokens containing a digit are candidates for separation.
        return process_token(token) if re.search(r'\d', token) else token

    return re.sub(r'\S+', _rewrite, description)


# Apply the process to each row in the 'tag_description' column
data_mapping['tag_description'] = data_mapping['tag_description'].map(
    _separate_numbers_in_description
)

# Save the updated data_mapping to a new CSV file
output_file_path = 'outputs/raw_data_s.csv'
data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')
print(f"Updated data saved to {output_file_path}")
# %%