79 lines
2.8 KiB
Python
|
# ---
|
||
|
# jupyter:
|
||
|
# jupytext:
|
||
|
# formats: ipynb,py:percent
|
||
|
# text_representation:
|
||
|
# extension: .py
|
||
|
# format_name: percent
|
||
|
# format_version: '1.3'
|
||
|
# jupytext_version: 1.16.4
|
||
|
# kernelspec:
|
||
|
# display_name: torch
|
||
|
# language: python
|
||
|
# name: python3
|
||
|
# ---
|
||
|
|
||
|
# %%
|
||
|
import pandas as pd
|
||
|
import re
|
||
|
|
||
|
# Load the data_mapping CSV file.
# FIX: the original assigned data_mapping_file_path twice in a row; the first
# value ('../../data_import/exports/raw_data.csv') was dead code, immediately
# shadowed by the second. It is preserved as a comment for reference.
# data_mapping_file_path = '../../data_import/exports/raw_data.csv'
data_mapping_file_path = 'outputs/raw_data_add_tag.csv'  # Adjust this path to your actual file location

# dtype=str reads every column as text, so tag codes keep leading zeros and
# mixed columns are not silently coerced to numeric dtypes.
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

# Backup the original tag_description before any normalization below.
data_mapping['org_tag_description'] = data_mapping['tag_description']

# Ensure all values in the 'tag_description' column are strings (NaN -> '').
data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)
# Replace parentheses with spaces so they don't glue adjacent tokens together.
data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[()]', ' ', regex=True)
# Helper: pick out the whitespace-separated tokens that contain a digit.
def find_tokens_with_numbers(description):
    """Return the whitespace-delimited tokens of *description* that contain
    at least one decimal digit, in their original order."""
    has_digit = re.compile(r'\d').search  # hoist the pattern lookup
    return [tok for tok in description.split() if has_digit(tok)]
# Helper: normalize a single digit-bearing token.
def process_token(token):
    """Normalize one token that contains digits.

    Separators ('_' / '-') adjacent to digits become spaces, letter/digit
    runs are split apart, 'NO.1'-style abbreviations gain a space after the
    dot, and runs of whitespace are collapsed to single spaces.
    """
    # Ordered (pattern, replacement) rewrites — order matters, so they are
    # applied one after another exactly as in the step-by-step version.
    rewrites = (
        # Step 1: '_' or '-' adjacent to a digit -> space.
        (r'(_|-)(?=\d)', ' '),
        (r'(?<=\d)(_|-)', ' '),
        # Step 2: insert a space between letters and digits with no separator.
        (r'([A-Za-z])(\d+)', r'\1 \2'),
        (r'(\d+)([A-Za-z])', r'\1 \2'),
        # Step 3: "NO.1" / "No.1" -> "NO. 1" / "No. 1".
        (r'([A-Za-z]+)\.(\d+)', r'\1. \2'),
        # Collapse multiple spaces.
        (r'\s+', ' '),
    )
    for pattern, replacement in rewrites:
        token = re.sub(pattern, replacement, token)
    return token.strip()
# Apply the normalization to every row of 'tag_description'.
# FIX: the original used DataFrame.iterrows() + .at, which builds a Series
# copy per row; a column-wise Series.apply does the same per-row work faster
# and more idiomatically, with identical results.
def _normalize_description(description):
    """Rewrite *description* by normalizing each digit-bearing token.

    Tokens are collected once from the original string, then each is
    replaced by its processed form in sequence.

    NOTE(review): str.replace swaps *every* occurrence of a token's text,
    so a token that is also a substring of another token is rewritten
    there too — this mirrors the original loop's behavior on purpose.
    """
    for token in find_tokens_with_numbers(description):
        description = description.replace(token, process_token(token))
    return description

data_mapping['tag_description'] = data_mapping['tag_description'].apply(_normalize_description)
|
# Write the updated data_mapping out to a fresh CSV.
# utf-8-sig adds a BOM so spreadsheet tools auto-detect the encoding.
output_file_path = 'outputs/raw_data_s.csv'
data_mapping.to_csv(output_file_path, encoding='utf-8-sig', index=False)

# Report where the result landed.
print("Updated data saved to " + output_file_path)


# %%