# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---

# %%
import pandas as pd
import re

# Load the data_mapping CSV file
# data_mapping_file_path = '../../data_import/exports/raw_data.csv'  # Alternative source location
data_mapping_file_path = 'outputs/raw_data_add_tag.csv'  # Adjust this path to your actual file location
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

# Back up the original tag_description before modifying it
data_mapping['org_tag_description'] = data_mapping['tag_description']

# Ensure all values in the 'tag_description' column are strings
data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)

# Replace parentheses with spaces so they do not stay glued to tokens
data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[()]', ' ', regex=True)


# Return the whitespace-separated tokens of a description that contain a digit
def find_tokens_with_numbers(description):
    tokens = description.split()  # Tokenize by spaces
    number_tokens = [token for token in tokens if re.search(r'\d', token)]
    return number_tokens


# Normalize a single token that contains digits
def process_token(token):
    # Step 1: Replace '_' or '-' adjacent to numbers with spaces
    token = re.sub(r'(_|-)(?=\d)', ' ', token)
    token = re.sub(r'(?<=\d)(_|-)', ' ', token)
    # Step 2: Insert spaces between letters and numbers where no separator exists
    token = re.sub(r'([A-Za-z])(\d+)', r'\1 \2', token)
    token = re.sub(r'(\d+)([A-Za-z])', r'\1 \2', token)
    # Step 3: Insert a space after a dotted prefix, e.g. "No.1" becomes "No. 1"
    token = re.sub(r'([A-Za-z]+)\.(\d+)', r'\1. \2', token)
    # Collapse repeated whitespace and strip
    token = re.sub(r'\s+', ' ', token).strip()
    return token


# Apply the process to each row in the 'tag_description' column
for index, row in data_mapping.iterrows():
    original_description = row['tag_description']
    number_tokens = find_tokens_with_numbers(original_description)

    # Process each token containing numbers
    processed_tokens = [process_token(token) for token in number_tokens]

    # Replace the original tokens with their processed forms in the description.
    # Note: str.replace substitutes every occurrence of the substring, so a
    # token that also appears inside another token is rewritten there too.
    new_description = original_description
    for original_token, processed_token in zip(number_tokens, processed_tokens):
        new_description = new_description.replace(original_token, processed_token)

    # Update the data_mapping with the modified description
    data_mapping.at[index, 'tag_description'] = new_description

# Save the updated data_mapping to a new CSV file
output_file_path = 'outputs/raw_data_s.csv'
data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')
print(f"Updated data saved to {output_file_path}")

# %%
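# Quick sanity check (illustrative only): the sample description below is a
# made-up string, not a row from the dataset, to show what the two helpers
# produce before they are applied to the full DataFrame.
sample_description = 'PUMP-1 NO.2 GEN3A TEMP_45'
for sample in find_tokens_with_numbers(sample_description):
    print(f'{sample!r} -> {process_token(sample)!r}')
# Expected:
# 'PUMP-1' -> 'PUMP 1'
# 'NO.2' -> 'NO. 2'
# 'GEN3A' -> 'GEN 3 A'
# 'TEMP_45' -> 'TEMP 45'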