# hipom_data_mapping/data_preprocess/rule_base_replacement/replacement.py

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---

# %%
import pandas as pd
import re

# Read the data_mapping CSV; dtype=str makes pandas load every column as text.
data_mapping_file_path = 'outputs/raw_data_s.csv'  # Adjust this path to your actual file location
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

# Clean 'tag_description' in one chain: missing values become empty strings,
# everything is coerced to str, and each hyphen is replaced by a space.
data_mapping['tag_description'] = (
    data_mapping['tag_description']
    .fillna('')
    .astype(str)
    .str.replace(r'[-]', ' ', regex=True)
)

# Initial replacement mapping (token level): every alias listed on the left is
# rewritten to the canonical abbreviation its group maps to.
initial_replacements = {
    **dict.fromkeys(("MGE", "GEN.", "GEN", "GE", "G_E"), "G/E"),
    **dict.fromkeys(("ME", "M_E"), "M/E"),
    "S_G": "S/G",
    **dict.fromkeys(("T_C", "TC"), "T/C"),
    **dict.fromkeys(("L_O", "LO"), "L.O"),
    **dict.fromkeys(("F_O", "FO"), "F.O"),
    **dict.fromkeys(("D_G", "DG"), "D/G"),
    "PP": "P/P",
}

# Second replacement mapping (substring level): an underscore directly before
# or after a canonical abbreviation is turned into a space. Each row holds the
# leading-underscore and trailing-underscore rule for one abbreviation.
second_replacements = {
    "_G/E": " G/E", "G/E_": "G/E ",
    "_M/E": " M/E", "M/E_": "M/E ",
    "_S/G": " S/G", "S/G_": "S/G ",
    "_T/C": " T/C", "T/C_": "T/C ",
    "_L.O": " L.O", "L.O_": "L.O ",
    "_F.O": " F.O", "F.O_": "F.O ",
    "_D/G": " D/G", "D/G_": "D/G ",
    # NOTE(review): unlike the rows above, this rule keeps the bare "DG"
    # spelling instead of producing "D/G " - confirm this is intended.
    "DG_": "DG ",
}

# Zero-width boundary between a digit and an adjacent character that is neither
# a digit nor whitespace (in either order). Excluding whitespace keeps text that
# is already separated, e.g. "NO 1", from gaining a second space.
_DIGIT_BOUNDARY = re.compile(r'(?<=[^\d\s])(?=\d)|(?<=\d)(?=[^\d\s])')


def separate_numbers_from_text(description):
    """Insert a space wherever digits touch a non-digit, non-space character.

    Args:
        description: The text to split, e.g. ``"PUMP1A"``.

    Returns:
        The text with a single space inserted at every digit/non-digit
        boundary, e.g. ``"PUMP 1 A"``. Boundaries that already contain
        whitespace are left untouched. Punctuation counts as a non-digit,
        so ``"1.5"`` becomes ``"1 . 5"``.
    """
    return _DIGIT_BOUNDARY.sub(' ', description)

def replace_tokens(description, replacements):
    """Swap whole whitespace-delimited tokens using a lookup table.

    Args:
        description: The text to process.
        replacements: Mapping from an exact token to its replacement.

    Returns:
        The tokens joined by single spaces, with every token that appears as
        a key in ``replacements`` substituted by its value. Because the text
        is split on whitespace and re-joined, runs of whitespace collapse to
        one space and leading/trailing whitespace is dropped.
    """
    return ' '.join(
        replacements.get(word, word) for word in description.split()
    )

def replace_substrings(description, replacements):
    """Apply a series of plain substring replacements to a text.

    Args:
        description: The text to process.
        replacements: Mapping from a substring to the text that replaces it.

    Returns:
        The text after every replacement has been applied. The rules run one
        after another in the mapping's iteration order, so a later rule sees
        the output of the earlier ones.
    """
    result = description
    for needle, substitute in replacements.items():
        result = result.replace(needle, substitute)
    return result

# Rewrite 'tag_description' in three passes, applied in this order:
#   1. split digits from adjacent text,
#   2. replace whole tokens using initial_replacements,
#   3. replace substrings using second_replacements.
data_mapping['tag_description'] = (
    data_mapping['tag_description']
    .apply(separate_numbers_from_text)
    .apply(replace_tokens, replacements=initial_replacements)
    .apply(replace_substrings, replacements=second_replacements)
)

# Write the result to a new CSV file without the index column; the
# 'utf-8-sig' codec prefixes the file with a UTF-8 byte-order mark.
output_file_path = '../exports/preprocessed_data.csv'
data_mapping.to_csv(output_file_path, encoding='utf-8-sig', index=False)

print(f"Updated data saved to {output_file_path}")


# %%