hipom_data_mapping/data_preprocess/rule_base_replacement/replacement.py

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---

# %%
import pandas as pd
import re

# Read the raw mapping table; dtype=str makes pandas load every column as text.
data_mapping_file_path = 'outputs/raw_data_s.csv'  # Adjust this path to your actual file location
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

# Normalise 'tag_description' in a single pass: missing values become empty
# strings, everything is coerced to str, and hyphens are turned into spaces.
data_mapping['tag_description'] = (
    data_mapping['tag_description']
    .fillna('')
    .astype(str)
    .str.replace(r'[-]', ' ', regex=True)
)

# Initial replacement mapping: alias token -> canonical abbreviation.
# Written as (canonical, aliases) groups and flattened, so every spelling that
# collapses to the same canonical form sits on one line. Insertion order of
# the resulting dict follows the order of the groups and aliases below.
initial_replacements = {
    alias: canonical
    for canonical, aliases in (
        ("G/E", ("MGE", "GEN.", "GEN", "GE", "G_E")),
        ("M/E", ("ME", "M_E")),
        ("S/G", ("S_G",)),
        ("T/C", ("T_C", "TC")),
        ("L.O", ("L_O", "LO")),
        ("F.O", ("F_O", "FO")),
        ("D/G", ("D_G", "DG")),
        ("P/P", ("PP",)),
    )
    for alias in aliases
}

# Second replacement mapping: substring -> same substring with the underscore
# turned into a space. Every value is derived from its key, so only the keys
# are listed; their order is the order in which the dict will be iterated.
# NOTE(review): "DG_" becomes "DG " (not "D/G "), unlike the other entries,
# which all use the slashed/dotted form - confirm this is intended.
second_replacements = {
    pattern: pattern.replace("_", " ")
    for pattern in (
        "_G/E", "G/E_",
        "_M/E", "M/E_",
        "_S/G", "S/G_",
        "_T/C", "T/C_",
        "_L.O", "L.O_",
        "_F.O", "F.O_",
        "_D/G", "D/G_",
        "DG_",
    )
}

# Function to separate numbers from text in a token
def separate_numbers_from_text(description):
    """Insert a space at every boundary between a digit run and adjacent text.

    "Text" means any character that is neither a digit nor whitespace, so
    punctuation is split off as well: ``"1.5"`` becomes ``"1 . 5"`` and
    ``"GE_1"`` becomes ``"GE_ 1"``.

    A boundary that is already separated by whitespace is left untouched, so
    ``"NO 1 PUMP"`` is returned unchanged rather than gaining doubled spaces.

    Args:
        description: The string to process.

    Returns:
        The string with a single space inserted between each digit run and
        any directly adjacent non-digit, non-whitespace character.
    """
    # Zero-width lookarounds match the position *between* two characters, so
    # nothing is consumed and both directions (text->digit, digit->text) are
    # handled in one pass. Excluding \s on the non-digit side is what prevents
    # a second space from being added next to existing whitespace.
    return re.sub(r'(?<=[^\d\s])(?=\d)|(?<=\d)(?=[^\d\s])', ' ', description)

# Function to perform replacements using tokens
def replace_tokens(description, replacements):
    """Replace whole whitespace-delimited tokens using a lookup table.

    The description is split on any run of whitespace; each token that is a
    key of ``replacements`` is swapped for its value, all other tokens are
    kept as they are. The tokens are then re-joined with single spaces, so
    leading, trailing and repeated whitespace is collapsed.

    Args:
        description: The string to process.
        replacements: Mapping of exact token -> replacement text.

    Returns:
        The rebuilt string with matching tokens replaced.
    """
    return ' '.join(replacements.get(word, word) for word in description.split())

# Function to perform replacements for substrings
def replace_substrings(description, replacements):
    """Apply a series of plain substring replacements, one after another.

    The replacements are applied in the iteration order of ``replacements``,
    and each one operates on the output of the previous one, so a later entry
    can match text produced by an earlier entry.

    Args:
        description: The string to process.
        replacements: Mapping of substring -> replacement text.

    Returns:
        The string after every replacement has been applied.
    """
    result = description
    for needle in replacements:
        result = result.replace(needle, replacements[needle])
    return result

# Run the three clean-up passes over 'tag_description' in order:
#   1. split digit runs away from adjacent text,
#   2. replace whole tokens via initial_replacements,
#   3. replace substrings via second_replacements.
data_mapping['tag_description'] = (
    data_mapping['tag_description']
    .apply(separate_numbers_from_text)
    .apply(replace_tokens, replacements=initial_replacements)
    .apply(replace_substrings, replacements=second_replacements)
)

# Write the result to a new CSV file (no index column, UTF-8 with BOM).
output_file_path = '../exports/preprocessed_data.csv'
data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')

print(f"Updated data saved to {output_file_path}")


# %%
Chore: changed ipynb to py files in the data_preprocess folder Doc: added descriptions and instructions for the data_preprocess folder 2024-10-29 22:55:22 +09:00			`# ---`
			`# jupyter:`
			`# jupytext:`
			`# formats: ipynb,py:percent`
			`# text_representation:`
			`# extension: .py`
			`# format_name: percent`
			`# format_version: '1.3'`
			`# jupytext_version: 1.16.4`
			`# kernelspec:`
			`# display_name: torch`
			`# language: python`
			`# name: python3`
			`# ---`

			`# %%`
			`import pandas as pd`
			`import re`

			`# Load the data_mapping CSV file`


			`data_mapping_file_path = 'outputs/raw_data_s.csv' # Adjust this path to your actual file location`
			`data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)`

			`# Ensure all values in the 'tag_description' column are strings`
			`data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)`
			`data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[-]', ' ', regex=True)`

			`# Initial replacement mapping`
			`initial_replacements = {`
			`"MGE": "G/E",`
			`"GEN.": "G/E",`
			`"GEN": "G/E",`
			`"GE": "G/E",`
			`"G_E": "G/E",`
			`"ME": "M/E",`
			`"M_E": "M/E",`
			`"S_G": "S/G",`
			`"T_C": "T/C",`
			`"TC": "T/C",`
			`"L_O": "L.O",`
			`"LO": "L.O",`
			`"F_O": "F.O",`
			`"FO": "F.O",`
			`"D_G": "D/G",`
			`"DG": "D/G",`
			`"PP": "P/P"`
			`}`

			`# Second replacement mapping`
			`second_replacements = {`
			`"_G/E": " G/E",`
			`"G/E_": "G/E ",`
			`"_M/E": " M/E",`
			`"M/E_": "M/E ",`
			`"_S/G": " S/G",`
			`"S/G_": "S/G ",`
			`"_T/C": " T/C",`
			`"T/C_": "T/C ",`
			`"_L.O": " L.O",`
			`"L.O_": "L.O ",`
			`"_F.O": " F.O",`
			`"F.O_": "F.O ",`
			`"_D/G": " D/G",`
			`"D/G_": "D/G ",`
			`"DG_": "DG "`
			`}`

			`# Function to separate numbers from text in a token`
			`def separate_numbers_from_text(description):`
			`# This regex pattern finds occurrences where text is followed by numbers or vice versa`
			`return re.sub(r'(\d+)(\D)', r'\1 \2', re.sub(r'(\D)(\d+)', r'\1 \2', description))`

			`# Function to perform replacements using tokens`
			`def replace_tokens(description, replacements):`
			`tokens = description.split() # Tokenize by spaces`
			`tokens = [replacements.get(token, token) for token in tokens] # Replace based on the dictionary`
			`return ' '.join(tokens)`

			`# Function to perform replacements for substrings`
			`def replace_substrings(description, replacements):`
			`for old, new in replacements.items():`
			`description = description.replace(old, new)`
			`return description`

			`# Separate numbers from text before applying replacements`
			`data_mapping['tag_description'] = data_mapping['tag_description'].apply(separate_numbers_from_text)`

			`# Apply initial replacements`
			`data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_tokens, replacements=initial_replacements)`

			`# Apply second replacements as substrings`
			`data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_substrings, replacements=second_replacements)`

			`# Save the updated data_mapping to a new CSV file`
			`output_file_path = '../exports/preprocessed_data.csv'`
			`data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')`

			`print(f"Updated data saved to {output_file_path}")`


			`# %%`