hipom_data_mapping/data_preprocess/rule_base_replacement/replacement.py

104 lines
2.9 KiB
Python

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---
# %%
import pandas as pd
import re
# Read the raw mapping table; dtype=str keeps every column as text.
data_mapping_file_path = 'outputs/raw_data_s.csv'  # Adjust this path to your actual file location
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

# Normalise 'tag_description' in one pass: missing values become empty
# strings, everything is coerced to str, and hyphens are turned into spaces.
data_mapping['tag_description'] = (
    data_mapping['tag_description']
    .fillna('')
    .astype(str)
    .str.replace(r'[-]', ' ', regex=True)
)
# Whole-token replacements, grouped by the canonical form each alias maps to.
# Flattened below into the alias -> canonical lookup used by replace_tokens.
_ALIASES_BY_CANONICAL_FORM = {
    "G/E": ("MGE", "GEN.", "GEN", "GE", "G_E"),
    "M/E": ("ME", "M_E"),
    "S/G": ("S_G",),
    "T/C": ("T_C", "TC"),
    "L.O": ("L_O", "LO"),
    "F.O": ("F_O", "FO"),
    "D/G": ("D_G", "DG"),
    "P/P": ("PP",),
}

# Initial replacement mapping (alias token -> canonical abbreviation).
initial_replacements = {
    alias: canonical
    for canonical, aliases in _ALIASES_BY_CANONICAL_FORM.items()
    for alias in aliases
}
# Second replacement mapping: substring rewrites that turn an underscore
# directly before or after a canonical abbreviation into a space.
# Insertion order is preserved, and replace_substrings applies the entries
# sequentially in that order.
# NOTE(review): the final entry rewrites "DG_" to "DG " (not "D/G "), unlike
# the other rows, which all involve the slash/dot forms -- confirm intended.
second_replacements = dict([
    ("_G/E", " G/E"), ("G/E_", "G/E "),
    ("_M/E", " M/E"), ("M/E_", "M/E "),
    ("_S/G", " S/G"), ("S/G_", "S/G "),
    ("_T/C", " T/C"), ("T/C_", "T/C "),
    ("_L.O", " L.O"), ("L.O_", "L.O "),
    ("_F.O", " F.O"), ("F.O_", "F.O "),
    ("_D/G", " D/G"), ("D/G_", "D/G "),
    ("DG_", "DG "),
])
# Zero-width boundary between a digit and an adjacent character that is
# neither a digit nor whitespace (in either order). Compiled once because the
# function is applied to every row of the table.
_DIGIT_TEXT_BOUNDARY = re.compile(r'(?<=[^\d\s])(?=\d)|(?<=\d)(?=[^\d\s])')


# Function to separate numbers from text in a token
def separate_numbers_from_text(description):
    """Insert a space wherever a run of digits touches non-digit text.

    A single space is added at every point where a digit is directly adjacent
    to a non-digit, non-whitespace character, e.g. ``"NO1PUMP"`` becomes
    ``"NO 1 PUMP"``. Boundaries that are already separated by whitespace are
    left alone, so ``"NO 1 PUMP"`` is returned unchanged instead of gaining
    doubled spaces.

    Note that punctuation counts as text: ``"NO.1"`` becomes ``"NO. 1"`` and
    ``"1.5"`` becomes ``"1 . 5"``.

    Args:
        description: The string to process.

    Returns:
        The string with spaces inserted at digit/text boundaries.
    """
    return _DIGIT_TEXT_BOUNDARY.sub(' ', description)
# Function to perform replacements using tokens
def replace_tokens(description, replacements):
    """Replace whole whitespace-delimited tokens via a lookup table.

    The description is split on runs of whitespace; each token that is a key
    of ``replacements`` is swapped for its mapped value, all other tokens are
    kept as they are. The tokens are then re-joined with single spaces, so
    leading, trailing and repeated whitespace is collapsed.

    Args:
        description: The string to process.
        replacements: Mapping of exact token -> replacement text.

    Returns:
        The rebuilt, single-space-separated string.
    """
    return ' '.join(
        replacements.get(word, word) for word in description.split()
    )
# Function to perform replacements for substrings
def replace_substrings(description, replacements):
    """Apply every substring replacement in ``replacements``, in order.

    Each key is replaced by its value everywhere it occurs. Replacements are
    applied one after another in the mapping's iteration order, so a later
    entry sees the text produced by the earlier ones.

    Args:
        description: The string to process.
        replacements: Mapping of substring -> replacement text.

    Returns:
        The string after all replacements have been applied.
    """
    result = description
    for needle in replacements:
        result = result.replace(needle, replacements[needle])
    return result
# Run the three clean-up stages over 'tag_description', in order:
#   1. split digits from adjacent text,
#   2. swap whole tokens using initial_replacements,
#   3. rewrite substrings using second_replacements.
data_mapping['tag_description'] = (
    data_mapping['tag_description']
    .apply(separate_numbers_from_text)
    .apply(replace_tokens, replacements=initial_replacements)
    .apply(replace_substrings, replacements=second_replacements)
)

# Write the cleaned table to a new CSV (UTF-8 with BOM, no index column).
output_file_path = '../exports/preprocessed_data.csv'
data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')
print(f"Updated data saved to {output_file_path}")
# %%