104 lines
2.9 KiB
Python
104 lines
2.9 KiB
Python
|
# ---
|
||
|
# jupyter:
|
||
|
# jupytext:
|
||
|
# formats: ipynb,py:percent
|
||
|
# text_representation:
|
||
|
# extension: .py
|
||
|
# format_name: percent
|
||
|
# format_version: '1.3'
|
||
|
# jupytext_version: 1.16.4
|
||
|
# kernelspec:
|
||
|
# display_name: torch
|
||
|
# language: python
|
||
|
# name: python3
|
||
|
# ---
|
||
|
|
||
|
# %%
|
||
|
import pandas as pd
|
||
|
import re
|
||
|
|
||
|
# Load the data_mapping CSV file
|
||
|
|
||
|
|
||
|
# Location of the input CSV; adjust this path to your actual file location
data_mapping_file_path = 'outputs/raw_data_s.csv'

# Every column is read as text (dtype=str)
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

# Clean 'tag_description' in one pass: missing values become empty strings,
# everything is coerced to str, and each hyphen is replaced by a space
data_mapping['tag_description'] = (
    data_mapping['tag_description']
    .fillna('')
    .astype(str)
    .str.replace(r'[-]', ' ', regex=True)
)
|
||
|
|
||
|
# Initial replacement mapping
|
||
|
# Whole-token lookup table: each spelling variant maps to its canonical form.
# Built from (canonical, variants) groups so the variants that share a target
# are visible side by side; entry order matches the groups top to bottom.
initial_replacements = {
    variant: canonical
    for canonical, variants in (
        ("G/E", ("MGE", "GEN.", "GEN", "GE", "G_E")),
        ("M/E", ("ME", "M_E")),
        ("S/G", ("S_G",)),
        ("T/C", ("T_C", "TC")),
        ("L.O", ("L_O", "LO")),
        ("F.O", ("F_O", "FO")),
        ("D/G", ("D_G", "DG")),
        ("P/P", ("PP",)),
    )
    for variant in variants
}
|
||
|
|
||
|
# Second replacement mapping
|
||
|
# Substring lookup table: every entry turns the underscore that touches an
# abbreviation into a space, so each value is its key with '_' -> ' '.
# Insertion order is kept as listed because the entries are applied one after
# another by replace_substrings.
second_replacements = {
    glued: glued.replace("_", " ")
    for glued in (
        "_G/E", "G/E_",
        "_M/E", "M/E_",
        "_S/G", "S/G_",
        "_T/C", "T/C_",
        "_L.O", "L.O_",
        "_F.O", "F.O_",
        "_D/G", "D/G_",
        "DG_",
    )
}
|
||
|
|
||
|
# Function to separate numbers from text in a token
|
||
|
def separate_numbers_from_text(description):
    """Insert a space wherever a digit run directly touches non-digit text.

    Args:
        description: The string to process.

    Returns:
        A copy of ``description`` in which every boundary between a digit and
        an adjacent non-digit, non-whitespace character is separated by a
        single space (e.g. ``"PUMP1"`` -> ``"PUMP 1"``, ``"12V"`` -> ``"12 V"``).
        Punctuation counts as non-digit text, so ``"NO.1"`` -> ``"NO. 1"``.
        Boundaries that are already separated by whitespace are left alone.
    """
    # Whitespace is excluded from the non-digit side on purpose: a plain \D
    # also matches a space, which would turn "PUMP 1" into "PUMP  1".
    text_then_digit = re.sub(r'([^\d\s])(\d)', r'\1 \2', description)
    return re.sub(r'(\d)([^\d\s])', r'\1 \2', text_then_digit)
|
||
|
|
||
|
# Function to perform replacements using tokens
|
||
|
def replace_tokens(description, replacements):
    """Swap whole whitespace-delimited tokens using a lookup table.

    Args:
        description: The string to process.
        replacements: Mapping from an exact token to its replacement.

    Returns:
        The tokens of ``description`` joined by single spaces, with every
        token found in ``replacements`` substituted and all others kept.
        Splitting on whitespace also drops leading/trailing whitespace and
        collapses runs of whitespace.
    """
    swapped = (replacements.get(word, word) for word in description.split())
    return ' '.join(swapped)
|
||
|
|
||
|
# Function to perform replacements for substrings
|
||
|
def replace_substrings(description, replacements):
    """Apply a series of literal substring replacements, one after another.

    Args:
        description: The string to process.
        replacements: Mapping from a substring to its replacement. Entries
            are applied in the mapping's iteration order, each one operating
            on the result of the previous one.

    Returns:
        ``description`` after all replacements have been applied.
    """
    result = description
    for old_and_new in replacements.items():
        # Each pair unpacks to str.replace(old, new)
        result = result.replace(*old_and_new)
    return result
|
||
|
|
||
|
# Separate numbers from text before applying replacements
|
||
|
# Run the three normalisation passes over 'tag_description' in order:
#   1. split digit runs from adjacent text,
#   2. swap whole tokens via initial_replacements,
#   3. apply the substring fixes in second_replacements.
data_mapping['tag_description'] = (
    data_mapping['tag_description']
    .apply(separate_numbers_from_text)
    .apply(replace_tokens, replacements=initial_replacements)
    .apply(replace_substrings, replacements=second_replacements)
)

# Write the updated data_mapping to a new CSV file (no index column)
output_file_path = '../exports/preprocessed_data.csv'
data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')

print(f"Updated data saved to {output_file_path}")
|
||
|
|
||
|
|
||
|
# %%
|