# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---

# %%
import re
from typing import Mapping

import pandas as pd

# Input CSV to load; adjust this path to your actual file location.
data_mapping_file_path = 'outputs/raw_data_s.csv'
# Destination of the CSV written after the replacements below are applied.
output_file_path = '../exports/preprocessed_data.csv'

# Initial replacement mapping. Applied by `replace_tokens`, so a key only
# matches when it equals a complete whitespace-delimited token.
initial_replacements = {
    "MGE": "G/E",
    "GEN.": "G/E",
    "GEN": "G/E",
    "GE": "G/E",
    "G_E": "G/E",
    "ME": "M/E",
    "M_E": "M/E",
    "S_G": "S/G",
    "T_C": "T/C",
    "TC": "T/C",
    "L_O": "L.O",
    "LO": "L.O",
    "F_O": "F.O",
    "FO": "F.O",
    "D_G": "D/G",
    "DG": "D/G",
    "PP": "P/P",
}

# Second replacement mapping. Applied by `replace_substrings`, so a key matches
# anywhere inside the text; entries are applied in the order listed here.
second_replacements = {
    "_G/E": " G/E",
    "G/E_": "G/E ",
    "_M/E": " M/E",
    "M/E_": "M/E ",
    "_S/G": " S/G",
    "S/G_": "S/G ",
    "_T/C": " T/C",
    "T/C_": "T/C ",
    "_L.O": " L.O",
    "L.O_": "L.O ",
    "_F.O": " F.O",
    "F.O_": "F.O ",
    "_D/G": " D/G",
    "D/G_": "D/G ",
    "DG_": "DG ",
}

# Zero-width positions where a digit touches a non-digit, non-whitespace
# character (in either order). Whitespace is excluded on purpose: a digit that
# already has a space next to it must not receive a second one.
_DIGIT_BOUNDARY = re.compile(r'(?<=[^\d\s])(?=\d)|(?<=\d)(?=[^\d\s])')


def separate_numbers_from_text(description: str) -> str:
    """Insert a space wherever a digit run touches a non-digit character.

    Args:
        description: Text to split, e.g. ``"NO2PUMP3"``.

    Returns:
        The text with a single space inserted at every boundary between a
        digit and a non-digit, non-whitespace character, e.g.
        ``"NO 2 PUMP 3"``. Existing whitespace is left untouched, so
        ``"PUMP 1"`` stays ``"PUMP 1"``.
    """
    return _DIGIT_BOUNDARY.sub(' ', description)


def replace_tokens(description: str, replacements: Mapping[str, str]) -> str:
    """Replace whole whitespace-delimited tokens using a lookup table.

    Args:
        description: Text to process.
        replacements: Mapping from an exact token to its replacement.

    Returns:
        The tokens joined by single spaces, with every token found in
        ``replacements`` swapped for its mapped value. Because the text is
        split on whitespace and re-joined, runs of whitespace collapse to one
        space and leading/trailing whitespace is dropped.
    """
    return ' '.join(
        replacements.get(token, token) for token in description.split()
    )


def replace_substrings(description: str, replacements: Mapping[str, str]) -> str:
    """Apply plain substring replacements one after another.

    Args:
        description: Text to process.
        replacements: Mapping from a substring to its replacement; entries are
            applied in the mapping's iteration order, each one operating on
            the result of the previous replacement.

    Returns:
        The text after all replacements have been applied.
    """
    for old, new in replacements.items():
        description = description.replace(old, new)
    return description


# Run the file-based pipeline only when executed as a script or as a notebook
# cell (both have __name__ == "__main__"), so importing this module performs no
# file I/O. The statements stay at module level (not inside a function) so the
# variables they create remain globals.
if __name__ == "__main__":
    # Load the data_mapping CSV file, reading every column as text.
    data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

    # Ensure all values in the 'tag_description' column are strings.
    descriptions = data_mapping['tag_description'].fillna('').astype(str)
    # Hyphens are turned into spaces before any other processing.
    descriptions = descriptions.str.replace('-', ' ', regex=False)

    # Separate numbers from text before applying replacements.
    descriptions = descriptions.apply(separate_numbers_from_text)
    # Apply initial replacements (whole tokens).
    descriptions = descriptions.apply(
        replace_tokens, replacements=initial_replacements
    )
    # Apply second replacements (substrings).
    descriptions = descriptions.apply(
        replace_substrings, replacements=second_replacements
    )
    data_mapping['tag_description'] = descriptions

    # Save the updated data_mapping to a new CSV file.
    data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')
    print(f"Updated data saved to {output_file_path}")

# %%