{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tag description preprocessing\n",
    "\n",
    "Normalises the `tag_description` column of `raw_data_s.csv` and writes the result to `../preprocessed_data.csv`.\n",
    "\n",
    "Steps, in order:\n",
    "\n",
    "1. Replace missing descriptions with empty strings and turn hyphens into spaces.\n",
    "2. Insert a space wherever a digit touches a non-digit, non-whitespace character.\n",
    "3. Replace whole tokens that exactly match a key of `initial_replacements`.\n",
    "4. Apply the substring replacements in `second_replacements` (underscores next to an already normalised abbreviation become spaces)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input CSV with the raw tag descriptions; adjust this path to your actual file location.\n",
    "data_mapping_file_path = 'raw_data_s.csv'\n",
    "\n",
    "# Destination of the preprocessed CSV.\n",
    "output_file_path = '../preprocessed_data.csv'\n",
    "\n",
    "# Whole-token replacements: a token is rewritten only when it equals a key exactly.\n",
    "initial_replacements = {\n",
    "    \"MGE\": \"G/E\",\n",
    "    \"GEN.\": \"G/E\",\n",
    "    \"GEN\": \"G/E\",\n",
    "    \"GE\": \"G/E\",\n",
    "    \"G_E\": \"G/E\",\n",
    "    \"ME\": \"M/E\",\n",
    "    \"M_E\": \"M/E\",\n",
    "    \"S_G\": \"S/G\",\n",
    "    \"T_C\": \"T/C\",\n",
    "    \"TC\": \"T/C\",\n",
    "    \"L_O\": \"L.O\",\n",
    "    \"LO\": \"L.O\",\n",
    "    \"F_O\": \"F.O\",\n",
    "    \"FO\": \"F.O\",\n",
    "    \"D_G\": \"D/G\",\n",
    "    \"DG\": \"D/G\",\n",
    "    \"PP\": \"P/P\"\n",
    "}\n",
    "\n",
    "# Substring replacements, applied in insertion order after the token pass.\n",
    "second_replacements = {\n",
    "    \"_G/E\": \" G/E\",\n",
    "    \"G/E_\": \"G/E \",\n",
    "    \"_M/E\": \" M/E\",\n",
    "    \"M/E_\": \"M/E \",\n",
    "    \"_S/G\": \" S/G\",\n",
    "    \"S/G_\": \"S/G \",\n",
    "    \"_T/C\": \" T/C\",\n",
    "    \"T/C_\": \"T/C \",\n",
    "    \"_L.O\": \" L.O\",\n",
    "    \"L.O_\": \"L.O \",\n",
    "    \"_F.O\": \" F.O\",\n",
    "    \"F.O_\": \"F.O \",\n",
    "    \"_D/G\": \" D/G\",\n",
    "    \"D/G_\": \"D/G \",\n",
    "    # NOTE(review): unlike the entries above, this one keeps the raw 'DG' spelling\n",
    "    # (it is not rewritten to 'D/G') - confirm that this is intended.\n",
    "    \"DG_\": \"DG \"\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def separate_numbers_from_text(description):\n",
    "    \"\"\"Insert a space between digits and adjacent non-digit characters.\n",
    "\n",
    "    Whitespace is not treated as text, so a boundary that is already\n",
    "    separated ('PUMP 1') is left untouched instead of gaining a second space.\n",
    "\n",
    "    Args:\n",
    "        description: Text to process.\n",
    "\n",
    "    Returns:\n",
    "        The text with one space inserted at every digit/non-digit boundary\n",
    "        that was not already separated by whitespace.\n",
    "    \"\"\"\n",
    "    # First pass: non-digit followed by a digit; second pass: digit followed by a non-digit.\n",
    "    spaced = re.sub(r'([^\\d\\s])(\\d)', r'\\1 \\2', description)\n",
    "    return re.sub(r'(\\d)([^\\d\\s])', r'\\1 \\2', spaced)\n",
    "\n",
    "\n",
    "def replace_tokens(description, replacements):\n",
    "    \"\"\"Replace whitespace-delimited tokens that exactly match a dictionary key.\n",
    "\n",
    "    Args:\n",
    "        description: Text to process.\n",
    "        replacements: Mapping of exact token to replacement text.\n",
    "\n",
    "    Returns:\n",
    "        The tokens joined by single spaces. Because the text is split on\n",
    "        whitespace, runs of whitespace are collapsed and leading/trailing\n",
    "        whitespace is dropped.\n",
    "    \"\"\"\n",
    "    return ' '.join(replacements.get(token, token) for token in description.split())\n",
    "\n",
    "\n",
    "def replace_substrings(description, replacements):\n",
    "    \"\"\"Apply every substring replacement of the mapping, in insertion order.\n",
    "\n",
    "    Args:\n",
    "        description: Text to process.\n",
    "        replacements: Mapping of substring to replacement text. Later entries\n",
    "            operate on the result of earlier ones.\n",
    "\n",
    "    Returns:\n",
    "        The text after all replacements have been applied.\n",
    "    \"\"\"\n",
    "    for old, new in replacements.items():\n",
    "        description = description.replace(old, new)\n",
    "    return description\n",
    "\n",
    "\n",
    "def preprocess_tag_descriptions(frame, token_replacements, substring_replacements,\n",
    "                                column='tag_description'):\n",
    "    \"\"\"Return a copy of the frame with the description column normalised.\n",
    "\n",
    "    The input frame is not modified, so this function can be re-run safely.\n",
    "\n",
    "    Args:\n",
    "        frame: DataFrame containing the description column.\n",
    "        token_replacements: Mapping passed to replace_tokens.\n",
    "        substring_replacements: Mapping passed to replace_substrings.\n",
    "        column: Name of the column to normalise.\n",
    "\n",
    "    Returns:\n",
    "        A new DataFrame with the same columns, where the description column\n",
    "        holds the normalised strings.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If the column is not present in the frame.\n",
    "    \"\"\"\n",
    "    if column not in frame.columns:\n",
    "        raise KeyError(f\"Column '{column}' not found in the input data\")\n",
    "\n",
    "    cleaned = (\n",
    "        frame[column]\n",
    "        .fillna('')  # missing descriptions become empty strings\n",
    "        .astype(str)\n",
    "        .str.replace('-', ' ', regex=False)  # literal hyphen, no regex needed\n",
    "        .apply(separate_numbers_from_text)\n",
    "        .apply(replace_tokens, replacements=token_replacements)\n",
    "        .apply(replace_substrings, replacements=substring_replacements)\n",
    "    )\n",
    "    return frame.assign(**{column: cleaned})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load, preprocess and save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every column as text so that values are not reinterpreted as numbers.\n",
    "data_mapping_raw = pd.read_csv(data_mapping_file_path, dtype=str)\n",
    "\n",
    "data_mapping = preprocess_tag_descriptions(\n",
    "    data_mapping_raw, initial_replacements, second_replacements\n",
    ")\n",
    "\n",
    "# The transformation must neither drop rows nor leave missing descriptions behind.\n",
    "assert len(data_mapping) == len(data_mapping_raw)\n",
    "assert data_mapping['tag_description'].notna().all()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the updated data_mapping to a new CSV file\n",
    "data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n",
    "\n",
    "print(f\"Updated data saved to {output_file_path}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}