hipom_data_mapping/data_preprocess/rule_base_replacement/2.seperate_number.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Updated data saved to raw_data_s.csv\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "# Read the raw mapping table; dtype=str asks pandas for string columns instead of inferred numeric types\n",
    "data_mapping_file_path = '../../data_import/raw_data.csv'  # Adjust this path to your actual file location\n",
    "# data_mapping_file_path = 'raw_data_add_tag.csv'  # Adjust this path to your actual file location\n",
    "data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n",
    "\n",
    "# Keep an untouched copy of tag_description before it is rewritten\n",
    "data_mapping['org_tag_description'] = data_mapping['tag_description']\n",
    "\n",
    "# Missing descriptions become '', values are coerced to str, and every\n",
    "# '(' or ')' is turned into a single space\n",
    "data_mapping['tag_description'] = (\n",
    "    data_mapping['tag_description']\n",
    "    .fillna('')\n",
    "    .astype(str)\n",
    "    .str.replace(r'[()]', ' ', regex=True)\n",
    ")\n",
    "\n",
    "# Function to find tokens containing numbers\n",
    "def find_tokens_with_numbers(description):\n",
    "    tokens = description.split()  # Tokenize by spaces\n",
    "    number_tokens = [token for token in tokens if re.search(r'\\d', token)]\n",
    "    return number_tokens\n",
    "\n",
    "# Function to process tokens\n",
    "def process_token(token):\n",
    "    # Step 1: Replace '_' or '-' adjacent to numbers with spaces\n",
    "    token = re.sub(r'(_|-)(?=\\d)', ' ', token)\n",
    "    token = re.sub(r'(?<=\\d)(_|-)', ' ', token)\n",
    "\n",
    "    # Step 2: Insert spaces between letters and numbers where no separator exists\n",
    "    token = re.sub(r'([A-Za-z])(\\d+)', r'\\1 \\2', token)\n",
    "    token = re.sub(r'(\\d+)([A-Za-z])', r'\\1 \\2', token)\n",
    "\n",
    "    # Step 3: Handle cases like \"NO.1\" or \"No.1\" to become \"No. 1\"\n",
    "    token = re.sub(r'([A-Za-z]+)\\.(\\d+)', r'\\1. \\2', token)\n",
    "\n",
    "    # Clean multiple spaces and strip\n",
    "    token = re.sub(r'\\s+', ' ', token).strip()\n",
    "    return token\n",
    "\n",
    "# Apply the process to each row in the 'tag_description' column\n",
    "def _separate_numbers_in_description(description):\n",
    "    \"\"\"Return `description` with every digit-bearing token run through process_token.\n",
    "\n",
    "    Each whitespace-delimited token is rewritten exactly once, at its own\n",
    "    position, and the whitespace between tokens is left as it was. A chain of\n",
    "    str.replace calls over the whole description is NOT equivalent: a token\n",
    "    contained in another token corrupts it before that token's own replacement\n",
    "    runs (e.g. 'NO1 NO1-A' came out as 'NO 1 NO 1-A' and '-1 X-1' as '1 X1').\n",
    "    \"\"\"\n",
    "    def _rewrite(match):\n",
    "        token = match.group(0)\n",
    "        # Tokens without a digit are passed through unchanged\n",
    "        return process_token(token) if re.search(r'\\d', token) else token\n",
    "\n",
    "    return re.sub(r'\\S+', _rewrite, description)\n",
    "\n",
    "# Series.map replaces the iterrows()/.at loop: one function call per description\n",
    "data_mapping['tag_description'] = data_mapping['tag_description'].map(_separate_numbers_in_description)\n",
    "\n",
    "# Save the updated data_mapping to a new CSV file\n",
    "# ('utf-8-sig' writes a BOM at the start of the file)\n",
    "output_file_path = 'raw_data_s.csv'\n",
    "data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n",
    "\n",
    "print(f\"Updated data saved to {output_file_path}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}