hipom_data_mapping/data_preprocess/rule_base_replacement/2.seperate_number.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Updated data saved to raw_data_s.csv\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "# Load the data_mapping CSV file\n",
    "data_mapping_file_path = '../../data_import/raw_data.csv'  # Adjust this path to your actual file location\n",
    "# data_mapping_file_path = 'raw_data_add_tag.csv'  # Adjust this path to your actual file location\n",
    "data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n",
    "\n",
    "# Backup the original tag_description\n",
    "data_mapping['org_tag_description'] = data_mapping['tag_description']\n",
    "\n",
    "# Ensure all values in the 'tag_description' column are strings\n",
    "data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)\n",
    "data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[()]', ' ', regex=True)\n",
    "\n",
    "# Function to find tokens containing numbers\n",
    "def find_tokens_with_numbers(description):\n",
    "    tokens = description.split()  # Tokenize by spaces\n",
    "    number_tokens = [token for token in tokens if re.search(r'\\d', token)]\n",
    "    return number_tokens\n",
    "\n",
    "# Function to process tokens\n",
    "def process_token(token):\n",
    "    # Step 1: Replace '_' or '-' adjacent to numbers with spaces\n",
    "    token = re.sub(r'(_|-)(?=\\d)', ' ', token)\n",
    "    token = re.sub(r'(?<=\\d)(_|-)', ' ', token)\n",
    "\n",
    "    # Step 2: Insert spaces between letters and numbers where no separator exists\n",
    "    token = re.sub(r'([A-Za-z])(\\d+)', r'\\1 \\2', token)\n",
    "    token = re.sub(r'(\\d+)([A-Za-z])', r'\\1 \\2', token)\n",
    "\n",
    "    # Step 3: Handle cases like \"NO.1\" or \"No.1\" to become \"No. 1\"\n",
    "    token = re.sub(r'([A-Za-z]+)\\.(\\d+)', r'\\1. \\2', token)\n",
    "\n",
    "    # Clean multiple spaces and strip\n",
    "    token = re.sub(r'\\s+', ' ', token).strip()\n",
    "    return token\n",
    "\n",
    "# Apply the process to each row in the 'tag_description' column\n",
    "for index, row in data_mapping.iterrows():\n",
    "    original_description = row['tag_description']\n",
    "    number_tokens = find_tokens_with_numbers(original_description)\n",
    "\n",
    "    # Process each token containing numbers\n",
    "    processed_tokens = [process_token(token) for token in number_tokens]\n",
    "\n",
    "    # Replace the original tokens with processed tokens in the tag_description\n",
    "    new_description = original_description\n",
    "    for original_token, processed_token in zip(number_tokens, processed_tokens):\n",
    "        new_description = new_description.replace(original_token, processed_token)\n",
    "\n",
    "    # Update the data_mapping with the modified description\n",
    "    data_mapping.at[index, 'tag_description'] = new_description\n",
    "\n",
    "# Save the updated data_mapping to a new CSV file\n",
    "output_file_path = 'raw_data_s.csv'\n",
    "data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n",
    "\n",
    "print(f\"Updated data saved to {output_file_path}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
[TASK] init 2024-08-26 19:51:11 +09:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 1,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Updated data saved to raw_data_s.csv\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"import pandas as pd\n",`
			`"import re\n",`
			`"\n",`
			`"# Load the data_mapping CSV file\n",`
			`"data_mapping_file_path = '../../data_import/raw_data.csv' # Adjust this path to your actual file location\n",`
			`"# data_mapping_file_path = 'raw_data_add_tag.csv' # Adjust this path to your actual file location\n",`
			`"data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n",`
			`"\n",`
			`"# Backup the original tag_description\n",`
			`"data_mapping['org_tag_description'] = data_mapping['tag_description']\n",`
			`"\n",`
			`"# Ensure all values in the 'tag_description' column are strings\n",`
			`"data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)\n",`
			`"data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[()]', ' ', regex=True)\n",`
			`"\n",`
			`"# Function to find tokens containing numbers\n",`
			`"def find_tokens_with_numbers(description):\n",`
			`" tokens = description.split() # Tokenize by spaces\n",`
			`" number_tokens = [token for token in tokens if re.search(r'\\d', token)]\n",`
			`" return number_tokens\n",`
			`"\n",`
			`"# Function to process tokens\n",`
			`"def process_token(token):\n",`
			`" # Step 1: Replace '_' or '-' adjacent to numbers with spaces\n",`
			`" token = re.sub(r'(_\|-)(?=\\d)', ' ', token)\n",`
			`" token = re.sub(r'(?<=\\d)(_\|-)', ' ', token)\n",`
			`"\n",`
			`" # Step 2: Insert spaces between letters and numbers where no separator exists\n",`
			`" token = re.sub(r'([A-Za-z])(\\d+)', r'\\1 \\2', token)\n",`
			`" token = re.sub(r'(\\d+)([A-Za-z])', r'\\1 \\2', token)\n",`
			`"\n",`
			`" # Step 3: Handle cases like \"NO.1\" or \"No.1\" to become \"No. 1\"\n",`
			`" token = re.sub(r'([A-Za-z]+)\\.(\\d+)', r'\\1. \\2', token)\n",`
			`"\n",`
			`" # Clean multiple spaces and strip\n",`
			`" token = re.sub(r'\\s+', ' ', token).strip()\n",`
			`" return token\n",`
			`"\n",`
			`"# Apply the process to each row in the 'tag_description' column\n",`
			`"for index, row in data_mapping.iterrows():\n",`
			`" original_description = row['tag_description']\n",`
			`" number_tokens = find_tokens_with_numbers(original_description)\n",`
			`"\n",`
			`" # Process each token containing numbers\n",`
			`" processed_tokens = [process_token(token) for token in number_tokens]\n",`
			`"\n",`
			`" # Replace the original tokens with processed tokens in the tag_description\n",`
			`" new_description = original_description\n",`
			`" for original_token, processed_token in zip(number_tokens, processed_tokens):\n",`
			`" new_description = new_description.replace(original_token, processed_token)\n",`
			`"\n",`
			`" # Update the data_mapping with the modified description\n",`
			`" data_mapping.at[index, 'tag_description'] = new_description\n",`
			`"\n",`
			`"# Save the updated data_mapping to a new CSV file\n",`
			`"output_file_path = 'raw_data_s.csv'\n",`
			`"data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n",`
			`"\n",`
			`"print(f\"Updated data saved to {output_file_path}\")\n"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "torch",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.10.14"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`