hipom_data_mapping/data_preprocess/rule_base_replacement/3.replacement.ipynb

124 lines
4.1 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Updated data saved to ../preprocessed_data.csv\n"
]
}
],
"source": [
"import pandas as pd\n",
"import re\n",
"\n",
"# Read the raw mapping table, asking pandas for string dtype on every column\n",
"data_mapping_file_path = 'raw_data_s.csv'  # Adjust this path to your actual file location\n",
"data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n",
"\n",
"# Fill missing descriptions with '', coerce to str, then turn every hyphen into a space\n",
"data_mapping['tag_description'] = (\n",
"    data_mapping['tag_description']\n",
"    .fillna('')\n",
"    .astype(str)\n",
"    .str.replace(r'[-]', ' ', regex=True)\n",
")\n",
"\n",
"# Initial replacement mapping: whole-token alias -> canonical abbreviation.\n",
"# Grouped by canonical form; the resulting key order matches the group order below.\n",
"initial_replacements = {\n",
"    alias: canonical\n",
"    for canonical, aliases in (\n",
"        (\"G/E\", (\"MGE\", \"GEN.\", \"GEN\", \"GE\", \"G_E\")),\n",
"        (\"M/E\", (\"ME\", \"M_E\")),\n",
"        (\"S/G\", (\"S_G\",)),\n",
"        (\"T/C\", (\"T_C\", \"TC\")),\n",
"        (\"L.O\", (\"L_O\", \"LO\")),\n",
"        (\"F.O\", (\"F_O\", \"FO\")),\n",
"        (\"D/G\", (\"D_G\", \"DG\")),\n",
"        (\"P/P\", (\"PP\",)),\n",
"    )\n",
"    for alias in aliases\n",
"}\n",
"\n",
"# Second replacement mapping: substring fixes that swap an underscore touching a\n",
"# canonical abbreviation for a space. Insertion order is kept because the\n",
"# entries are applied one after another.\n",
"second_replacements = {\n",
"    **{\n",
"        old: new\n",
"        for abbr in (\"G/E\", \"M/E\", \"S/G\", \"T/C\", \"L.O\", \"F.O\", \"D/G\")\n",
"        for old, new in ((\"_\" + abbr, \" \" + abbr), (abbr + \"_\", abbr + \" \"))\n",
"    },\n",
"    # NOTE(review): unlike the entries above, this one yields plain \"DG\" rather\n",
"    # than \"D/G\" - confirm that is intended.\n",
"    \"DG_\": \"DG \",\n",
"}\n",
"\n",
"# Function to separate numbers from text in a token\n",
"def separate_numbers_from_text(description):\n",
"    \"\"\"Insert one space at each boundary between a digit and adjacent non-digit text.\n",
"\n",
"    Args:\n",
"        description: The text to process (str).\n",
"\n",
"    Returns:\n",
"        A new string in which every digit/non-digit boundary is separated by a\n",
"        space, e.g. 'PUMP1A' -> 'PUMP 1 A'. Punctuation counts as non-digit\n",
"        text, so '1.5' becomes '1 . 5'. A boundary that is already whitespace\n",
"        is left untouched, so 'NO 1' is returned unchanged instead of gaining\n",
"        a second space.\n",
"    \"\"\"\n",
"    # Zero-width match between a digit and a neighbour that is neither a digit\n",
"    # nor whitespace. Excluding whitespace is what avoids the doubled spaces a\n",
"    # plain \\D match produces for input that is already separated.\n",
"    return re.sub(r'(?<=[^\\d\\s])(?=\\d)|(?<=\\d)(?=[^\\d\\s])', ' ', description)\n",
"\n",
"# Function to perform replacements on whole whitespace-delimited tokens\n",
"def replace_tokens(description, replacements):\n",
"    \"\"\"Swap each whitespace-delimited token that is a key of ``replacements``.\n",
"\n",
"    Args:\n",
"        description: The text to process (str).\n",
"        replacements: Mapping of exact token -> replacement text.\n",
"\n",
"    Returns:\n",
"        The tokens re-joined with single spaces; tokens that are not keys of\n",
"        ``replacements`` are kept as they are.\n",
"    \"\"\"\n",
"    swapped = []\n",
"    for word in description.split():\n",
"        # Fall back to the word itself when there is no mapping for it\n",
"        swapped.append(replacements.get(word, word))\n",
"    return ' '.join(swapped)\n",
"\n",
"# Function to perform replacements on raw substrings\n",
"def replace_substrings(description, replacements):\n",
"    \"\"\"Apply every ``old -> new`` pair of ``replacements`` as a substring replace.\n",
"\n",
"    The pairs are applied one after another in the mapping's iteration order,\n",
"    so a later pair sees the text produced by the earlier ones.\n",
"\n",
"    Args:\n",
"        description: The text to process (str).\n",
"        replacements: Mapping of substring -> replacement text.\n",
"\n",
"    Returns:\n",
"        The text after all replacements have been applied.\n",
"    \"\"\"\n",
"    result = description\n",
"    for target in replacements:\n",
"        result = result.replace(target, replacements[target])\n",
"    return result\n",
"\n",
"# Clean the descriptions in three ordered passes:\n",
"#   1. split digits from neighbouring text,\n",
"#   2. replace whole tokens using initial_replacements,\n",
"#   3. replace substrings using second_replacements.\n",
"data_mapping['tag_description'] = (\n",
"    data_mapping['tag_description']\n",
"    .apply(separate_numbers_from_text)\n",
"    .apply(replace_tokens, replacements=initial_replacements)\n",
"    .apply(replace_substrings, replacements=second_replacements)\n",
")\n",
"\n",
"# Write the updated table to a new CSV file, without the index column\n",
"output_file_path = '../preprocessed_data.csv'\n",
"data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n",
"\n",
"print(f\"Updated data saved to {output_file_path}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "torch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}