124 lines
4.1 KiB
Plaintext
124 lines
4.1 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Updated data saved to ../preprocessed_data.csv\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"import re\n",
|
||
|
"\n",
|
||
|
"# Load the data_mapping CSV file\n",
|
||
|
"data_mapping_file_path = 'raw_data_s.csv' # Adjust this path to your actual file location\n",
|
||
|
"data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)\n",
|
||
|
" \n",
|
||
|
"# Ensure all values in the 'tag_description' column are strings (missing values become ''), then replace hyphens with spaces\n",
|
||
|
"data_mapping['tag_description'] = data_mapping['tag_description'].fillna('').astype(str)\n",
|
||
|
"data_mapping['tag_description'] = data_mapping['tag_description'].str.replace(r'[-]', ' ', regex=True)\n",
|
||
|
"\n",
|
||
|
"# Initial replacement mapping\n",
|
||
|
"initial_replacements = {\n",
|
||
|
" \"MGE\": \"G/E\",\n",
|
||
|
" \"GEN.\": \"G/E\",\n",
|
||
|
" \"GEN\": \"G/E\",\n",
|
||
|
" \"GE\": \"G/E\",\n",
|
||
|
" \"G_E\": \"G/E\",\n",
|
||
|
" \"ME\": \"M/E\",\n",
|
||
|
" \"M_E\": \"M/E\",\n",
|
||
|
" \"S_G\": \"S/G\",\n",
|
||
|
" \"T_C\": \"T/C\",\n",
|
||
|
" \"TC\": \"T/C\",\n",
|
||
|
" \"L_O\": \"L.O\",\n",
|
||
|
" \"LO\": \"L.O\",\n",
|
||
|
" \"F_O\": \"F.O\",\n",
|
||
|
" \"FO\": \"F.O\",\n",
|
||
|
" \"D_G\": \"D/G\",\n",
|
||
|
" \"DG\": \"D/G\",\n",
|
||
|
" \"PP\": \"P/P\"\n",
|
||
|
"}\n",
|
||
|
"\n",
|
||
|
"# Second replacement mapping\n",
|
||
|
"second_replacements = {\n",
|
||
|
" \"_G/E\": \" G/E\",\n",
|
||
|
" \"G/E_\": \"G/E \",\n",
|
||
|
" \"_M/E\": \" M/E\",\n",
|
||
|
" \"M/E_\": \"M/E \",\n",
|
||
|
" \"_S/G\": \" S/G\",\n",
|
||
|
" \"S/G_\": \"S/G \",\n",
|
||
|
" \"_T/C\": \" T/C\",\n",
|
||
|
" \"T/C_\": \"T/C \",\n",
|
||
|
" \"_L.O\": \" L.O\",\n",
|
||
|
" \"L.O_\": \"L.O \",\n",
|
||
|
" \"_F.O\": \" F.O\",\n",
|
||
|
" \"F.O_\": \"F.O \",\n",
|
||
|
" \"_D/G\": \" D/G\",\n",
|
||
|
" \"D/G_\": \"D/G \",\n",
|
||
|
" \"DG_\": \"DG \"\n",
|
||
|
"}\n",
|
||
|
"\n",
|
||
|
"# Function to separate digit runs from adjacent non-digit characters in a description string\n",
|
||
|
"def separate_numbers_from_text(description):\n",
|
||
|
" # Insert a space between a non-digit character and the digits that follow it, then between a digit run and the non-digit character that follows it\n",
|
||
|
" return re.sub(r'(\\d+)(\\D)', r'\\1 \\2', re.sub(r'(\\D)(\\d+)', r'\\1 \\2', description))\n",
|
||
|
"\n",
|
||
|
"# Function to perform replacements using tokens\n",
|
||
|
"def replace_tokens(description, replacements):\n",
|
||
|
" tokens = description.split() # Tokenize by spaces\n",
|
||
|
" tokens = [replacements.get(token, token) for token in tokens] # Replace based on the dictionary\n",
|
||
|
" return ' '.join(tokens)\n",
|
||
|
"\n",
|
||
|
"# Function to perform replacements for substrings\n",
|
||
|
"def replace_substrings(description, replacements):\n",
|
||
|
" for old, new in replacements.items():\n",
|
||
|
" description = description.replace(old, new)\n",
|
||
|
" return description\n",
|
||
|
"\n",
|
||
|
"# Separate numbers from text before applying replacements\n",
|
||
|
"data_mapping['tag_description'] = data_mapping['tag_description'].apply(separate_numbers_from_text)\n",
|
||
|
"\n",
|
||
|
"# Apply initial replacements\n",
|
||
|
"data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_tokens, replacements=initial_replacements)\n",
|
||
|
"\n",
|
||
|
"# Apply second replacements as substrings\n",
|
||
|
"data_mapping['tag_description'] = data_mapping['tag_description'].apply(replace_substrings, replacements=second_replacements)\n",
|
||
|
"\n",
|
||
|
"# Save the updated data_mapping to a new CSV file\n",
|
||
|
"output_file_path = '../preprocessed_data.csv'\n",
|
||
|
"data_mapping.to_csv(output_file_path, index=False, encoding='utf-8-sig')\n",
|
||
|
"\n",
|
||
|
"print(f\"Updated data saved to {output_file_path}\")\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "torch",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.10.14"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|