Feat: added Daniel's abbreviations preprocessing to preprocessing

methods
This commit is contained in:
Richard Wong 2024-10-30 11:01:57 +09:00
parent 4715999005
commit c7a02c792c
4 changed files with 164 additions and 0 deletions

View File

@ -0,0 +1 @@
__pycache__

View File

@ -0,0 +1,7 @@
## Purpose:
Perform substitutions on common terms to standardize abbreviations.
## Instructions:
- Run `python abbreviations_replacer.py` from this directory (the script reads and writes its CSV files via relative paths).

View File

@ -0,0 +1,43 @@
"""
Author: Daniel Kim
Modified by: Richard Wong
"""
# %%
import re
import pandas as pd
from replacement_dict import replacement_dict
# %%
def count_abbreviation_occurrences(tag_descriptions, abbreviation):
    """Return the total number of regex matches of ``abbreviation`` across all tag descriptions.

    ``abbreviation`` is treated as a regular-expression pattern; matches are
    counted with ``findall`` in each description and summed.
    """
    matcher = re.compile(abbreviation)
    total = 0
    for text in tag_descriptions:
        total += len(matcher.findall(text))
    return total
def replace_abbreviations(tag_descriptions, abbreviations):
    """Apply every regex substitution in ``abbreviations`` to each description.

    Args:
        tag_descriptions: Iterable of description strings.
        abbreviations: Mapping of regex pattern -> replacement string. Rules
            are applied in the mapping's iteration order, so the output of an
            earlier rule is what later rules see.

    Returns:
        A new list containing one rewritten string per input description, in
        input order.
    """
    # Compile each pattern once up front instead of handing the pattern
    # string to ``re.sub`` for every (description, rule) pair.
    compiled_rules = [
        (re.compile(pattern), replacement)
        for pattern, replacement in abbreviations.items()
    ]

    replaced_descriptions = []
    for description in tag_descriptions:
        for pattern, replacement in compiled_rules:
            description = pattern.sub(replacement, description)
        replaced_descriptions.append(description)
    return replaced_descriptions
# %%
# Load the raw export. The path is relative, so it is resolved against the
# current working directory when the script runs.
file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location
df = pd.read_csv(file_path)
# %%
# Replace abbreviations
print("running substitution")
# Missing descriptions are filled with the literal string "N/A" so that every
# entry handed to the regex substitutions is a string.
tag_descriptions = df['tag_description'].fillna("N/A")
replaced_descriptions = replace_abbreviations(tag_descriptions, replacement_dict)
# print("Descriptions after replacement:", replaced_descriptions)
# %%
# Overwrite the column with the standardized text and write the result out
# (also a relative path); index=False keeps the DataFrame index out of the CSV.
df["tag_description"] = replaced_descriptions
df.to_csv("../exports/preprocessed_data.csv", index=False)
print("file saved")
# %%

View File

@ -0,0 +1,113 @@
"""
Author: Daniel Kim
"""
# Abbreviations and their replacements
# Mapping of regex pattern -> canonical replacement text.
#
# Rules are applied one after another in insertion order, so the output of an
# earlier rule is the input of every later rule. Two consequences:
#   * Multi-token (compound) patterns must come BEFORE the shorter patterns
#     that would otherwise rewrite part of their text first.
#   * Identity rules (e.g. 'LIST' -> 'LIST') change nothing; they are kept to
#     document the canonical spelling of each term.
replacement_dict = {
    # --- Compound abbreviations: must run before GEN., WIND., TEMP and AUX. ---
    # (If 'GEN.' or 'AUX.' ran first, these inputs would become
    # 'GENERATOR_ENGINEWIND.TEMPERATURE' / 'AUXILIARYBOILER'.)
    r'\bGEN\.WIND\.TEMP\b': 'GENERATOR WINDING TEMPERATURE',
    r'\bAUX\.BOILER\b': 'AUXILIARY BOILER',
    r'\bAUX\. BOILER\b': 'AUXILIARY BOILER',

    # --- General terms ---
    r'\bLIST\b': 'LIST',
    r'\bList\b': 'LIST',
    r'\bEXH\.': 'EXHAUST',
    r'\bEXH\b': 'EXHAUST',
    r'\bEXHAUST\.': 'EXHAUST',
    r'\bExhaust\b': 'EXHAUST',
    r'\bEXHAUST\b': 'EXHAUST',
    r'\bTEMP\.': 'TEMPERATURE',
    r'\bTEMP\b': 'TEMPERATURE',
    r'\bTEMPERATURE\.': 'TEMPERATURE',
    r'\bTEMPERATURE\b': 'TEMPERATURE',
    r'\bW\.': 'WATER',
    r'\bWATER\b': 'WATER',
    r'\bCYL\.': 'CYLINDER',
    r'\bcyl\.': 'CYLINDER',
    r'\bCYL\b': 'CYLINDER',
    r'\bcylinder\b': 'CYLINDER',
    r'\bCYLINDER\b': 'CYLINDER',
    r'\bCOOL\.': 'COOLING',
    r'\bcool\.': 'COOLING',
    r'\bcooling\b': 'COOLING',
    r'\bCOOLING\b': 'COOLING',
    r'\bcooler\b': 'COOLER',
    r'\bCOOLER\b': 'COOLER',
    r'\bScav\.': 'SCAVENGE',
    r'\bSCAV\.': 'SCAVENGE',
    r'\bINL\.': 'INLET',
    r'\binlet\b': 'INLET',
    r'\bINLET\b': 'INLET',
    r'\bOUT\.': 'OUTLET',
    r'\bOUTL\.': 'OUTLET',
    r'\boutlet\b': 'OUTLET',
    r'\bOUTLET\b': 'OUTLET',
    r'\bPRESS\.': 'PRESSURE',
    r'\bPress\.': 'PRESSURE',
    r'\bpressure\b': 'PRESSURE',
    r'\bPRESSURE\b': 'PRESSURE',
    r'\bCLR\b': 'CLEAR',

    # --- Engines ---
    r'\bENG\.': 'ENGINE',
    r'\bENG\b': 'ENGINE',
    r'\bENGINE\b': 'ENGINE',
    r'\bEngine speed\b': 'ENGINE SPEED',
    r'\bEngine running\b': 'ENGINE RUNNING',
    r'\bEngine RPM pickup\b': 'ENGINE RPM PICKUP',
    r'\bEngine room\b': 'ENGINE ROOM',
    r'\bM/E\b': 'MAIN_ENGINE',
    r'\bME\b': 'MAIN_ENGINE',
    r'\bMAIN ENGINE\b': 'MAIN_ENGINE',
    r'\bGen\b': 'GENERATOR_ENGINE',
    r'\bGE\b': 'GENERATOR_ENGINE',
    r'\bG/E\b': 'GENERATOR_ENGINE',
    # NOTE(review): no trailing \b here, so this also rewrites the 'DG' prefix
    # of longer tokens (e.g. 'DG1' -> 'GENERATOR_ENGINE1'). Left unchanged in
    # case that is intended; confirm against the data.
    r'\bDG': 'GENERATOR_ENGINE',
    r'\bD/G\b': 'GENERATOR_ENGINE',
    r'\bGEN\.': 'GENERATOR_ENGINE',
    # Trailing anchor is \b (word boundary). The previous \B (non-boundary)
    # could never match the phrase when it stood as a complete word.
    r'\bGENERATOR ENGINE\b': 'GENERATOR_ENGINE',
    r'\bENGINE ROOM\b': 'ENGINE ROOM',
    r'\bE/R\b': 'ENGINE ROOM',

    # --- Unit numbering ---
    r'\bNO1\b': 'NO.1',
    r'\bNO\.1\b': 'NO.1',
    r'\bNo\.1\b': 'NO.1',
    r'\bNO2\b': 'NO.2',
    r'\bNO\.2\b': 'NO.2',
    r'\bNo\.2\b': 'NO.2',
    r'\bNO3\b': 'NO.3',
    r'\bNO\.3\b': 'NO.3',
    r'\bNo\.3\b': 'NO.3',
    r'\bNO4\b': 'NO.4',
    r'\bNO\.4\b': 'NO.4',
    r'\bNo\.4\b': 'NO.4',
    r'\bNO5\b': 'NO.5',
    r'\bNO\.5\b': 'NO.5',
    r'\bNo\.5\b': 'NO.5',

    # --- Fluids, filters, temperature circuits ---
    r'\bFLTR\b': 'FILTER',
    r'\bLUB\.': 'LUBRICANT',
    r'\bM\.G\.O\b': 'MGO',
    r'\bMGO\b': 'MGO',
    r'\bF\.O\b': 'FUEL OIL',
    r'\bFO\b': 'FUEL OIL',
    r'\bL\.T\b': 'LOW TEMPERATURE',
    r'\bLT\b': 'LOW TEMPERATURE',
    r'\bH\.T\b': 'HIGH TEMPERATURE',
    r'\bHT\b': 'HIGH TEMPERATURE',
    r'\bAUX\.': 'AUXILIARY',
    r'\bNO\.2A\b': 'NO.2A',
    r'\bNO\.2B\b': 'NO.2B',

    # --- Electrical ---
    r'\bWIND\.': 'WINDING',
    r'\bWINDING\b': 'WINDING',
    r'\bC\.S\.W\b': 'CSW',
    r'\bCSW\b': 'CSW',
    # Presumably targets a recurring misspelling of 'VOLT.' in the data -- confirm.
    r'\bVLOT\.': 'VOLTAGE',
    r'\bVOLTAGE\b': 'VOLTAGE',
    r'\bVOLT\.': 'VOLTAGE',
    r'\bFREQ\.': 'FREQUENCY',
    r'\bFREQUENCY\b': 'FREQUENCY',
    r'\bCURR\.': 'CURRENT',
    r'\bCURRENT\b': 'CURRENT',
    r'\bH\.F\.O\.': 'HFO',

    # --- Turbocharger ---
    r'\bTCA\b': 'TURBOCHARGER',
    r'\bTCB\b': 'TURBOCHARGER',
    r'\bT/C\b': 'TURBOCHARGER',
    r'\bTC\b': 'TURBOCHARGER',
    # Presumably targets a misspelling present in the data -- confirm.
    r'\bTURBOCHAGER\b': 'TURBOCHARGER',
    r'\bTURBOCHARGER\b': 'TURBOCHARGER',
}