Feat: added Daniel's abbreviations preprocessing to preprocessing
methods
This commit is contained in:
parent
4715999005
commit
c7a02c792c
|
@ -0,0 +1 @@
|
|||
__pycache__
|
|
@ -0,0 +1,7 @@
|
|||
## Purpose:
|
||||
|
||||
Perform substitutions on common terms to standardize abbreviations.
|
||||
|
||||
## Instructions:
|
||||
|
||||
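- Run `python abbreviations_replacer.py` from this directory (the script reads and writes files using paths relative to the current working directory).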
|
|
@ -0,0 +1,43 @@
|
|||
"""
|
||||
Author: Daniel Kim
|
||||
Modified by: Richard Wong
|
||||
"""
|
||||
# %%
|
||||
import re
|
||||
import pandas as pd
|
||||
from replacement_dict import replacement_dict
|
||||
|
||||
# %%
|
||||
def count_abbreviation_occurrences(tag_descriptions, abbreviation):
    """Return the total number of regex matches of ``abbreviation`` across all descriptions.

    Args:
        tag_descriptions: Iterable of description strings to search.
        abbreviation: Regular-expression pattern to look for.

    Returns:
        The summed number of non-overlapping matches over every description.
    """
    matcher = re.compile(abbreviation)
    total = 0
    for text in tag_descriptions:
        total += len(matcher.findall(text))
    return total
|
||||
|
||||
def replace_abbreviations(tag_descriptions, abbreviations):
    """Rewrite each description by applying every abbreviation rule in turn.

    Args:
        tag_descriptions: Iterable of description strings.
        abbreviations: Mapping of regular-expression pattern to replacement
            text. Rules are applied in the mapping's iteration order, and each
            rule operates on the output of the previous one, so a more
            specific pattern must come before any shorter pattern that would
            rewrite part of the text it needs to match.

    Returns:
        A new list holding one rewritten string per input description, in the
        same order as the input.
    """
    # Compile every pattern once up front instead of resolving it again for
    # each (description, pattern) pair inside the loops below.
    compiled_rules = [
        (re.compile(pattern), replacement)
        for pattern, replacement in abbreviations.items()
    ]

    replaced_descriptions = []
    for description in tag_descriptions:
        for pattern, replacement in compiled_rules:
            description = pattern.sub(replacement, description)
        replaced_descriptions.append(description)
    return replaced_descriptions
|
||||
|
||||
|
||||
# %%
|
||||
# Location of the raw export, relative to the current working directory.
# Adjust this path to your actual file location.
file_path = '../../data_import/exports/raw_data.csv'
df = pd.read_csv(file_path)

# %%
# Standardize abbreviations in every tag description.
print("running substitution")
# Missing descriptions are replaced with the literal "N/A" so that NaN values
# are not handed to the regex substitutions.
tag_descriptions = df['tag_description'].fillna("N/A")
replaced_descriptions = replace_abbreviations(
    tag_descriptions,
    replacement_dict,
)

# %%
# Overwrite the column with the rewritten text and save the result
# (the output path is also relative to the current working directory).
df["tag_description"] = replaced_descriptions
df.to_csv("../exports/preprocessed_data.csv", index=False)
print("file saved")
|
||||
# %%
|
|
@ -0,0 +1,113 @@
|
|||
"""
|
||||
Author: Daniel Kim
|
||||
"""
|
||||
|
||||
# Abbreviations and their replacements
|
||||
# Maps a regular-expression pattern to the standardized text that replaces it.
#
# Ordering matters: the rules are applied one after another in insertion
# order, each on the output of the previous rule. A compound pattern therefore
# has to appear before any shorter pattern that rewrites part of the same
# text, otherwise the compound pattern can never match.
replacement_dict = {
    # --- Compound patterns: must run before their component patterns ---
    # (before GEN\., WIND\., TEMP\b)
    r'\bGEN\.WIND\.TEMP\b': 'GENERATOR WINDING TEMPERATURE',
    # (before AUX\.)
    r'\bAUX\.BOILER\b': 'AUXILIARY BOILER',
    r'\bAUX\. BOILER\b': 'AUXILIARY BOILER',
    # (before F\.O\b)
    r'\bH\.F\.O\.': 'HFO',
    # (before W\.)
    r'\bC\.S\.W\b': 'CSW',

    # --- General terms ---
    r'\bLIST\b': 'LIST',
    r'\bList\b': 'LIST',
    r'\bEXH\.': 'EXHAUST',
    r'\bEXH\b': 'EXHAUST',
    r'\bEXHAUST\.': 'EXHAUST',
    r'\bExhaust\b': 'EXHAUST',
    r'\bEXHAUST\b': 'EXHAUST',
    r'\bTEMP\.': 'TEMPERATURE',
    r'\bTEMP\b': 'TEMPERATURE',
    r'\bTEMPERATURE\.': 'TEMPERATURE',
    r'\bTEMPERATURE\b': 'TEMPERATURE',
    r'\bW\.': 'WATER',
    r'\bWATER\b': 'WATER',
    r'\bCYL\.': 'CYLINDER',
    r'\bcyl\.': 'CYLINDER',
    r'\bCYL\b': 'CYLINDER',
    r'\bcylinder\b': 'CYLINDER',
    r'\bCYLINDER\b': 'CYLINDER',
    r'\bCOOL\.': 'COOLING',
    r'\bcool\.': 'COOLING',
    r'\bcooling\b': 'COOLING',
    r'\bCOOLING\b': 'COOLING',
    r'\bcooler\b': 'COOLER',
    r'\bCOOLER\b': 'COOLER',
    r'\bScav\.': 'SCAVENGE',
    r'\bSCAV\.': 'SCAVENGE',
    r'\bINL\.': 'INLET',
    r'\binlet\b': 'INLET',
    r'\bINLET\b': 'INLET',
    r'\bOUT\.': 'OUTLET',
    r'\bOUTL\.': 'OUTLET',
    r'\boutlet\b': 'OUTLET',
    r'\bOUTLET\b': 'OUTLET',
    r'\bPRESS\.': 'PRESSURE',
    r'\bPress\.': 'PRESSURE',
    r'\bpressure\b': 'PRESSURE',
    r'\bPRESSURE\b': 'PRESSURE',
    r'\bCLR\b': 'CLEAR',

    # --- Engines and generators ---
    r'\bENG\.': 'ENGINE',
    r'\bENG\b': 'ENGINE',
    r'\bENGINE\b': 'ENGINE',
    r'\bEngine speed\b': 'ENGINE SPEED',
    r'\bEngine running\b': 'ENGINE RUNNING',
    r'\bEngine RPM pickup\b': 'ENGINE RPM PICKUP',
    r'\bEngine room\b': 'ENGINE ROOM',
    r'\bM/E\b': 'MAIN_ENGINE',
    r'\bME\b': 'MAIN_ENGINE',
    r'\bMAIN ENGINE\b': 'MAIN_ENGINE',
    r'\bGen\b': 'GENERATOR_ENGINE',
    r'\bGE\b': 'GENERATOR_ENGINE',
    r'\bG/E\b': 'GENERATOR_ENGINE',
    # NOTE(review): no trailing \b here, so this also rewrites the "DG" prefix
    # of longer tokens (e.g. "DG1", but also "DGPS"). Possibly intentional to
    # catch numbered generators -- confirm before tightening.
    r'\bDG': 'GENERATOR_ENGINE',
    r'\bD/G\b': 'GENERATOR_ENGINE',
    r'\bGEN\.': 'GENERATOR_ENGINE',
    # Trailing anchor is \b (word boundary); the previous \B only matched when
    # another word character followed, so a plain "GENERATOR ENGINE" was missed.
    r'\bGENERATOR ENGINE\b': 'GENERATOR_ENGINE',
    r'\bENGINE ROOM\b': 'ENGINE ROOM',
    r'\bE/R\b': 'ENGINE ROOM',

    # --- Unit numbering ---
    r'\bNO1\b': 'NO.1',
    r'\bNO\.1\b': 'NO.1',
    r'\bNo\.1\b': 'NO.1',
    r'\bNO2\b': 'NO.2',
    r'\bNO\.2\b': 'NO.2',
    r'\bNo\.2\b': 'NO.2',
    r'\bNO3\b': 'NO.3',
    r'\bNO\.3\b': 'NO.3',
    r'\bNo\.3\b': 'NO.3',
    r'\bNO4\b': 'NO.4',
    r'\bNO\.4\b': 'NO.4',
    r'\bNo\.4\b': 'NO.4',
    r'\bNO5\b': 'NO.5',
    r'\bNO\.5\b': 'NO.5',
    r'\bNo\.5\b': 'NO.5',

    # --- Fluids, temperatures and auxiliaries ---
    r'\bFLTR\b': 'FILTER',
    r'\bLUB\.': 'LUBRICANT',
    r'\bM\.G\.O\b': 'MGO',
    r'\bMGO\b': 'MGO',
    r'\bF\.O\b': 'FUEL OIL',
    r'\bFO\b': 'FUEL OIL',
    r'\bL\.T\b': 'LOW TEMPERATURE',
    r'\bLT\b': 'LOW TEMPERATURE',
    r'\bH\.T\b': 'HIGH TEMPERATURE',
    r'\bHT\b': 'HIGH TEMPERATURE',
    r'\bAUX\.': 'AUXILIARY',
    r'\bNO\.2A\b': 'NO.2A',
    r'\bNO\.2B\b': 'NO.2B',
    r'\bWIND\.': 'WINDING',
    r'\bWINDING\b': 'WINDING',
    r'\bCSW\b': 'CSW',

    # --- Electrical ---
    r'\bVLOT\.': 'VOLTAGE',
    r'\bVOLTAGE\b': 'VOLTAGE',
    r'\bVOLT\.': 'VOLTAGE',
    r'\bFREQ\.': 'FREQUENCY',
    r'\bFREQUENCY\b': 'FREQUENCY',
    r'\bCURR\.': 'CURRENT',
    r'\bCURRENT\b': 'CURRENT',

    # --- Turbocharger ---
    r'\bTCA\b': 'TURBOCHARGER',
    r'\bTCB\b': 'TURBOCHARGER',
    r'\bT/C\b': 'TURBOCHARGER',
    r'\bTC\b': 'TURBOCHARGER',
    r'\bTURBOCHAGER\b': 'TURBOCHARGER',
    r'\bTURBOCHARGER\b': 'TURBOCHARGER',
}
|
Loading…
Reference in New Issue