diff --git a/data_preprocess/abbreviations/.gitignore b/data_preprocess/abbreviations/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/data_preprocess/abbreviations/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/data_preprocess/abbreviations/README.md b/data_preprocess/abbreviations/README.md
new file mode 100644
index 0000000..b4dbe38
--- /dev/null
+++ b/data_preprocess/abbreviations/README.md
@@ -0,0 +1,7 @@
+## Purpose:
+
+Perform substitutions on common terms to standardize abbreviations.
+
+## Instructions:
+
+- `python abbreviations_replacer.py`
\ No newline at end of file
diff --git a/data_preprocess/abbreviations/abbreviations_replacer.py b/data_preprocess/abbreviations/abbreviations_replacer.py
new file mode 100644
index 0000000..2a40b27
--- /dev/null
+++ b/data_preprocess/abbreviations/abbreviations_replacer.py
@@ -0,0 +1,57 @@
+"""
+Author: Daniel Kim
+Modified by: Richard Wong
+"""
+# %%
+import re
+import pandas as pd
+from replacement_dict import replacement_dict
+
+# %%
+def count_abbreviation_occurrences(tag_descriptions, abbreviation):
+    """Count regex matches of `abbreviation` across all tag descriptions.
+
+    `abbreviation` is a regular-expression pattern string; every
+    non-overlapping match in every description is counted.
+    """
+    pattern = re.compile(abbreviation)
+    return sum(len(pattern.findall(description)) for description in tag_descriptions)
+
+
+def replace_abbreviations(tag_descriptions, abbreviations):
+    """Apply every pattern -> replacement substitution to each description.
+
+    `abbreviations` maps regex pattern strings to replacement text. Rules run
+    in the mapping's iteration order, so a later rule sees the output of the
+    earlier ones. Returns a new list with one string per input description.
+    """
+    # Compile each pattern once up front rather than passing the raw string
+    # to re.sub for every (description, pattern) pair.
+    compiled_rules = [
+        (re.compile(abbreviation), replacement)
+        for abbreviation, replacement in abbreviations.items()
+    ]
+    replaced_descriptions = []
+    for description in tag_descriptions:
+        for pattern, replacement in compiled_rules:
+            description = pattern.sub(replacement, description)
+        replaced_descriptions.append(description)
+    return replaced_descriptions
+
+
+# %%
+file_path = '../../data_import/exports/raw_data.csv'  # Adjust this path to your actual file location
+df = pd.read_csv(file_path)
+
+# %%
+# Replace abbreviations
+print("running substitution")
+tag_descriptions = df['tag_description'].fillna("N/A")
+replaced_descriptions = replace_abbreviations(tag_descriptions, replacement_dict)
+# print("Descriptions after replacement:", replaced_descriptions)
+
+# %%
+df["tag_description"] = replaced_descriptions
+df.to_csv("../exports/preprocessed_data.csv", index=False)
+print("file saved")
+# %%
diff --git a/data_preprocess/abbreviations/replacement_dict.py b/data_preprocess/abbreviations/replacement_dict.py
new file mode 100644
index 0000000..0475910
--- /dev/null
+++ b/data_preprocess/abbreviations/replacement_dict.py
@@ -0,0 +1,118 @@
+"""
+Author: Daniel Kim
+"""
+
+# Abbreviations (regex patterns) and their replacements.
+# abbreviations_replacer.py applies them in dict order, each on the output of the previous one.
+replacement_dict = {
+    # Compound patterns come first: the shorter rules further down would
+    # otherwise rewrite part of their text and these would never match.
+    r'\bGEN\.WIND\.TEMP\b': 'GENERATOR WINDING TEMPERATURE',
+    r'\bAUX\.BOILER\b': 'AUXILIARY BOILER',
+    r'\bAUX\. BOILER\b': 'AUXILIARY BOILER',
+    r'\bH\.F\.O\.': 'HFO',
+    r'\bC\.S\.W\b': 'CSW',
+    r'\bLIST\b': 'LIST',
+    r'\bList\b': 'LIST',
+    r'\bEXH\.': 'EXHAUST',
+    r'\bEXH\b': 'EXHAUST',
+    r'\bEXHAUST\.': 'EXHAUST',
+    r'\bExhaust\b': 'EXHAUST',
+    r'\bEXHAUST\b': 'EXHAUST',
+    r'\bTEMP\.': 'TEMPERATURE',
+    r'\bTEMP\b': 'TEMPERATURE',
+    r'\bTEMPERATURE\.': 'TEMPERATURE',
+    r'\bTEMPERATURE\b': 'TEMPERATURE',
+    r'\bW\.': 'WATER',
+    r'\bWATER\b': 'WATER',
+    r'\bCYL\.': 'CYLINDER',
+    r'\bcyl\.': 'CYLINDER',
+    r'\bCYL\b': 'CYLINDER',
+    r'\bcylinder\b': 'CYLINDER',
+    r'\bCYLINDER\b': 'CYLINDER',
+    r'\bCOOL\.': 'COOLING',
+    r'\bcool\.': 'COOLING',
+    r'\bcooling\b': 'COOLING',
+    r'\bCOOLING\b': 'COOLING',
+    r'\bcooler\b': 'COOLER',
+    r'\bCOOLER\b': 'COOLER',
+    r'\bScav\.': 'SCAVENGE',
+    r'\bSCAV\.': 'SCAVENGE',
+    r'\bINL\.': 'INLET',
+    r'\binlet\b': 'INLET',
+    r'\bINLET\b': 'INLET',
+    r'\bOUT\.': 'OUTLET',
+    r'\bOUTL\.': 'OUTLET',
+    r'\boutlet\b': 'OUTLET',
+    r'\bOUTLET\b': 'OUTLET',
+    r'\bPRESS\.': 'PRESSURE',
+    r'\bPress\.': 'PRESSURE',
+    r'\bpressure\b': 'PRESSURE',
+    r'\bPRESSURE\b': 'PRESSURE',
+    r'\bCLR\b': 'CLEAR',
+    r'\bENG\.': 'ENGINE',
+    r'\bENG\b': 'ENGINE',
+    r'\bENGINE\b': 'ENGINE',
+    r'\bEngine speed\b': 'ENGINE SPEED',
+    r'\bEngine running\b': 'ENGINE RUNNING',
+    r'\bEngine RPM pickup\b': 'ENGINE RPM PICKUP',
+    r'\bEngine room\b': 'ENGINE ROOM',
+    r'\bM/E\b': 'MAIN_ENGINE',
+    r'\bME\b': 'MAIN_ENGINE',
+    r'\bMAIN ENGINE\b': 'MAIN_ENGINE',
+    r'\bGen\b': 'GENERATOR_ENGINE',
+    r'\bGE\b': 'GENERATOR_ENGINE',
+    r'\bG/E\b': 'GENERATOR_ENGINE',
+    # NOTE(review): no trailing \b here, so this also rewrites the "DG" prefix
+    # of longer tokens such as "DG1" - confirm that is intended.
+    r'\bDG': 'GENERATOR_ENGINE',
+    r'\bD/G\b': 'GENERATOR_ENGINE',
+    r'\bGEN\.': 'GENERATOR_ENGINE',
+    r'\bGENERATOR ENGINE\b': 'GENERATOR_ENGINE',
+    r'\bENGINE ROOM\b': 'ENGINE ROOM',
+    r'\bE/R\b': 'ENGINE ROOM',
+    r'\bNO1\b': 'NO.1',
+    r'\bNO\.1\b': 'NO.1',
+    r'\bNo\.1\b': 'NO.1',
+    r'\bNO2\b': 'NO.2',
+    r'\bNO\.2\b': 'NO.2',
+    r'\bNo\.2\b': 'NO.2',
+    r'\bNO3\b': 'NO.3',
+    r'\bNO\.3\b': 'NO.3',
+    r'\bNo\.3\b': 'NO.3',
+    r'\bNO4\b': 'NO.4',
+    r'\bNO\.4\b': 'NO.4',
+    r'\bNo\.4\b': 'NO.4',
+    r'\bNO5\b': 'NO.5',
+    r'\bNO\.5\b': 'NO.5',
+    r'\bNo\.5\b': 'NO.5',
+    r'\bFLTR\b': 'FILTER',
+    r'\bLUB\.': 'LUBRICANT',
+    r'\bM\.G\.O\b': 'MGO',
+    r'\bMGO\b': 'MGO',
+    r'\bF\.O\b': 'FUEL OIL',
+    r'\bFO\b': 'FUEL OIL',
+    r'\bL\.T\b': 'LOW TEMPERATURE',
+    r'\bLT\b': 'LOW TEMPERATURE',
+    r'\bH\.T\b': 'HIGH TEMPERATURE',
+    r'\bHT\b': 'HIGH TEMPERATURE',
+    r'\bAUX\.': 'AUXILIARY',
+    r'\bNO\.2A\b': 'NO.2A',
+    r'\bNO\.2B\b': 'NO.2B',
+    r'\bWIND\.': 'WINDING',
+    r'\bWINDING\b': 'WINDING',
+    r'\bCSW\b': 'CSW',
+    r'\bVLOT\.': 'VOLTAGE',
+    r'\bVOLTAGE\b': 'VOLTAGE',
+    r'\bVOLT\.': 'VOLTAGE',
+    r'\bFREQ\.': 'FREQUENCY',
+    r'\bFREQUENCY\b': 'FREQUENCY',
+    r'\bCURR\.': 'CURRENT',
+    r'\bCURRENT\b': 'CURRENT',
+    r'\bTCA\b': 'TURBOCHARGER',
+    r'\bTCB\b': 'TURBOCHARGER',
+    r'\bT/C\b': 'TURBOCHARGER',
+    r'\bTC\b': 'TURBOCHARGER',
+    r'\bTURBOCHAGER\b': 'TURBOCHARGER',
+    r'\bTURBOCHARGER\b': 'TURBOCHARGER',
+}
\ No newline at end of file