2024-10-30 11:01:57 +09:00
|
|
|
"""
|
|
|
|
Author: Daniel Kim
|
|
|
|
Modified by: Richard Wong
|
|
|
|
"""
|
|
|
|
# %%
|
|
|
|
import re
|
|
|
|
import pandas as pd
|
2024-11-25 18:15:28 +09:00
|
|
|
from replacement_dict_new import desc_replacement_dict, unit_replacement_dict
|
2024-10-30 11:01:57 +09:00
|
|
|
|
|
|
|
# %%
|
|
|
|
def count_abbreviation_occurrences(tag_descriptions, abbreviation):
    """Count how often ``abbreviation`` matches across all descriptions.

    Args:
        tag_descriptions: Iterable of description strings to search.
        abbreviation: Regular-expression pattern to look for. It is passed
            to ``re.compile`` as-is, so regex metacharacters are NOT treated
            literally; escape them with ``re.escape`` if a literal match is
            wanted.

    Returns:
        The total number of non-overlapping matches, summed over every
        description. ``0`` when ``tag_descriptions`` is empty.
    """
    # Compile once; the same pattern is reused for every description.
    pattern = re.compile(abbreviation)
    return sum(
        len(pattern.findall(description)) for description in tag_descriptions
    )
|
|
|
|
|
|
|
|
def replace_abbreviations(tag_descriptions, abbreviations):
    """Apply every pattern -> replacement pair to each description.

    Args:
        tag_descriptions: Iterable of description strings.
        abbreviations: Mapping whose keys are regular-expression patterns and
            whose values are the replacement strings (``re.sub`` replacement
            syntax, so backreferences such as ``\\1`` are honoured).

    Returns:
        A new list with one rewritten string per input description, in the
        same order as the input.

    Substitutions are applied sequentially in the mapping's iteration order,
    so a later pattern sees the output of the earlier ones.
    """
    # Compile each pattern a single time up front instead of handing the raw
    # pattern to re.sub for every (description, abbreviation) pair.
    compiled_rules = [
        (re.compile(abbreviation), replacement)
        for abbreviation, replacement in abbreviations.items()
    ]

    replaced_descriptions = []
    for description in tag_descriptions:
        for pattern, replacement in compiled_rules:
            description = pattern.sub(replacement, description)
        replaced_descriptions.append(description)
    return replaced_descriptions
|
|
|
|
|
2024-11-10 20:28:47 +09:00
|
|
|
def cleanup_spaces(tag_descriptions):
    """Collapse every run of whitespace in each description to one space.

    Args:
        tag_descriptions: Iterable of description strings.

    Returns:
        A new list of strings in input order. Tabs, newlines and repeated
        spaces each become a single ``' '``. Leading and trailing whitespace
        is collapsed but not removed.
    """
    whitespace_run = re.compile(r'\s+')
    return [
        whitespace_run.sub(' ', description) for description in tag_descriptions
    ]
|
|
|
|
|
|
|
|
# remove all dots
|
|
|
|
def cleanup_dots(tag_descriptions):
    """Remove every ``.`` character from each description.

    Args:
        tag_descriptions: Iterable of description strings.

    Returns:
        A new list of strings in input order with all dots deleted. Nothing
        is inserted in their place, so ``"NO.1"`` becomes ``"NO1"``.
    """
    # A literal single-character removal does not need the regex engine.
    return [description.replace('.', '') for description in tag_descriptions]
|
|
|
|
|
2024-10-30 11:01:57 +09:00
|
|
|
|
|
|
|
# %%
|
|
|
|
# Raw export to preprocess; the path is resolved relative to the working
# directory, so adjust it to your actual file location.
file_path = '../../data_import/exports/raw_data.csv'
df = pd.read_csv(file_path)
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# Replace abbreviations
print("running substitution for descriptions")

# Normalize the text first: strip leading/trailing whitespace, then uppercase.
df['tag_description'] = df['tag_description'].str.strip().str.upper()

# Give every empty entry the "NOVALUE" placeholder. Missing values are filled
# first (note that "N/A" can be read as NaN when the CSV is loaded), then any
# entry that is empty or whitespace-only is replaced as well.
df['tag_description'] = (
    df['tag_description']
    .fillna("NOVALUE")
    .replace(r'^\s*$', 'NOVALUE', regex=True)
)

# Perform the actual substitution, then tidy up whitespace and dots.
tag_descriptions = df['tag_description']
replaced_descriptions = replace_abbreviations(tag_descriptions, desc_replacement_dict)
replaced_descriptions = cleanup_spaces(replaced_descriptions)
replaced_descriptions = cleanup_dots(replaced_descriptions)
df["tag_description"] = replaced_descriptions
|
2024-10-30 11:01:57 +09:00
|
|
|
# print("Descriptions after replacement:", replaced_descriptions)
|
|
|
|
|
|
|
|
# %%
|
2024-11-10 20:28:47 +09:00
|
|
|
print("running substitutions for units")

# Missing and empty/whitespace-only units both get the "NOVALUE" placeholder.
df['unit'] = (
    df['unit']
    .fillna("NOVALUE")
    .replace(r'^\s*$', 'NOVALUE', regex=True)
)

# Apply the unit replacement rules, then collapse whitespace runs.
unit_list = df['unit']
new_unit = replace_abbreviations(unit_list, unit_replacement_dict)
new_unit = cleanup_spaces(new_unit)
df['unit'] = new_unit
|
|
|
|
|
|
|
|
|
|
|
|
# Save the preprocessed frame; index=False keeps the row index out of the CSV.
df.to_csv("../exports/preprocessed_data.csv", index=False)
print("file saved")
|