""" Author: Daniel Kim Modified by: Richard Wong """ # %% import re import pandas as pd from replacement_dict_new import desc_replacement_dict, unit_replacement_dict # %% def count_abbreviation_occurrences(tag_descriptions, abbreviation): """Count the number of occurrences of the abbreviation in the list of machine descriptions.""" pattern = re.compile(abbreviation) count = sum(len(pattern.findall(description)) for description in tag_descriptions) return count def replace_abbreviations(tag_descriptions, abbreviations): """Replace the abbreviations according to the key-pair value provided.""" replaced_descriptions = [] for description in tag_descriptions: for abbreviation, replacement in abbreviations.items(): description = re.sub(abbreviation, replacement, description) replaced_descriptions.append(description) return replaced_descriptions def cleanup_spaces(tag_descriptions): # Replace all whitespace with a single space replaced_descriptions = [] for description in tag_descriptions: description_clean = re.sub(r'\s+', ' ', description) replaced_descriptions.append(description_clean) return replaced_descriptions # remove all dots def cleanup_dots(tag_descriptions): replaced_descriptions = [] for description in tag_descriptions: description_clean = re.sub(r'\.', '', description) replaced_descriptions.append(description_clean) return replaced_descriptions # %% file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location df = pd.read_csv(file_path) # %% # Replace abbreviations print("running substitution for descriptions") # normalize to uppercase # strip leading and trailing whitespace df['tag_description'] = df['tag_description'].str.strip() df['tag_description'] = df['tag_description'].str.upper() # Replace whitespace-only entries with "NOVALUE" # note that "N/A" can be read as nan # replace whitespace only values as NOVALUE df['tag_description']= df['tag_description'].fillna("NOVALUE") df['tag_description'] = df['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True) # perform actual substitution tag_descriptions = df['tag_description'] replaced_descriptions = replace_abbreviations(tag_descriptions, desc_replacement_dict) replaced_descriptions = cleanup_spaces(replaced_descriptions) replaced_descriptions = cleanup_dots(replaced_descriptions) df["tag_description"] = replaced_descriptions # print("Descriptions after replacement:", replaced_descriptions) # %% print("running substitutions for units") df['unit'] = df['unit'].fillna("NOVALUE") df['unit'] = df['unit'].replace(r'^\s*$', 'NOVALUE', regex=True) unit_list = df['unit'] new_unit = replace_abbreviations(unit_list, unit_replacement_dict) new_unit = cleanup_spaces(new_unit) df['unit'] = new_unit # save df.to_csv("../exports/preprocessed_data.csv", index=False) print("file saved")