hipom_data_mapping/data_preprocess/abbreviations/abbreviations_replacer.py

"""
Author: Daniel Kim
Modified by: Richard Wong
"""
# %%
import re
import pandas as pd
from replacement_dict import replacement_dict

# %%
def count_abbreviation_occurrences(tag_descriptions, abbreviation):
    """Return the total number of matches of ``abbreviation`` over all descriptions.

    ``abbreviation`` is interpreted as a regular expression, and every
    non-overlapping match in every description contributes to the total.
    """
    matcher = re.compile(abbreviation)
    total = 0
    for text in tag_descriptions:
        total += len(matcher.findall(text))
    return total

def replace_abbreviations(tag_descriptions, abbreviations):
    """Expand abbreviations in every description using regex substitution.

    Args:
        tag_descriptions: Iterable of description strings.
        abbreviations: Mapping of regular-expression pattern to replacement.
            Rules are applied in the mapping's iteration order, each one
            operating on the output of the previous rule.

    Returns:
        A new list with one rewritten description per input description.

    Raises:
        re.error: If any key of ``abbreviations`` is not a valid pattern.
    """
    # Compile every pattern once up front. Passing raw pattern strings to
    # re.sub inside the loop relies on the re module's bounded internal cache
    # for each (description, rule) pair, which degrades to recompilation when
    # the dictionary is larger than that cache.
    compiled_rules = [
        (re.compile(abbreviation), replacement)
        for abbreviation, replacement in abbreviations.items()
    ]

    replaced_descriptions = []
    for description in tag_descriptions:
        for pattern, replacement in compiled_rules:
            description = pattern.sub(replacement, description)
        replaced_descriptions.append(description)
    return replaced_descriptions


# %%
file_path = '../../data_import/exports/raw_data.csv'  # Adjust this path to your actual file location
df = pd.read_csv(file_path)

# %%
# Normalise missing descriptions, then expand the abbreviations
print("running substitution")
# Cells such as "N/A" can be read in as NaN, so those are filled first; blank
# or whitespace-only cells are then mapped to the same "NOVALUE" placeholder.
df['tag_description'] = (
    df['tag_description']
    .fillna("NOVALUE")
    .replace(r'^\s*$', 'NOVALUE', regex=True)
)
tag_descriptions = df['tag_description']
replaced_descriptions = replace_abbreviations(tag_descriptions, replacement_dict)

# %%
# Write the expanded descriptions back and export the preprocessed table
df["tag_description"] = replaced_descriptions
df.to_csv("../exports/preprocessed_data.csv", index=False)
print("file saved")