hipom_data_mapping/data_preprocess/abbreviations/abbreviations_replacer.py

"""
Author: Daniel Kim
Modified by: Richard Wong
"""
# %%
import re
import pandas as pd
from replacement_dict import desc_replacement_dict, unit_replacement_dict

# %%
def count_abbreviation_occurrences(tag_descriptions, abbreviation):
    """Return how many times ``abbreviation`` matches across all descriptions.

    ``abbreviation`` is compiled as a regular expression, so regex
    metacharacters in it are interpreted rather than matched literally.
    Matches are counted per description (non-overlapping) and summed.
    """
    matcher = re.compile(abbreviation)
    total = 0
    for text in tag_descriptions:
        total += len(matcher.findall(text))
    return total

def replace_abbreviations(tag_descriptions, abbreviations):
    """Apply every abbreviation rule to every description.

    Args:
        tag_descriptions: Iterable of strings to rewrite.
        abbreviations: Mapping whose keys are regular-expression patterns and
            whose values are the replacement templates handed to ``sub``.

    Returns:
        A new list with one rewritten string per input description, in the
        same order as the input.

    Rules are applied one after another in the mapping's iteration order, so
    a later rule sees the output of the earlier ones.
    """
    # Compile each rule once up front instead of passing the pattern string to
    # re.sub for every (description, rule) pair; the results are the same but
    # the work no longer depends on the re module's bounded pattern cache.
    compiled_rules = [
        (re.compile(pattern), replacement)
        for pattern, replacement in abbreviations.items()
    ]

    replaced_descriptions = []
    for description in tag_descriptions:
        for pattern, replacement in compiled_rules:
            description = pattern.sub(replacement, description)
        replaced_descriptions.append(description)
    return replaced_descriptions

def cleanup_spaces(tag_descriptions):
    """Collapse each run of whitespace in every description to one space.

    Returns a new list; leading and trailing whitespace runs are also reduced
    to a single space rather than removed.
    """
    return [re.sub(r'\s+', ' ', text) for text in tag_descriptions]

# remove all dots
def cleanup_dots(tag_descriptions):
    """Return a new list of the descriptions with every '.' character removed."""
    without_dots = (re.sub(r'\.', '', text) for text in tag_descriptions)
    return list(without_dots)


# %%
file_path = '../../data_import/exports/raw_data.csv'  # Adjust this path to your actual file location
df = pd.read_csv(file_path)

# %%
# Descriptions: fill blanks, expand abbreviations, tidy, upper-case.
print("running substitution for descriptions")
# Missing cells (note that "N/A" can be read as NaN) and empty or
# whitespace-only cells both become the "NOVALUE" placeholder.
tag_descriptions = (
    df['tag_description']
    .fillna("NOVALUE")
    .replace(r'^\s*$', 'NOVALUE', regex=True)
)
df['tag_description'] = tag_descriptions
# Order matters: expand abbreviations first, then collapse whitespace,
# then drop dots.
replaced_descriptions = cleanup_dots(
    cleanup_spaces(
        replace_abbreviations(tag_descriptions, desc_replacement_dict)
    )
)
df["tag_description"] = replaced_descriptions
# Strip trailing whitespace, then normalise to upper case.
df['tag_description'] = df['tag_description'].str.rstrip().str.upper()

# %%
# Units: same placeholder handling, then unit-specific replacements.
print("running substitutions for units")
unit_list = (
    df['unit']
    .fillna("NOVALUE")
    .replace(r'^\s*$', 'NOVALUE', regex=True)
)
df['unit'] = unit_list
new_unit = cleanup_spaces(
    replace_abbreviations(unit_list, unit_replacement_dict)
)
df['unit'] = new_unit


# Write the preprocessed table without the index column.
df.to_csv("../exports/preprocessed_data.csv", index=False)
print("file saved")
Feat: added Daniel's abbreviations preprocessing to preprocessing methods 2024-10-30 11:01:57 +09:00			`"""`
			`Author: Daniel Kim`
			`Modified by: Richard Wong`
			`"""`
			`# %%`
			`import re`
			`import pandas as pd`
Feat: added abbreviation expansion rules 2024-11-10 20:28:47 +09:00			`from replacement_dict import desc_replacement_dict, unit_replacement_dict`
Feat: added Daniel's abbreviations preprocessing to preprocessing methods 2024-10-30 11:01:57 +09:00
			`# %%`
			`def count_abbreviation_occurrences(tag_descriptions, abbreviation):`
			`"""Count the number of occurrences of the abbreviation in the list of machine descriptions."""`
			`pattern = re.compile(abbreviation)`
			`count = sum(len(pattern.findall(description)) for description in tag_descriptions)`
			`return count`

			`def replace_abbreviations(tag_descriptions, abbreviations):`
			`"""Replace the abbreviations according to the key-pair value provided."""`
			`replaced_descriptions = []`
			`for description in tag_descriptions:`
			`for abbreviation, replacement in abbreviations.items():`
			`description = re.sub(abbreviation, replacement, description)`

			`replaced_descriptions.append(description)`
			`return replaced_descriptions`

Feat: added abbreviation expansion rules 2024-11-10 20:28:47 +09:00			`def cleanup_spaces(tag_descriptions):`
			`# Replace all whitespace with a single space`
			`replaced_descriptions = []`
			`for description in tag_descriptions:`
			`description_clean = re.sub(r'\s+', ' ', description)`
			`replaced_descriptions.append(description_clean)`
			`return replaced_descriptions`

			`# remove all dots`
			`def cleanup_dots(tag_descriptions):`
			`replaced_descriptions = []`
			`for description in tag_descriptions:`
			`description_clean = re.sub(r'\.', '', description)`
			`replaced_descriptions.append(description_clean)`
			`return replaced_descriptions`

Feat: added Daniel's abbreviations preprocessing to preprocessing methods 2024-10-30 11:01:57 +09:00
			`# %%`
			`file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location`
			`df = pd.read_csv(file_path)`

			`# %%`
			`# Replace abbreviations`
Feat: added abbreviation expansion rules 2024-11-10 20:28:47 +09:00			`print("running substitution for descriptions")`
Feat: added classification methods Feat: added mapping to pattern-only method Chore: re-organized prediction to be within mapping folders 2024-11-05 16:49:18 +09:00			`df['tag_description']= df['tag_description'].fillna("NOVALUE")`
			`# Replace whitespace-only entries with "NOVALUE"`
			`# note that "N/A" can be read as nan`
Feat: added abbreviation expansion rules 2024-11-10 20:28:47 +09:00			`# replace whitespace only values as NOVALUE`
Feat: added classification methods Feat: added mapping to pattern-only method Chore: re-organized prediction to be within mapping folders 2024-11-05 16:49:18 +09:00			`df['tag_description'] = df['tag_description'].replace(r'^\s*$', 'NOVALUE', regex=True)`
			`tag_descriptions = df['tag_description']`
Feat: added abbreviation expansion rules 2024-11-10 20:28:47 +09:00			`replaced_descriptions = replace_abbreviations(tag_descriptions, desc_replacement_dict)`
			`replaced_descriptions = cleanup_spaces(replaced_descriptions)`
			`replaced_descriptions = cleanup_dots(replaced_descriptions)`
			`df["tag_description"] = replaced_descriptions`
Feat: added Daniel's abbreviations preprocessing to preprocessing methods 2024-10-30 11:01:57 +09:00			`# print("Descriptions after replacement:", replaced_descriptions)`
Feat: added abbreviation expansion rules 2024-11-10 20:28:47 +09:00			`# strip trailing whitespace`
			`df['tag_description'] = df['tag_description'].str.rstrip()`
			`df['tag_description'] = df['tag_description'].str.upper()`
Feat: added Daniel's abbreviations preprocessing to preprocessing methods 2024-10-30 11:01:57 +09:00
			`# %%`
Feat: added abbreviation expansion rules 2024-11-10 20:28:47 +09:00			`print("running substitutions for units")`
			`df['unit'] = df['unit'].fillna("NOVALUE")`
			`df['unit'] = df['unit'].replace(r'^\s*$', 'NOVALUE', regex=True)`
			`unit_list = df['unit']`
			`new_unit = replace_abbreviations(unit_list, unit_replacement_dict)`
			`new_unit = cleanup_spaces(new_unit)`
			`df['unit'] = new_unit`


			`# save`
Feat: added Daniel's abbreviations preprocessing to preprocessing methods 2024-10-30 11:01:57 +09:00			`df.to_csv("../exports/preprocessed_data.csv", index=False)`
			`print("file saved")`