44 lines
1.4 KiB
Python
44 lines
1.4 KiB
Python
|
"""
|
||
|
Author: Daniel Kim
|
||
|
Modified by: Richard Wong
|
||
|
"""
|
||
|
# %%
|
||
|
import re
|
||
|
import pandas as pd
|
||
|
from replacement_dict import replacement_dict
|
||
|
|
||
|
# %%
|
||
|
def count_abbreviation_occurrences(tag_descriptions, abbreviation):
    """Return how many times ``abbreviation`` matches across all descriptions.

    ``abbreviation`` is compiled as a regular expression, and the
    non-overlapping matches found by ``findall`` in each description are
    added together.
    """
    matcher = re.compile(abbreviation)
    total = 0
    for text in tag_descriptions:
        total += len(matcher.findall(text))
    return total
|
||
|
|
||
|
def replace_abbreviations(tag_descriptions, abbreviations):
    """Apply every abbreviation substitution to each description.

    Args:
        tag_descriptions: Iterable of description strings.
        abbreviations: Mapping of regular-expression pattern to replacement.
            Replacements follow ``re.sub`` semantics, so backreferences such
            as ``\\1`` are honoured.

    Returns:
        A new list holding one rewritten description per input, in input
        order. Rules are applied in the mapping's iteration order, and each
        rule sees the output of the previous one.

    Raises:
        re.error: If any key of ``abbreviations`` is not a valid pattern.
    """
    # Compile each pattern once up front: the rules do not change between
    # descriptions, so resolving them inside the inner loop is repeated work.
    compiled_rules = [
        (re.compile(pattern), replacement)
        for pattern, replacement in abbreviations.items()
    ]

    replaced_descriptions = []
    for description in tag_descriptions:
        for pattern, replacement in compiled_rules:
            description = pattern.sub(replacement, description)
        replaced_descriptions.append(description)
    return replaced_descriptions
|
||
|
|
||
|
|
||
|
# %%
|
||
|
file_path = '../../data_import/exports/raw_data.csv'  # Adjust this path to your actual file location
df = pd.read_csv(file_path)

# %%
# Expand the abbreviations found in the tag descriptions.
print("running substitution")
# Missing descriptions are replaced with the literal "N/A" before substitution.
tag_descriptions = df['tag_description'].fillna("N/A")
replaced_descriptions = replace_abbreviations(
    tag_descriptions,
    replacement_dict,
)
# Debug aid: print("Descriptions after replacement:", replaced_descriptions)

# %%
# Overwrite the column with the expanded text and export without the index.
df["tag_description"] = replaced_descriptions
df.to_csv("../exports/preprocessed_data.csv", index=False)
print("file saved")
|
||
|
# %%
|