# hipom_data_mapping/analysis/units/unit_analysis.py

# 78 lines
# 1.9 KiB
# Python

# we want to compare the labels between the train data and test data
# %%
import pandas as pd
# %%
# Location of the raw data export.
file_path = '../../data_import/exports/raw_data.csv' # Adjust this path to your actual file location
df = pd.read_csv(file_path)
# The 'MDM' column is used directly as a boolean row mask.
df = df[df['MDM']]
# %%
# Keep string units as they are and map every non-string entry
# (e.g. NaN from an empty cell) to '' so the list holds strings only.
unit_list = ['' if not isinstance(value, str) else value for value in df['unit']]
# Show each distinct unit once, in sorted order.
print(sorted(set(unit_list)))
# %%
# Unit value to inspect; '' stands for "no unit", matching the placeholder
# used when the distinct units are listed.
test = ''
# df[df['unit'].fillna('') == test]['property_pattern'].to_list()
# Empty CSV cells are loaded as NaN, and NaN never compares equal to '',
# so missing units are normalised to '' before the comparison. Lookups for
# any non-empty unit string behave exactly as before.
df[df['unit'].fillna('') == test]
#############
# 1 import test data
# %%
# Fold number used to pick the result file (and, later, the training file).
fold = 1
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# %%
# Subset to MDM: the 'MDM' column is used directly as a boolean row mask.
df = df[df['MDM']]
test_df = df

# Row-wise equality of each 'p_*' column with its '*_pattern' counterpart.
thing_condition = df['p_thing'].eq(df['thing_pattern'])
property_condition = df['p_property'].eq(df['property_pattern'])

# Rows where the two thing columns disagree.
error_thing_df = df.loc[
    ~thing_condition,
    ['tag_description', 'thing_pattern', 'p_thing'],
]
# Rows where the two property columns disagree.
error_property_df = df.loc[
    ~property_condition,
    ['tag_description', 'property_pattern', 'p_property'],
]
# Rows where both the thing and the property columns agree.
correct_df = df.loc[
    thing_condition & property_condition,
    ['tag_description', 'property_pattern', 'p_property'],
]
# %%
# Combined label per row: '<thing_pattern> <property_pattern>'.
test_pattern = df['thing_pattern'] + ' ' + df['property_pattern']
##########################
# 2 import train data
# %%
# Training file for the same `fold` value.
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path)
train_pattern = train_df['pattern']
# %%
# Reduce both label collections to their distinct values.
train_pattern_set = set(train_pattern)
test_pattern_set = set(test_pattern)
# %%
# Labels that occur in the test data but never in the training data.
test_pattern_set.difference(train_pattern_set)
# Verdict: FOMassFlowTotal is not found in the training set,
# hence it is not possible for it to be classified correctly.
###################################
# experiment 2
# %%
# we want to check load and loadpercent
# Test rows whose 'property_pattern' is exactly 'Load'.
test_df[test_df['property_pattern'] == 'Load']
# %%
# Test rows whose 'property_pattern' is exactly 'LoadPercent'.
test_df[test_df['property_pattern'] == 'LoadPercent']
# %%
# Separate cell, so the LoadPercent rows above are displayed on their own
# instead of being hidden behind this expression's output.
# Distinct values of the 'unit' column in the current `df`.
set(df['unit'])
# %%