63 lines
1.7 KiB
Python
63 lines
1.7 KiB
Python
|
# we want to compare the labels between the train data and test data
|
||
|
# %%
|
||
|
import pandas as pd
|
||
|
|
||
|
#########################
|
||
|
# experiment 1
|
||
|
|
||
|
#############
|
||
|
# 1 import test data
|
||
|
# %%
|
||
|
# Group index that selects which result file is loaded; `fold` is also
# interpolated into the training-data path later in this script.
fold = 1
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
# skipinitialspace=True makes the parser skip spaces right after a delimiter.
df = pd.read_csv(
    filepath_or_buffer=data_path,
    skipinitialspace=True,
)
|
||
|
|
||
|
# %%
|
||
|
# subset to mdm: the 'MDM' column is used directly as a row mask
df = df[df['MDM']]

# Row-wise agreement between each predicted column (p_*) and its
# reference *_pattern column.
thing_condition = df['p_thing'].eq(df['thing_pattern'])
property_condition = df['p_property'].eq(df['property_pattern'])

# Rows where the thing prediction disagrees with the reference.
error_thing_df = df.loc[
    ~thing_condition,
    ['tag_description', 'thing_pattern', 'p_thing'],
]

# Rows where the property prediction disagrees with the reference.
error_property_df = df.loc[
    ~property_condition,
    ['tag_description', 'property_pattern', 'p_property'],
]

# Rows where both predictions agree; only the property columns are kept.
correct_df = df.loc[
    thing_condition & property_condition,
    ['tag_description', 'property_pattern', 'p_property'],
]

# Second name for the MDM-filtered frame, used by the later experiments.
test_df = df

# %%
# Combined "<thing> <property>" label for every test row.
test_pattern = (df['thing_pattern'] + ' ') + df['property_pattern']
|
||
|
|
||
|
##########################
|
||
|
# 2 import train data
|
||
|
# %%
|
||
|
# Training split for the same group index (`fold`) as the test results.
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(filepath_or_buffer=data_path)

# Label column of the training data, compared against the test labels below.
train_pattern = train_df.loc[:, 'pattern']
|
||
|
|
||
|
|
||
|
# %%
|
||
|
# Reduce both label series to their distinct values.
train_pattern_set = set(train_pattern)
test_pattern_set = set(test_pattern)

# %%
# use this to get labels in test not found in training data
test_pattern_set.difference(train_pattern_set)
|
||
|
|
||
|
# verdict: we see that FOMassFlowTotal is not found in the training set
|
||
|
# hence it is not possible for this to be classified correctly
|
||
|
|
||
|
|
||
|
###################################
|
||
|
# experiment 2
|
||
|
# %%
|
||
|
# we want to check the 'Load' and 'LoadPercent' property patterns
|
||
|
# Display the test rows whose reference property pattern is exactly 'Load'.
test_df.loc[test_df['property_pattern'].eq('Load')]
# %%
# Display the test rows whose reference property pattern is 'LoadPercent'.
test_df.loc[test_df['property_pattern'].eq('LoadPercent')]
|
||
|
|
||
|
# verdict: we see that the units column determines what this should be
|
||
|
# to avoid disturbing the model, we should handle this in post-processing
|