# hipom_data_mapping/analysis/categories/label_inconsistencies.py

# we want to compare the labels between the train data and test data
# %%
import pandas as pd

#########################
# experiment 1

#############
# 1 import test data
# %%
# Fold whose prediction export is loaded for this analysis.
fold = 1
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
df = pd.read_csv(
    data_path,
    skipinitialspace=True,
)

# %%
# subset to mdm: the 'MDM' column is used directly as a row mask
df = df.loc[df['MDM']]

# Rows where the predicted thing differs from the labelled thing pattern.
thing_condition = df['p_thing'].eq(df['thing_pattern'])
error_thing_df = df.loc[
    ~thing_condition,
    ['tag_description', 'thing_pattern', 'p_thing'],
]

# Rows where the predicted property differs from the labelled property pattern.
property_condition = df['p_property'].eq(df['property_pattern'])
error_property_df = df.loc[
    ~property_condition,
    ['tag_description', 'property_pattern', 'p_property'],
]

# Rows where both predictions match; only the property columns are kept.
correct_df = df.loc[
    thing_condition & property_condition,
    ['tag_description', 'property_pattern', 'p_property'],
]

# Same object as the MDM subset, under the name the later cells use.
test_df = df


# %%
# Combined label per test row: "<thing_pattern> <property_pattern>".
test_pattern = df['thing_pattern'] + ' ' + df['property_pattern']

##########################
# 2 import train data
# %%
# Training split of the same fold as the test export.
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(data_path)

# Label column of the training data, compared against test_pattern below.
train_pattern = train_df['pattern']


# %%
# Collapse both label columns into sets of distinct values.
train_pattern_set = set(train_pattern)
test_pattern_set = set(test_pattern)

# %%
# Labels that occur in the test data but never in the training data.
test_pattern_set.difference(train_pattern_set)

# verdict: FOMassFlowTotal is not found in the training set,
# so it cannot be classified correctly.


###################################
# experiment 2
# %%
# Compare the rows labelled 'Load' with those labelled 'LoadPercent'.
# First cell: rows whose property_pattern is exactly 'Load'.
test_df.loc[test_df['property_pattern'].eq('Load')]
# %%
# Second cell: rows whose property_pattern is exactly 'LoadPercent'.
test_df.loc[test_df['property_pattern'].eq('LoadPercent')]

# verdict: the units column determines which of the two labels applies.
# To avoid disturbing the model, this distinction should be moved into
# post-processing.
Feat: added abbreviation expansion rules 2024-11-10 20:28:47 +09:00
			`# we want to compare the labels between the train data and test data`
			`# %%`
			`import pandas as pd`

			`#########################`
			`# experiment 1`

			`#############`
			`# 1 import test data`
			`# %%`
			`fold = 1`
			`data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'`
			`df = pd.read_csv(data_path, skipinitialspace=True)`

			`# %%`
			`# subset to mdm`
			`df = df[df['MDM']]`

			`thing_condition = df['p_thing'] == df['thing_pattern']`
			`error_thing_df = df[~thing_condition][['tag_description', 'thing_pattern','p_thing']]`

			`property_condition = df['p_property'] == df['property_pattern']`
			`error_property_df = df[~property_condition][['tag_description', 'property_pattern','p_property']]`

			`correct_df = df[thing_condition & property_condition][['tag_description', 'property_pattern', 'p_property']]`

			`test_df = df`


			`# %%`
			`test_pattern = df['thing_pattern'] + ' ' + df['property_pattern']`

			`##########################`
			`# 2 import train data`
			`# %%`
			`data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"`
			`train_df = pd.read_csv(data_path)`

			`train_pattern = train_df['pattern']`


			`# %%`
			`test_pattern_set = set(test_pattern)`
			`train_pattern_set = set(train_pattern)`

			`# %%`
			`# use this to get labels in test not found in training data`
			`test_pattern_set - train_pattern_set`

			`# verdict: we see that FOMassFlowTotal is not found in the training set`
			`# hence it is not possible for this to be classified correctly`


			`###################################`
			`# experiment 2`
			`# %%`
			`# we want to check load and loadpercent`
			`test_df[test_df['property_pattern'] == 'Load']`
			`# %%`
			`test_df[test_df['property_pattern'] == 'LoadPercent']`

			`# verdict: we see that the units column determine what this should be`
			`# in order to not disturb the model, we should chuck it in to post-process`