# hipom_data_mapping/analysis/pattern_filling/analysis.py
#
# 56 lines
# 1.3 KiB
# Python

# We want to see whether there are clear rules for filling in the numbers in
# the pattern format.
# %%
# %%
import pandas as pd
# from utils import Retriever, cosine_similarity_chunked
import os
import glob
import numpy as np
# %%
# Fold (data group) number; it selects which per-group CSV files are read below.
fold = 5

# Prediction export for the selected group -> test_df.
data_path = f'../../train/mapping_pattern/mapping_prediction/exports/result_group_{fold}.csv'
test_df = pd.read_csv(filepath_or_buffer=data_path, skipinitialspace=True)

# train_all.csv for the same group -> train_df.
data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
train_df = pd.read_csv(filepath_or_buffer=data_path, skipinitialspace=True)
# %%
# Load data_mapping_mdm.csv into `df`; the cells below filter this frame.
data_path = '../../data_import/exports/data_mapping_mdm.csv'
# data_path = '../../data_preprocess/exports/preprocessed_data.csv'
df = pd.read_csv(data_path, skipinitialspace=True)
# Sorted list of the distinct values in the 'pattern' column.
# Missing values are dropped first: NaN is a float, so sorted() cannot order
# it against string patterns, and a substring test such as "'#' in elem"
# would also fail on it.
mdm_list = sorted(df['pattern'].dropna().unique())
# %%
# Keep only the patterns that contain a '#' character.
symbol_pattern_list = list(filter(lambda pattern: '#' in pattern, mdm_list))
# %%
# Notebook display: the filtered patterns.
symbol_pattern_list
# %%
# Notebook display: how many patterns contain '#'.
len(symbol_pattern_list)
# %%
# Position in symbol_pattern_list of the pattern to inspect.
idx = 22
print(symbol_pattern_list[idx])
# Boolean row mask for that pattern; kept under its own name because the
# ship-level filter further down combines it with a second mask.
condition1 = df['pattern'].eq(symbol_pattern_list[idx])
subset_df = df.loc[condition1]
# Distinct ships_idx values among the matching rows (set order is unspecified).
ship = list(set(subset_df['ships_idx']))
print(ship)
# %%
# Write the listed columns of the current subset to output.csv
# (to_csv is called with its defaults, so the row index is written too).
subset_df.loc[
    :, ['thing', 'property', 'tag_name', 'tag_description', 'ships_idx']
].to_csv('output.csv')
# %%
# ships_idx value to narrow the selection to.
ship_idx = 10
condition2 = df['ships_idx'].eq(ship_idx)
# Rows that match both the selected pattern (condition1) and the selected ship.
subset_df = df.loc[condition1 & condition2]
# Notebook display of the narrowed subset.
subset_df
# %%