Chore: re-organized the data_import directory to use .py files
Doc: added README.md explaining the purpose of each file, with usage instructions
parent: 24829c7abf
commit: 67f3712ea6
README.md
@@ -1 +1,11 @@
# hipom_data_mapping

## Before we begin

This repository uses `.py` files rather than `.ipynb` notebooks for greater clarity.

If you use VS Code, you can run the `.py` files interactively with its built-in IPython support.

To generate an `.ipynb` file from a `.py` file, run:

`jupytext --to notebook your_script.py`
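The committed scripts carry a jupytext header declaring `formats: ipynb,py:percent`, so the two representations can also be kept paired rather than converted one-off; a minimal sketch, assuming jupytext is installed:

```sh
# pair the script with a notebook twin (writes the pairing into the file header)
jupytext --set-formats ipynb,py:percent your_script.py

# after editing either representation, propagate the changes to the other
jupytext --sync your_script.py
```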
data_import/.gitignore
@@ -0,0 +1,3 @@
db_connection_info.txt
exports/*
outputs/*
data_import/README.md
@@ -0,0 +1,42 @@
# Data Import

## What is this folder

This folder contains the files needed to import data from the remote database
into local CSV files.

It contains the following files:

- `select_db.py`:
  - use this to pull the raw datasets `data_mapping.csv` and
    `data_model_master_export.csv`
- `make_csv.py`:
  - performs basic processing
  - produces the following files (see the loading sketch after this list):
    - `raw_data.csv`: `data_mapping.csv` with some fields removed
    - `data_mapping_mdm.csv`: the MDM subset of `raw_data.csv`
- `make_figures` sub-directory:
  - `plot_class_token.ipynb`: gets the number of thing-property combinations and
    plots the histogram of thing-property counts alongside the `tag_description`
    character counts
  - `plot_count.ipynb`: gets the counts of ship-data and platform-data
- `exports` sub-directory:
  - stores the files produced by the import
- `outputs` sub-directory:
  - stores the figures exported from `make_figures`
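For reference, downstream code reads these exports with pandas (the plot scripts load `../exports/raw_data.csv` the same way); a minimal sketch, assuming you run it from inside `data_import` after the import has finished:

```python
import pandas as pd

# load the processed exports produced by make_csv.py
raw_data = pd.read_csv('exports/raw_data.csv', dtype=str)
mdm = pd.read_csv('exports/data_mapping_mdm.csv', dtype=str)
print(raw_data.shape, mdm.shape)
```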
## Instructions

Check the following:

- Remember to activate your Python environment.
- Ensure that `db_connection_info.txt` is linked into this directory,
  e.g. `ln -s /some/directory/db_connection_info.txt .`

To import the data, execute the following:

- `cd` into this folder.
- `python select_db.py`
- `python make_csv.py`

Export files will be found in `exports`; this helps to keep the folder clean.
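Putting the steps above together, a typical end-to-end run might look like this (the symlink source path is a placeholder for wherever your credentials file lives):

```sh
cd data_import
ln -s /some/directory/db_connection_info.txt .  # once, to link the DB credentials
python select_db.py  # pulls data_mapping.csv and data_model_master_export.csv into exports/
python make_csv.py   # produces exports/raw_data.csv and exports/data_mapping_mdm.csv
```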
data_import/make_csv.py
@@ -2,9 +2,9 @@ import pandas as pd
 import re
 
 # Load the data_mapping CSV file
-data_mapping_file_path = 'data_import/data_mapping.csv'  # Adjust this path to your actual file location
+data_mapping_file_path = 'exports/data_mapping.csv'  # Adjust this path to your actual file location
 data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)
-df_master = pd.read_csv('data_import/data_model_master_export.csv')
+df_master = pd.read_csv('exports/data_model_master_export.csv')
 
 # Generate patterns
 data_mapping['thing_pattern'] = data_mapping['thing'].str.replace(r'\d', '#', regex=True)
@@ -23,7 +23,7 @@ fields_to_remove = ['equip_type_code', 'tx_period', 'tx_type', 'on_change_yn', '
 merged_data = data_mapping.drop(columns=fields_to_remove)
 
 # Save the updated DataFrame to a new CSV file
-output_file_path = 'data_import/raw_data.csv'
+output_file_path = 'exports/raw_data.csv'
 merged_data.to_csv(output_file_path, index=False, encoding='utf-8-sig')
 
 print(f"Updated data saved to {output_file_path}")
@@ -32,7 +32,7 @@ print(f"Updated data saved to {output_file_path}")
 data_mapping_mdm_true = merged_data[merged_data['MDM']]
 
 # Save the filtered DataFrame to a new CSV file
-mdm_true_output_file_path = 'data_import/data_mapping_mdm.csv'
+mdm_true_output_file_path = 'exports/data_mapping_mdm.csv'
 data_mapping_mdm_true.to_csv(mdm_true_output_file_path, index=False, encoding='utf-8-sig')
 
 print(f"MDM TRUE data saved to {mdm_true_output_file_path}")
data_import/make_figures/.gitignore
@@ -0,0 +1,3 @@
plot_class_token.ipynb
plot_count.ipynb
data_import/make_figures/plot_class_token.py
@@ -0,0 +1,139 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---

# %%
import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt

# read the CSV file
df = pd.read_csv('../exports/raw_data.csv')

# keep only the rows where MDM is True
mdm_true_df = df[df['MDM'] == True]

# group by 'thing' and 'property' and concatenate 'tag_description' (treating NaN as empty)
tag_description_concatenated = mdm_true_df.groupby(['thing', 'property'])['tag_description'].apply(lambda x: ' '.join(x.dropna().astype(str))).reset_index()

# group by 'thing' and 'property' and concatenate 'tag_name' (treating NaN as empty)
tag_name_concatenated = mdm_true_df.groupby(['thing', 'property'])['tag_name'].apply(lambda x: ' '.join(x.dropna().astype(str))).reset_index()

# count the number of mappings per ('thing', 'property') pair
mapping_count = mdm_true_df.groupby(['thing', 'property']).size().reset_index(name='mapping_count')

# merge the three DataFrames: mapping_count, tag_description_concatenated, tag_name_concatenated
thing_property_grouped = pd.merge(mapping_count, tag_description_concatenated, on=['thing', 'property'])
thing_property_grouped = pd.merge(thing_property_grouped, tag_name_concatenated, on=['thing', 'property'])

# count the whitespace-separated tokens in 'tag_description'
thing_property_grouped['td_token_count'] = thing_property_grouped['tag_description'].apply(lambda x: len(re.findall(r'\S+', x)))

# count the unique tokens in 'tag_description' (unique_token_count)
thing_property_grouped['unique_token_count'] = thing_property_grouped['tag_description'].apply(lambda x: len(set(re.findall(r'\S+', x))))

# build a pattern by replacing digits with '#' in 'thing' and 'property'
thing_property_grouped['pattern'] = thing_property_grouped['thing'].str.replace(r'\d', '#', regex=True) + " " + thing_property_grouped['property'].str.replace(r'\d', '#', regex=True)

# total number of unique thing-property combinations
total_thing_property_count = thing_property_grouped.shape[0]

# output path
output_path = '../outputs/thing_property_grouped.csv'

# create the output directory if it does not exist
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

# save the result to a CSV file
thing_property_grouped.to_csv(output_path, index=False, encoding='utf-8-sig')

# print the results
print(f"Concatenated data saved to {output_path}")
print(f"Total number of unique thing_property combinations: {total_thing_property_count}")


# %%

# Left axis: plotting the histogram for mapping_count
fig, ax1 = plt.subplots(figsize=(12, 8))

# Histogram for mapping_count
ax1.hist(thing_property_grouped['mapping_count'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
ax1.set_xlabel('Mapping Count', fontsize=24, color='black')  # X-axis label with larger font
ax1.set_ylabel('Frequency', fontsize=24, color='black')  # Y-axis label with larger font
ax1.grid(True, linestyle='--', alpha=0.7)

# Set axis color to black
ax1.spines['bottom'].set_color('black')
ax1.spines['top'].set_color('black')
ax1.spines['right'].set_color('black')
ax1.spines['left'].set_color('black')

# Make tick labels larger
ax1.tick_params(axis='x', colors='black', labelsize=18)
ax1.tick_params(axis='y', colors='black', labelsize=18)

# Right axis: plotting unique_token_count min, max, and average
ax2 = ax1.twinx()

# Group by mapping_count to calculate min, max, and average of unique_token_count
grouped_token_stats = thing_property_grouped.groupby('mapping_count')['unique_token_count'].agg(['min', 'max', 'mean']).reset_index()

# Plot the min-max range as a shaded area
ax2.fill_between(grouped_token_stats['mapping_count'],
                 grouped_token_stats['min'],
                 grouped_token_stats['max'],
                 color='lightgray', alpha=0.5, label='Min-Max Range')

# Plot the average unique_token_count as a line
ax2.plot(grouped_token_stats['mapping_count'],
         grouped_token_stats['mean'],
         color='red', marker='o', linestyle='-', label='Average Unique Token Count')

ax2.set_ylabel('Unique Token Count (Min/Max/Avg)', fontsize=24, color='black')  # Larger font for right Y-axis label
ax2.tick_params(axis='y', colors='black', labelsize=18)

# Add legends
ax1.legend(['Frequency'], loc='upper left', fontsize=18)
ax2.legend(loc='upper right', fontsize=18)

# Add a logarithmic trendline
# Applying log to mapping_count for the trendline
log_mapping_count = np.log(grouped_token_stats['mapping_count'])

# Fit a linear model on the log of the mapping_count
z = np.polyfit(log_mapping_count, grouped_token_stats['mean'], 1)  # Linear fit on log-transformed data
p = np.poly1d(z)

# Generate x values and corresponding y values for the trendline
x_vals = np.linspace(grouped_token_stats['mapping_count'].min(), grouped_token_stats['mapping_count'].max(), 500)
log_x_vals = np.log(x_vals)
y_vals = p(log_x_vals)

# Plot the logarithmic trendline
ax2.plot(x_vals, y_vals, color='green', linestyle='--', label='Logarithmic Trendline')

# Add the trendline to the legend
ax2.legend(loc='upper right', fontsize=18)

plt.tight_layout()

plt.savefig('../outputs/thing-property_histogram_with_char_count.png')
plt.show()


# %%
data_import/make_figures/plot_count.py
@@ -0,0 +1,143 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.16.4
#   kernelspec:
#     display_name: hug
#     language: python
#     name: python3
# ---

# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# note: we assume that you will execute from the directory of this code
# check your current directory
print("Current Working Directory:", os.getcwd())


# %%

# set the global font size
plt.rcParams.update({'font.size': 18})

# read the CSV file
df = pd.read_csv('../exports/raw_data.csv')

# count the total number of rows per ships_idx
total_counts = df['ships_idx'].value_counts().sort_index()

# count the number of MDM=True rows per ships_idx
mdm_true_counts = df[df['MDM'] == True]['ships_idx'].value_counts().sort_index()

# combine the counts into one DataFrame
summary_df = pd.DataFrame({
    'SD': total_counts,
    'PD': mdm_true_counts
}).fillna(0)  # replace NaN values with 0

# compute the totals of SD and PD
total_SD = summary_df['SD'].sum()
total_PD = summary_df['PD'].sum()

# print the totals
print(f"Total SD: {total_SD}")
print(f"Total PD: {total_PD}")

# visualization
fig, ax = plt.subplots(figsize=(10, 6))

# draw the total counts first (wide bars)
summary_df['SD'].plot(kind='barh', ax=ax, color='orange', alpha=0.5, label='SD', width=0.8)  # widen the bars

# overlay the MDM=True counts on top (wide bars)
summary_df['PD'].plot(kind='barh', ax=ax, color='blue', alpha=0.7, label='PD', width=0.8)  # widen the bars

# set the y-axis labels in steps of 10
y_labels = ax.get_yticks()
ax.set_yticks(np.arange(min(y_labels), max(y_labels) + 1, 10))
ax.set_yticklabels([int(label) for label in np.arange(min(y_labels), max(y_labels) + 1, 10)])

# add a grid
ax.grid(True)

# legend and axis labels
plt.legend(prop={'size': 18})  # legend font size
plt.xlabel('Counts')
plt.ylabel('Ships')


# save to outputs
# Save the plot to the specified folder
plt.savefig('../outputs/count_statistics_of_each_ship.png')

# show the plot
plt.show()


# %%

# compute the totals of SD and PD
total_SD = summary_df['SD'].sum()  # total count of SD
total_PD = summary_df['PD'].sum()  # total count of PD

# compute the character length of each tag_description
df['tag_description_length'] = df['tag_description'].astype(str).apply(len)

# compute the average character length of tag_description
mean_tag_description_length = df['tag_description_length'].mean()

# print the result
print(f"Average tag_description length: {mean_tag_description_length:.2f}")

# visualize the length distribution as a histogram
plt.figure(figsize=(10, 6))
plt.hist(df['tag_description_length'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
plt.title('Distribution of Tag Description Lengths')
plt.xlabel('Tag Description Length (characters)')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# split tag_description on ' ' and count the tokens
df['tag_description_tokens'] = df['tag_description'].astype(str).apply(lambda x: len(x.split(' ')))

# compute the average token count of tag_description
mean_tag_description_tokens = df['tag_description_tokens'].mean()

# print the result
print(f"Average tag_description token count: {mean_tag_description_tokens:.2f}")

# visualize the token count distribution as a histogram
plt.figure(figsize=(10, 6))
plt.hist(df['tag_description_tokens'], bins=30, color='lightgreen', edgecolor='black', alpha=0.7)
plt.title('Distribution of Tag Description Tokens')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# count all tokens and unique tokens across the whole column
all_tokens = df['tag_description'].astype(str).apply(lambda x: x.split(' ')).sum()  # list of all tokens
unique_tokens = set(all_tokens)  # set of unique tokens

# compute the total and unique token counts
total_token_count = len(all_tokens)
unique_token_count = len(unique_tokens)

# print the results
print(f"Total token count: {total_token_count}")
print(f"Unique token count: {unique_token_count}")


# %%
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
data_import/select_db.py
@@ -38,7 +38,7 @@ try:
     results_mapping = cursor.fetchall()
     columns_mapping = [desc[0] for desc in cursor.description]
     df_mapping = pd.DataFrame(results_mapping, columns=columns_mapping)
-    df_mapping.to_csv('data_import/data_mapping.csv', index=False, encoding='utf-8-sig')
+    df_mapping.to_csv('exports/data_mapping.csv', index=False, encoding='utf-8-sig')
 
     # Export data_master_model table
     query_master = """
@@ -48,9 +48,9 @@ try:
     results_master = cursor.fetchall()
     columns_master = [desc[0] for desc in cursor.description]
     df_master = pd.DataFrame(results_master, columns=columns_master)
-    df_master.to_csv('data_import/data_model_master_export.csv', index=False, encoding='utf-8-sig')
+    df_master.to_csv('exports/data_model_master_export.csv', index=False, encoding='utf-8-sig')
 
-    print("Data exported successfully to 'data_import/data_mapping.csv' and 'data_import/data_model_master_export.csv'")
+    print("Data exported successfully to 'exports/data_mapping.csv' and 'exports/data_model_master_export.csv'")
 
 except (Exception, psycopg2.DatabaseError) as error:
     print(f"An error occurred: {error}")