Chore: re-organized the data_import directory to use .py files
Doc: added README.md explaining the purpose of each file, with usage instructions
parent: 24829c7abf
commit: 67f3712ea6
README.md
@@ -1 +1,11 @@
# hipom_data_mapping

## Before we begin

This repository uses `.py` files rather than `.ipynb` notebooks for greater clarity.

If you use VS Code, you can run the `.py` files interactively with its built-in IPython support.

To generate an `.ipynb` file from a `.py` file, run:

`jupytext --to notebook your_script.py`
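The committed scripts carry a jupytext header declaring `formats: ipynb,py:percent`, so the two representations can also be kept paired rather than converted one-off; a minimal sketch, assuming jupytext is installed:

```sh
# pair the script with a notebook twin (writes the pairing into the file header)
jupytext --set-formats ipynb,py:percent your_script.py

# after editing either representation, propagate the changes to the other
jupytext --sync your_script.py
```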
data_import/.gitignore
@@ -0,0 +1,3 @@
db_connection_info.txt
exports/*
outputs/*
data_import/README.md
@@ -0,0 +1,42 @@
# Data Import

## What is this folder

This folder contains the files needed to import data from the remote database
into local CSV files.

It contains the following files:

- `select_db.py`:
  - use this to pull the raw datasets `data_mapping.csv` and
    `data_model_master_export.csv`
- `make_csv.py`:
  - performs basic processing
  - produces the following files (see the loading sketch after this list):
    - `raw_data.csv`: `data_mapping.csv` with some fields removed
    - `data_mapping_mdm.csv`: the MDM subset of `raw_data.csv`
- `make_figures` sub-directory:
  - `plot_class_token.ipynb`: gets the number of thing-property combinations and
    plots the histogram of thing-property counts alongside the `tag_description`
    character counts
  - `plot_count.ipynb`: gets the counts of ship-data and platform-data
- `exports` sub-directory:
  - stores the files produced by the import
- `outputs` sub-directory:
  - stores the figures exported from `make_figures`
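For reference, downstream code reads these exports with pandas (the plot scripts load `../exports/raw_data.csv` the same way); a minimal sketch, assuming you run it from inside `data_import` after the import has finished:

```python
import pandas as pd

# load the processed exports produced by make_csv.py
raw_data = pd.read_csv('exports/raw_data.csv', dtype=str)
mdm = pd.read_csv('exports/data_mapping_mdm.csv', dtype=str)
print(raw_data.shape, mdm.shape)
```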
## Instructions

Check the following:

- Remember to activate your Python environment.
- Ensure that `db_connection_info.txt` is linked into this directory,
  e.g. `ln -s /some/directory/db_connection_info.txt .`

To import the data, execute the following:

- `cd` into this folder.
- `python select_db.py`
- `python make_csv.py`

Export files will be found in `exports`; this helps to keep the folder clean.
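Putting the steps above together, a typical end-to-end run might look like this (the symlink source path is a placeholder for wherever your credentials file lives):

```sh
cd data_import
ln -s /some/directory/db_connection_info.txt .  # once, to link the DB credentials
python select_db.py  # pulls data_mapping.csv and data_model_master_export.csv into exports/
python make_csv.py   # produces exports/raw_data.csv and exports/data_mapping_mdm.csv
```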
data_import/make_csv.py
@@ -2,9 +2,9 @@ import pandas as pd
 import re
 
 # Load the data_mapping CSV file
-data_mapping_file_path = 'data_import/data_mapping.csv'  # Adjust this path to your actual file location
+data_mapping_file_path = 'exports/data_mapping.csv'  # Adjust this path to your actual file location
 data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)
-df_master = pd.read_csv('data_import/data_model_master_export.csv')
+df_master = pd.read_csv('exports/data_model_master_export.csv')
 
 # Generate patterns
 data_mapping['thing_pattern'] = data_mapping['thing'].str.replace(r'\d', '#', regex=True)
@@ -23,7 +23,7 @@ fields_to_remove = ['equip_type_code', 'tx_period', 'tx_type', 'on_change_yn', '
 merged_data = data_mapping.drop(columns=fields_to_remove)
 
 # Save the updated DataFrame to a new CSV file
-output_file_path = 'data_import/raw_data.csv'
+output_file_path = 'exports/raw_data.csv'
 merged_data.to_csv(output_file_path, index=False, encoding='utf-8-sig')
 
 print(f"Updated data saved to {output_file_path}")
@@ -32,7 +32,7 @@ print(f"Updated data saved to {output_file_path}")
 data_mapping_mdm_true = merged_data[merged_data['MDM']]
 
 # Save the filtered DataFrame to a new CSV file
-mdm_true_output_file_path = 'data_import/data_mapping_mdm.csv'
+mdm_true_output_file_path = 'exports/data_mapping_mdm.csv'
 data_mapping_mdm_true.to_csv(mdm_true_output_file_path, index=False, encoding='utf-8-sig')
 
 print(f"MDM TRUE data saved to {mdm_true_output_file_path}")
data_import/make_figures/.gitignore
@@ -0,0 +1,3 @@
plot_class_token.ipynb
plot_count.ipynb
data_import/make_figures/plot_class_token.py
@@ -0,0 +1,139 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.16.4
#   kernelspec:
#     display_name: torch
#     language: python
#     name: python3
# ---

# %%
import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt

# read the CSV file
df = pd.read_csv('../exports/raw_data.csv')

# keep only the rows where MDM is True
mdm_true_df = df[df['MDM'] == True]

# group by 'thing' and 'property' and concatenate 'tag_description' (treating NaN as empty)
tag_description_concatenated = mdm_true_df.groupby(['thing', 'property'])['tag_description'].apply(lambda x: ' '.join(x.dropna().astype(str))).reset_index()

# group by 'thing' and 'property' and concatenate 'tag_name' (treating NaN as empty)
tag_name_concatenated = mdm_true_df.groupby(['thing', 'property'])['tag_name'].apply(lambda x: ' '.join(x.dropna().astype(str))).reset_index()

# count the number of mappings per ('thing', 'property') pair
mapping_count = mdm_true_df.groupby(['thing', 'property']).size().reset_index(name='mapping_count')

# merge the three DataFrames: mapping_count, tag_description_concatenated, tag_name_concatenated
thing_property_grouped = pd.merge(mapping_count, tag_description_concatenated, on=['thing', 'property'])
thing_property_grouped = pd.merge(thing_property_grouped, tag_name_concatenated, on=['thing', 'property'])

# count the whitespace-separated tokens in 'tag_description'
thing_property_grouped['td_token_count'] = thing_property_grouped['tag_description'].apply(lambda x: len(re.findall(r'\S+', x)))

# count the unique tokens in 'tag_description' (unique_token_count)
thing_property_grouped['unique_token_count'] = thing_property_grouped['tag_description'].apply(lambda x: len(set(re.findall(r'\S+', x))))

# build a pattern by replacing digits with '#' in 'thing' and 'property'
thing_property_grouped['pattern'] = thing_property_grouped['thing'].str.replace(r'\d', '#', regex=True) + " " + thing_property_grouped['property'].str.replace(r'\d', '#', regex=True)

# total number of unique thing-property combinations
total_thing_property_count = thing_property_grouped.shape[0]

# output path
output_path = '../outputs/thing_property_grouped.csv'

# create the output directory if it does not exist
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

# save the result to a CSV file
thing_property_grouped.to_csv(output_path, index=False, encoding='utf-8-sig')

# print the results
print(f"Concatenated data saved to {output_path}")
print(f"Total number of unique thing_property combinations: {total_thing_property_count}")


# %%

# Left axis: plotting the histogram for mapping_count
fig, ax1 = plt.subplots(figsize=(12, 8))

# Histogram for mapping_count
ax1.hist(thing_property_grouped['mapping_count'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
ax1.set_xlabel('Mapping Count', fontsize=24, color='black')  # X-axis label with larger font
ax1.set_ylabel('Frequency', fontsize=24, color='black')  # Y-axis label with larger font
ax1.grid(True, linestyle='--', alpha=0.7)

# Set axis color to black
ax1.spines['bottom'].set_color('black')
ax1.spines['top'].set_color('black')
ax1.spines['right'].set_color('black')
ax1.spines['left'].set_color('black')

# Make tick labels larger
ax1.tick_params(axis='x', colors='black', labelsize=18)
ax1.tick_params(axis='y', colors='black', labelsize=18)

# Right axis: plotting unique_token_count min, max, and average
ax2 = ax1.twinx()

# Group by mapping_count to calculate min, max, and average of unique_token_count
grouped_token_stats = thing_property_grouped.groupby('mapping_count')['unique_token_count'].agg(['min', 'max', 'mean']).reset_index()

# Plot the min-max range as a shaded area
ax2.fill_between(grouped_token_stats['mapping_count'],
                 grouped_token_stats['min'],
                 grouped_token_stats['max'],
                 color='lightgray', alpha=0.5, label='Min-Max Range')

# Plot the average unique_token_count as a line
ax2.plot(grouped_token_stats['mapping_count'],
         grouped_token_stats['mean'],
         color='red', marker='o', linestyle='-', label='Average Unique Token Count')

ax2.set_ylabel('Unique Token Count (Min/Max/Avg)', fontsize=24, color='black')  # Larger font for right Y-axis label
ax2.tick_params(axis='y', colors='black', labelsize=18)

# Add legends
ax1.legend(['Frequency'], loc='upper left', fontsize=18)
ax2.legend(loc='upper right', fontsize=18)

# Add a logarithmic trendline
# Applying log to mapping_count for the trendline
log_mapping_count = np.log(grouped_token_stats['mapping_count'])

# Fit a linear model on the log of the mapping_count
z = np.polyfit(log_mapping_count, grouped_token_stats['mean'], 1)  # Linear fit on log-transformed data
p = np.poly1d(z)

# Generate x values and corresponding y values for the trendline
x_vals = np.linspace(grouped_token_stats['mapping_count'].min(), grouped_token_stats['mapping_count'].max(), 500)
log_x_vals = np.log(x_vals)
y_vals = p(log_x_vals)

# Plot the logarithmic trendline
ax2.plot(x_vals, y_vals, color='green', linestyle='--', label='Logarithmic Trendline')

# Add the trendline to the legend
ax2.legend(loc='upper right', fontsize=18)

plt.tight_layout()

plt.savefig('../outputs/thing-property_histogram_with_char_count.png')
plt.show()


# %%
data_import/make_figures/plot_count.py
@@ -0,0 +1,143 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.16.4
#   kernelspec:
#     display_name: hug
#     language: python
#     name: python3
# ---

# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# note: we assume that you will execute from the directory of this code
# check your current directory
print("Current Working Directory:", os.getcwd())


# %%

# set the global font size
plt.rcParams.update({'font.size': 18})

# read the CSV file
df = pd.read_csv('../exports/raw_data.csv')

# count the total number of rows per ships_idx
total_counts = df['ships_idx'].value_counts().sort_index()

# count the number of MDM=True rows per ships_idx
mdm_true_counts = df[df['MDM'] == True]['ships_idx'].value_counts().sort_index()

# combine the counts into one DataFrame
summary_df = pd.DataFrame({
    'SD': total_counts,
    'PD': mdm_true_counts
}).fillna(0)  # replace NaN values with 0

# compute the totals of SD and PD
total_SD = summary_df['SD'].sum()
total_PD = summary_df['PD'].sum()

# print the totals
print(f"Total SD: {total_SD}")
print(f"Total PD: {total_PD}")

# visualization
fig, ax = plt.subplots(figsize=(10, 6))

# draw the total counts first (wide bars)
summary_df['SD'].plot(kind='barh', ax=ax, color='orange', alpha=0.5, label='SD', width=0.8)  # widen the bars

# overlay the MDM=True counts on top (wide bars)
summary_df['PD'].plot(kind='barh', ax=ax, color='blue', alpha=0.7, label='PD', width=0.8)  # widen the bars

# set the y-axis labels in steps of 10
y_labels = ax.get_yticks()
ax.set_yticks(np.arange(min(y_labels), max(y_labels) + 1, 10))
ax.set_yticklabels([int(label) for label in np.arange(min(y_labels), max(y_labels) + 1, 10)])

# add a grid
ax.grid(True)

# legend and axis labels
plt.legend(prop={'size': 18})  # legend font size
plt.xlabel('Counts')
plt.ylabel('Ships')


# save to outputs
# Save the plot to the specified folder
plt.savefig('../outputs/count_statistics_of_each_ship.png')

# show the plot
plt.show()


# %%

# compute the totals of SD and PD
total_SD = summary_df['SD'].sum()  # total count of SD
total_PD = summary_df['PD'].sum()  # total count of PD

# compute the character length of each tag_description
df['tag_description_length'] = df['tag_description'].astype(str).apply(len)

# compute the average character length of tag_description
mean_tag_description_length = df['tag_description_length'].mean()

# print the result
print(f"Average tag_description length: {mean_tag_description_length:.2f}")

# visualize the length distribution as a histogram
plt.figure(figsize=(10, 6))
plt.hist(df['tag_description_length'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
plt.title('Distribution of Tag Description Lengths')
plt.xlabel('Tag Description Length (characters)')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# split tag_description on ' ' and count the tokens
df['tag_description_tokens'] = df['tag_description'].astype(str).apply(lambda x: len(x.split(' ')))

# compute the average token count of tag_description
mean_tag_description_tokens = df['tag_description_tokens'].mean()

# print the result
print(f"Average tag_description token count: {mean_tag_description_tokens:.2f}")

# visualize the token count distribution as a histogram
plt.figure(figsize=(10, 6))
plt.hist(df['tag_description_tokens'], bins=30, color='lightgreen', edgecolor='black', alpha=0.7)
plt.title('Distribution of Tag Description Tokens')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# count all tokens and unique tokens across the whole column
all_tokens = df['tag_description'].astype(str).apply(lambda x: x.split(' ')).sum()  # list of all tokens
unique_tokens = set(all_tokens)  # set of unique tokens

# compute the total and unique token counts
total_token_count = len(all_tokens)
unique_token_count = len(unique_tokens)

# print the results
print(f"Total token count: {total_token_count}")
print(f"Unique token count: {unique_token_count}")


# %%
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
data_import/select_db.py
@@ -38,7 +38,7 @@ try:
     results_mapping = cursor.fetchall()
     columns_mapping = [desc[0] for desc in cursor.description]
     df_mapping = pd.DataFrame(results_mapping, columns=columns_mapping)
-    df_mapping.to_csv('data_import/data_mapping.csv', index=False, encoding='utf-8-sig')
+    df_mapping.to_csv('exports/data_mapping.csv', index=False, encoding='utf-8-sig')
 
     # Export data_master_model table
     query_master = """
@@ -48,9 +48,9 @@ try:
     results_master = cursor.fetchall()
     columns_master = [desc[0] for desc in cursor.description]
     df_master = pd.DataFrame(results_master, columns=columns_master)
-    df_master.to_csv('data_import/data_model_master_export.csv', index=False, encoding='utf-8-sig')
+    df_master.to_csv('exports/data_model_master_export.csv', index=False, encoding='utf-8-sig')
 
-    print("Data exported successfully to 'data_import/data_mapping.csv' and 'data_import/data_model_master_export.csv'")
+    print("Data exported successfully to 'exports/data_mapping.csv' and 'exports/data_model_master_export.csv'")
 
 except (Exception, psycopg2.DatabaseError) as error:
     print(f"An error occurred: {error}")