Chore: re-organized data_import directory to use .py files

Doc: added README.md explaining purpose of each file and instructions
Richard Wong 2024-10-29 20:07:51 +09:00
parent 24829c7abf
commit 67f3712ea6
11 changed files with 348 additions and 402 deletions

0
.gitignore vendored Normal file

README.md
@@ -1 +1,11 @@
-# hipom_data_mapping
+# hipom_data_mapping
+## Before we begin
+This repository uses `.py` files rather than `.ipynb` notebooks for greater clarity.
+If you use VS Code, you can run the `.py` files interactively through its built-in IPython support.
+To generate an `.ipynb` file from a `.py` file, run:
+`jupytext --to notebook your_script.py`
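
For context, jupytext's `py:percent` format marks notebook cells with `# %%` comments, so a plain script round-trips to a notebook. A minimal sketch of such a paired script (the cell contents here are illustrative only):

```python
# %% [markdown]
# Lines in a cell marked this way become a markdown cell in the generated notebook.

# %%
# Code cells are delimited by bare `# %%` markers.
import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]})
print(df.describe())
```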

3
data_import/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
db_connection_info.txt
exports/*
outputs/*

42
data_import/README.md Normal file

@@ -0,0 +1,42 @@
# Data Import
## What is this folder
This folder contains the scripts needed to export data from the remote database
into local CSV files.
It contains the following files:
- `select_db.py`:
  - use this to pull the raw datasets `data_mapping.csv` and
    `data_model_master_export.csv`
- `make_csv.py`:
  - performs basic processing
  - produces the following files:
    - `raw_data.csv`: `data_mapping.csv` with some fields removed
    - `data_mapping_mdm.csv`: the MDM subset of `raw_data.csv`
- `make_figures` sub-directory:
  - `plot_class_token.py`: gets the number of thing-property combinations and
    plots the histogram of thing-property mapping counts along with the
    tag_description unique token counts
  - `plot_count.py`: gets the counts of ship-data and platform-data records
- `exports` sub-directory:
  - stores the files produced by the import scripts
- `outputs` sub-directory:
  - stores the figures exported from `make_figures`
## Instructions
Check the following:
- Remember to activate your Python environment
- Ensure that `db_connection_info.txt` is linked into this directory
  - e.g. `ln -s /some/directory/db_connection_info.txt .`
To import the data, execute the following:
- `cd` into this folder
- `python select_db.py`
- `python make_csv.py`
Export files will be found in `exports`, which keeps the rest of the folder clean.
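
After both scripts have run, a quick sanity check of the exports (a minimal sketch, assuming the default output filenames above):

```python
import pandas as pd

# Confirm each expected export exists and is non-empty.
for name in ['data_mapping.csv', 'data_model_master_export.csv',
             'raw_data.csv', 'data_mapping_mdm.csv']:
    df = pd.read_csv(f'exports/{name}', dtype=str)
    print(f'{name}: {len(df)} rows, {len(df.columns)} columns')
```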

data_import/make_csv.py

@@ -2,9 +2,9 @@ import pandas as pd
import re
# Load the data_mapping CSV file
-data_mapping_file_path = 'data_import/data_mapping.csv' # Adjust this path to your actual file location
+data_mapping_file_path = 'exports/data_mapping.csv' # Adjust this path to your actual file location
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)
-df_master = pd.read_csv('data_import/data_model_master_export.csv')
+df_master = pd.read_csv('exports/data_model_master_export.csv')
# Generate patterns
data_mapping['thing_pattern'] = data_mapping['thing'].str.replace(r'\d', '#', regex=True)
@@ -23,7 +23,7 @@ fields_to_remove = ['equip_type_code', 'tx_period', 'tx_type', 'on_change_yn', '
merged_data = data_mapping.drop(columns=fields_to_remove)
# Save the updated DataFrame to a new CSV file
-output_file_path = 'data_import/raw_data.csv'
+output_file_path = 'exports/raw_data.csv'
merged_data.to_csv(output_file_path, index=False, encoding='utf-8-sig')
print(f"Updated data saved to {output_file_path}")
@@ -32,7 +32,7 @@ print(f"Updated data saved to {output_file_path}")
data_mapping_mdm_true = merged_data[merged_data['MDM']]
# Save the filtered DataFrame to a new CSV file
-mdm_true_output_file_path = 'data_import/data_mapping_mdm.csv'
+mdm_true_output_file_path = 'exports/data_mapping_mdm.csv'
data_mapping_mdm_true.to_csv(mdm_true_output_file_path, index=False, encoding='utf-8-sig')
print(f"MDM TRUE data saved to {mdm_true_output_file_path}")

3
data_import/make_figures/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
plot_class_token.ipynb
plot_count.ipynb

139
data_import/make_figures/plot_class_token.py Normal file

@@ -0,0 +1,139 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.16.4
# kernelspec:
# display_name: torch
# language: python
# name: python3
# ---
# %%
import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt
# Read the CSV file
df = pd.read_csv('../exports/raw_data.csv')
# Keep only rows where MDM is True
mdm_true_df = df[df['MDM'] == True]
# Group by 'thing' and 'property' and concatenate 'tag_description' (dropping NaN values)
tag_description_concatenated = mdm_true_df.groupby(['thing', 'property'])['tag_description'].apply(lambda x: ' '.join(x.dropna().astype(str))).reset_index()
# Group by 'thing' and 'property' and concatenate 'tag_name' (dropping NaN values)
tag_name_concatenated = mdm_true_df.groupby(['thing', 'property'])['tag_name'].apply(lambda x: ' '.join(x.dropna().astype(str))).reset_index()
# Count the number of mappings per ('thing', 'property') pair
mapping_count = mdm_true_df.groupby(['thing', 'property']).size().reset_index(name='mapping_count')
# Merge the three DataFrames: mapping_count, tag_description_concatenated, tag_name_concatenated
thing_property_grouped = pd.merge(mapping_count, tag_description_concatenated, on=['thing', 'property'])
thing_property_grouped = pd.merge(thing_property_grouped, tag_name_concatenated, on=['thing', 'property'])
# Count whitespace-separated tokens in 'tag_description'
thing_property_grouped['td_token_count'] = thing_property_grouped['tag_description'].apply(lambda x: len(re.findall(r'\S+', x)))
# Count unique tokens in 'tag_description' (unique_token_count)
thing_property_grouped['unique_token_count'] = thing_property_grouped['tag_description'].apply(lambda x: len(set(re.findall(r'\S+', x))))
# Build the pattern by replacing digits in 'thing' and 'property' with '#'
thing_property_grouped['pattern'] = thing_property_grouped['thing'].str.replace(r'\d', '#', regex=True) + " " + thing_property_grouped['property'].str.replace(r'\d', '#', regex=True)
# Total number of unique thing_property combinations
total_thing_property_count = thing_property_grouped.shape[0]
# Output path
output_path = '../outputs/thing_property_grouped.csv'
# Create the output directory if it does not exist
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
# Save the result to a CSV file
thing_property_grouped.to_csv(output_path, index=False, encoding='utf-8-sig')
# Print a summary
print(f"Concatenated data saved to {output_path}")
print(f"Total number of unique thing_property combinations: {total_thing_property_count}")
# %%
# Left axis: Plotting the histogram for mapping_count
fig, ax1 = plt.subplots(figsize=(12, 8))
# Histogram for mapping_count
ax1.hist(thing_property_grouped['mapping_count'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
ax1.set_xlabel('Mapping Count', fontsize=24, color='black') # X-axis label with larger font
ax1.set_ylabel('Frequency', fontsize=24, color='black') # Y-axis label with larger font
ax1.grid(True, linestyle='--', alpha=0.7)
# Set axis color to black
ax1.spines['bottom'].set_color('black')
ax1.spines['top'].set_color('black')
ax1.spines['right'].set_color('black')
ax1.spines['left'].set_color('black')
# Make tick labels larger
ax1.tick_params(axis='x', colors='black', labelsize=18)
ax1.tick_params(axis='y', colors='black', labelsize=18)
# Right axis: Plotting unique_token_count min, max, and average
ax2 = ax1.twinx()
# Group by mapping_count to calculate min, max, and average of unique_token_count
grouped_token_stats = thing_property_grouped.groupby('mapping_count')['unique_token_count'].agg(['min', 'max', 'mean']).reset_index()
# Plot the min-max range as a shaded area
ax2.fill_between(grouped_token_stats['mapping_count'],
grouped_token_stats['min'],
grouped_token_stats['max'],
color='lightgray', alpha=0.5, label='Min-Max Range')
# Plot the average unique_token_count as a line
ax2.plot(grouped_token_stats['mapping_count'],
grouped_token_stats['mean'],
color='red', marker='o', linestyle='-', label='Average Unique Token Count')
ax2.set_ylabel('Unique Token Count (Min/Max/Avg)', fontsize=24, color='black') # Larger font for right Y-axis label
ax2.tick_params(axis='y', colors='black', labelsize=18)
# Add legends
ax1.legend(['Frequency'], loc='upper left', fontsize=18)
ax2.legend(loc='upper right', fontsize=18)
# Add a logarithmic trendline
# Applying log to mapping_count for the trendline
log_mapping_count = np.log(grouped_token_stats['mapping_count'])
# Fit a linear model on the log of the mapping_count
z = np.polyfit(log_mapping_count, grouped_token_stats['mean'], 1) # Linear fit on log-transformed data
p = np.poly1d(z)
# Generate x values and corresponding y values for the trendline
x_vals = np.linspace(grouped_token_stats['mapping_count'].min(), grouped_token_stats['mapping_count'].max(), 500)
log_x_vals = np.log(x_vals)
y_vals = p(log_x_vals)
# Plot the logarithmic trendline
ax2.plot(x_vals, y_vals, color='green', linestyle='--', label='Logarithmic Trendline')
# Add the trendline to the legend
ax2.legend(loc='upper right', fontsize=18)
plt.tight_layout()
plt.savefig('../outputs/thing-property_histogram_with_char_count.png')
plt.show()
# %%
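
The green trendline above is a linear fit on log-transformed x: `np.polyfit(np.log(x), y, 1)` returns coefficients `(a, b)` such that `y ≈ a * ln(x) + b`. A stripped-down sketch of the same idea on synthetic data:

```python
import numpy as np

rng = np.random.default_rng(0)
x = np.arange(1, 101)
y = 2.5 * np.log(x) + 1.0 + rng.normal(scale=0.1, size=x.size)

a, b = np.polyfit(np.log(x), y, 1)  # fit y = a*ln(x) + b
print(f"a = {a:.2f}, b = {b:.2f}")  # recovers roughly a=2.5, b=1.0
```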

143
data_import/make_figures/plot_count.py Normal file

@@ -0,0 +1,143 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.16.4
# kernelspec:
# display_name: hug
# language: python
# name: python3
# ---
# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
# note: we assume that you will execute from the directory of this code
# check your current directory
print("Current Working Directory:", os.getcwd())
# %%
# Set the global font size
plt.rcParams.update({'font.size': 18})
# Read the CSV file
df = pd.read_csv('../exports/raw_data.csv')
# Total record count per ships_idx
total_counts = df['ships_idx'].value_counts().sort_index()
# Count of MDM=True records per ships_idx
mdm_true_counts = df[df['MDM'] == True]['ships_idx'].value_counts().sort_index()
# Combine into a single DataFrame
summary_df = pd.DataFrame({
    'SD': total_counts,
    'PD': mdm_true_counts
}).fillna(0)  # replace NaN with 0
# Total SD and PD counts
total_SD = summary_df['SD'].sum()
total_PD = summary_df['PD'].sum()
# Print the totals
print(f"Total SD: {total_SD}")
print(f"Total PD: {total_PD}")
# Visualization
fig, ax = plt.subplots(figsize=(10, 6))
# Draw the total counts first (wide bars)
summary_df['SD'].plot(kind='barh', ax=ax, color='orange', alpha=0.5, label='SD', width=0.8)
# Overlay the MDM=True counts on top (wide bars)
summary_df['PD'].plot(kind='barh', ax=ax, color='blue', alpha=0.7, label='PD', width=0.8)
# Set y-axis ticks at intervals of 10
y_labels = ax.get_yticks()
ax.set_yticks(np.arange(min(y_labels), max(y_labels) + 1, 10))
ax.set_yticklabels([int(label) for label in np.arange(min(y_labels), max(y_labels) + 1, 10)])
# Add a grid
ax.grid(True)
# Legend and axis labels
plt.legend(prop={'size': 18})  # legend font size
plt.xlabel('Counts')
plt.ylabel('Ships')
# Save the plot to the outputs folder
plt.savefig('../outputs/count_statistics_of_each_ship.png')
# Show the plot
plt.show()
# %%
# Total SD and PD counts
total_SD = summary_df['SD'].sum()  # total SD count
total_PD = summary_df['PD'].sum()  # total PD count
# Character length of each tag_description
df['tag_description_length'] = df['tag_description'].astype(str).apply(len)
# Mean tag_description character length
mean_tag_description_length = df['tag_description_length'].mean()
# Print the result
print(f"Mean tag_description length (characters): {mean_tag_description_length:.2f}")
# Histogram of the character-length distribution
plt.figure(figsize=(10, 6))
plt.hist(df['tag_description_length'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
plt.title('Distribution of Tag Description Lengths')
plt.xlabel('Tag Description Length (characters)')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# Token count after splitting tag_description on ' '
df['tag_description_tokens'] = df['tag_description'].astype(str).apply(lambda x: len(x.split(' ')))
# Mean tag_description token count
mean_tag_description_tokens = df['tag_description_tokens'].mean()
# Print the result
print(f"Mean tag_description token count: {mean_tag_description_tokens:.2f}")
# Histogram of the token-count distribution
plt.figure(figsize=(10, 6))
plt.hist(df['tag_description_tokens'], bins=30, color='lightgreen', edgecolor='black', alpha=0.7)
plt.title('Distribution of Tag Description Tokens')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# Total and unique token counts over the whole column
all_tokens = df['tag_description'].astype(str).apply(lambda x: x.split(' ')).sum()  # flat list of all tokens
unique_tokens = set(all_tokens)  # set of unique tokens
total_token_count = len(all_tokens)
unique_token_count = len(unique_tokens)
# Print the results
print(f"Total tokens: {total_token_count}")
print(f"Unique tokens: {unique_token_count}")
# %%
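
One caveat on the cell above: calling `.sum()` on a Series of lists concatenates them pairwise, which is quadratic in the number of rows. An equivalent linear-time version (a sketch, not part of this commit):

```python
from itertools import chain

import pandas as pd

df = pd.read_csv('../exports/raw_data.csv')
token_lists = df['tag_description'].astype(str).str.split(' ')
all_tokens = list(chain.from_iterable(token_lists))
print(f"Total tokens: {len(all_tokens)}")
print(f"Unique tokens: {len(set(all_tokens))}")
```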

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

data_import/select_db.py

@@ -38,7 +38,7 @@ try:
results_mapping = cursor.fetchall()
columns_mapping = [desc[0] for desc in cursor.description]
df_mapping = pd.DataFrame(results_mapping, columns=columns_mapping)
-df_mapping.to_csv('data_import/data_mapping.csv', index=False, encoding='utf-8-sig')
+df_mapping.to_csv('exports/data_mapping.csv', index=False, encoding='utf-8-sig')
# Export data_master_model table
query_master = """
@@ -48,9 +48,9 @@ try:
results_master = cursor.fetchall()
columns_master = [desc[0] for desc in cursor.description]
df_master = pd.DataFrame(results_master, columns=columns_master)
-df_master.to_csv('data_import/data_model_master_export.csv', index=False, encoding='utf-8-sig')
+df_master.to_csv('exports/data_model_master_export.csv', index=False, encoding='utf-8-sig')
-print("Data exported successfully to 'data_import/data_mapping.csv' and 'data_import/data_model_master_export.csv'")
+print("Data exported successfully to 'exports/data_mapping.csv' and 'exports/data_model_master_export.csv'")
except (Exception, psycopg2.DatabaseError) as error:
print(f"An error occurred: {error}")