In [1]:
import pandas as pd
import os
# Notebook configuration: edit these two values, then re-run every cell below.
group_number = 1  # Dataset group (fold) to load; the original note lists 1, 2, 3, 4, or 5 as the choices

# Input layout used when building the model input strings.
mode = 'tn_td_unit'  # One of: 'only_td', 'tn_td', 'tn_td_min_max', 'td_min_max', 'td_unit', 'tn_td_unit'

def load_group_data(group_number, base_dir='../../data_preprocess/dataset'):
    """Load the train/valid/test CSV splits of one dataset group.

    Args:
        group_number: Identifier of the group; ``str(group_number)`` is the
            name of the sub-folder that is read.
        base_dir: Folder that contains one sub-folder per group. The default
            is the relative path this function previously had hard-coded, so
            existing one-argument calls behave as before.

    Returns:
        A tuple ``(train_data, valid_data, test_data)`` of DataFrames read
        from ``train.csv``, ``valid.csv`` and ``test.csv`` respectively.

    Raises:
        FileNotFoundError: If any of the three files is missing. The message
            lists every missing path.
    """
    group_folder = os.path.join(base_dir, str(group_number))
    split_paths = [
        os.path.join(group_folder, f'{split}.csv')
        for split in ('train', 'valid', 'test')
    ]

    # Check all three files before reading any of them, so a partially
    # prepared group fails fast and the error names exactly what is missing.
    missing_paths = [path for path in split_paths if not os.path.exists(path)]
    if missing_paths:
        raise FileNotFoundError(
            f"One or more files for group {group_number} do not exist: "
            f"{', '.join(missing_paths)}"
        )

    train_data, valid_data, test_data = (pd.read_csv(path) for path in split_paths)
    return train_data, valid_data, test_data


# Load the three splits for the configured group; a missing file is reported
# as a message rather than a traceback.
try:
    train_data, valid_data, test_data = load_group_data(group_number)
except FileNotFoundError as load_error:
    print(load_error)
else:
    # Only reached when all three splits were read successfully.
    print(f"Loaded data for group {group_number}:")
    print(f"Train data shape: {train_data.shape}")
    print(f"Valid data shape: {valid_data.shape}")
    print(f"Test data shape: {test_data.shape}")

Loaded data for group 1:
Train data shape: (6125, 16)
Valid data shape: (2042, 16)
Test data shape: (14719, 15)


In [2]:
import json
from datasets import Dataset, DatasetDict

# Function to process DataFrame based on mode
def process_df(df: pd.DataFrame, mode: str = 'only_td') -> list:
    """Turn every row of ``df`` into a tagged translation record.

    Args:
        df: Frame providing the columns ``ships_idx``, ``thing`` and
            ``property`` plus whichever of ``tag_name``, ``tag_description``,
            ``unit``, ``min`` and ``max`` the chosen ``mode`` uses.
        mode: Layout of the ``input`` string. One of ``'only_td'``,
            ``'tn_td'``, ``'tn_td_min_max'``, ``'td_min_max'``, ``'td_unit'``
            or ``'tn_td_unit'``.

    Returns:
        A list with one ``{'translation': {...}}`` dict per successfully
        processed row, holding the keys ``ships_idx``, ``input``,
        ``thing_property`` and ``answer``.

    Raises:
        ValueError: If ``mode`` is not one of the supported layouts. This is
            checked once, before any row is processed, so a mistyped mode
            fails immediately instead of being reported for every row and
            yielding an empty result.
    """
    # Tagged fragments an input string can be assembled from. Values are
    # inserted as text, so a missing (NaN) cell appears as the literal "nan".
    # NOTE(review): confirm that is the intended encoding for missing cells.
    fragments = {
        'tn': lambda row: f"<TN_START>{str(row['tag_name'])}<TN_END>",
        'td': lambda row: f"<TD_START>{str(row['tag_description'])}<TD_END>",
        'min_max': lambda row: f"<MIN_START>{row['min']}<MIN_END><MAX_START>{row['max']}<MAX_END>",
        'unit': lambda row: f"<UNIT_START>{str(row['unit'])}<UNIT_END>",
    }
    # Ordered fragments that make up the input string for each mode.
    layouts = {
        'only_td': ('td',),
        'tn_td': ('tn', 'td'),
        'tn_td_min_max': ('tn', 'td', 'min_max'),
        'td_min_max': ('td', 'min_max'),
        'td_unit': ('td', 'unit'),
        'tn_td_unit': ('tn', 'td', 'unit'),
    }

    # Validate outside the per-row try block: the row-level handler below
    # catches every Exception and would otherwise swallow this error.
    if mode not in layouts:
        raise ValueError("Invalid mode specified")
    layout = layouts[mode]

    output_list = []
    for idx, row in df.iterrows():
        try:
            input_str = ''.join(fragments[part](row) for part in layout)
            output_list.append({
                'translation': {
                    'ships_idx': row['ships_idx'],
                    'input': input_str,
                    'thing_property': f"<THING_START>{str(row['thing'])}<THING_END><PROPERTY_START>{str(row['property'])}<PROPERTY_END>",
                    'answer': f"{str(row['thing'])} {str(row['property'])}",
                }
            })
        except Exception as e:
            # Best effort: a bad row (e.g. a missing column) is reported and
            # skipped so the remaining rows are still converted.
            print(f"Error processing row at index {idx}: {row}")
            print(f"Exception: {e}")
    return output_list


# Bundle the run settings (input mode and fold group) into one record
combined_dict = dict(mode=mode, fold_group=group_number)

# Write that record to mode.json in the current working directory
with open("mode.json", "w") as json_file:
    json_file.write(json.dumps(combined_dict))
    
try:
    # Convert each split to a Dataset, in the order train, test, validation
    split_datasets = {}
    split_datasets['train'] = Dataset.from_list(process_df(train_data, mode=mode))
    split_datasets['test'] = Dataset.from_list(process_df(test_data, mode=mode))
    split_datasets['validation'] = Dataset.from_list(process_df(valid_data, mode=mode))
    combined_data = DatasetDict(split_datasets)

    # Store the splits under a folder named after the mode and the group
    output_dir = f"combined_data/{mode}/{group_number}"
    combined_data.save_to_disk(output_dir)
    print("Dataset saved to 'combined_data'")
except Exception as build_error:
    # Any failure while building or saving is reported as a single message
    print(f"Error creating DatasetDict: {build_error}")

Saving the dataset (0/1 shards):   0%|          | 0/6125 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14719 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2042 [00:00<?, ? examples/s]

Dataset saved to 'combined_data'
