In [4]:
import pandas as pd
import os
import json
from datasets import Dataset, DatasetDict

# Group whose train/valid/test CSVs are loaded below; also used as the last
# component of the output directory when the dataset is saved.
group_number = 5
# Input-format variant handed to process_df and recorded in mode.json.
# 'td_unit' builds the input from the tag description plus the unit.
mode = 'td_unit'

def load_group_data(group_number, base_dir='../../data_preprocess/dataset'):
    """Load the train/valid/test CSV splits for one group.

    Args:
        group_number: Identifier of the group. Its string form is the name of
            the sub-folder of ``base_dir`` that holds the split files.
        base_dir: Directory containing one sub-folder per group. The default
            is the relative path this notebook has always used; it is
            resolved against the current working directory.

    Returns:
        A ``(train, valid, test)`` tuple of DataFrames read from
        ``train.csv``, ``valid.csv`` and ``test.csv`` in the group folder.

    Raises:
        FileNotFoundError: If any of the three files is missing. The message
            names the missing paths so the problem can be located quickly.
    """
    group_folder = os.path.join(base_dir, str(group_number))
    split_paths = [
        os.path.join(group_folder, f'{split}.csv')
        for split in ('train', 'valid', 'test')
    ]

    # Check all three up front so nothing is parsed when a split is absent.
    missing = [path for path in split_paths if not os.path.exists(path)]
    if missing:
        raise FileNotFoundError(
            f"Files for group {group_number} do not exist. "
            f"Missing: {', '.join(missing)}"
        )

    train_df, valid_df, test_df = (pd.read_csv(path) for path in split_paths)
    return train_df, valid_df, test_df

# Read the three CSV splits of the selected group into DataFrames.
train_data, valid_data, test_data = load_group_data(group_number)

def process_df(df, mode='only_td'):
    """Convert each row of ``df`` into a translation-style record.

    Args:
        df: DataFrame providing the columns ``ships_idx``, ``thing`` and
            ``property``, plus whichever of ``tag_name``,
            ``tag_description``, ``min``, ``max`` and ``unit`` the chosen
            mode reads.
        mode: Which fields make up the model input. One of ``'only_td'``,
            ``'tn_td'``, ``'tn_td_min_max'``, ``'td_min_max'``,
            ``'td_unit'``, ``'tn_td_unit'`` or ``'td_min_max_unit'``.

    Returns:
        A list with one ``{'translation': {...}}`` dict per successfully
        processed row. The inner dict has the keys ``ships_idx``, ``input``,
        ``thing_property`` and ``answer``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values. This is
            checked once, before any row is touched, so a mistyped mode
            fails loudly instead of yielding an empty result.
    """
    # Each segment wraps one source field (or the min/max pair) in its tags.
    segment_builders = {
        'tn': lambda row: f"<TN_START>{str(row['tag_name'])}<TN_END>",
        'td': lambda row: f"<TD_START>{str(row['tag_description'])}<TD_END>",
        'min_max': lambda row: (
            f"<MIN_START>{row['min']}<MIN_END><MAX_START>{row['max']}<MAX_END>"
        ),
        'unit': lambda row: f"<UNIT_START>{str(row['unit'])}<UNIT_END>",
    }
    # Ordered segments that make up the input string for every mode.
    mode_segments = {
        'only_td': ('td',),
        'tn_td': ('tn', 'td'),
        'tn_td_min_max': ('tn', 'td', 'min_max'),
        'td_min_max': ('td', 'min_max'),
        'td_unit': ('td', 'unit'),
        'tn_td_unit': ('tn', 'td', 'unit'),
        'td_min_max_unit': ('td', 'min_max', 'unit'),
    }

    if mode not in mode_segments:
        raise ValueError("Invalid mode specified")
    builders = [segment_builders[name] for name in mode_segments[mode]]

    output_list = []
    # iterrows is kept on purpose: it determines both the index reported in
    # the error message and how the min/max values are rendered.
    for idx, row in df.iterrows():
        try:
            input_str = "".join(build(row) for build in builders)
            record = {
                'translation': {
                    'ships_idx': row['ships_idx'],
                    'input': input_str,
                    'thing_property': f"<THING_START>{str(row['thing'])}<THING_END><PROPERTY_START>{str(row['property'])}<PROPERTY_END>",
                    'answer': f"{str(row['thing'])} {str(row['property'])}",
                }
            }
        except KeyError as e:
            # A missing column is reported and the row skipped (best effort).
            print(f"Error processing row at index {idx}: {e}")
            continue
        output_list.append(record)
    return output_list

# Record which input mode and group produced this dataset.
combined_dict = dict(mode=mode, fold_group=group_number)
with open("mode.json", "w") as json_file:
    json_file.write(json.dumps(combined_dict))

# Convert every split into a Dataset, keeping the train/test/validation order.
split_frames = {
    'train': train_data,
    'test': test_data,
    'validation': valid_data,
}
combined_data = DatasetDict({
    split_name: Dataset.from_list(process_df(frame, mode=mode))
    for split_name, frame in split_frames.items()
})

# One output directory per (mode, group) pair.
output_dir = f"combined_data/{mode}/{group_number}"
combined_data.save_to_disk(output_dir)
print("Dataset saved to 'combined_data'")


Saving the dataset (0/1 shards):   0%|          | 0/6260 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12969 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2087 [00:00<?, ? examples/s]

Dataset saved to 'combined_data'
