hipom_data_mapping/translation/t5/1.data_process_concat.ipynb

149 lines
5.5 KiB
Plaintext
Raw Normal View History

2024-08-26 19:51:11 +09:00
{
"cells": [
{
"cell_type": "code",
2024-09-25 08:52:30 +09:00
"execution_count": 8,
2024-08-26 19:51:11 +09:00
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
2024-09-25 08:52:30 +09:00
"model_id": "7d3d34e404f94388a89f0c9b1aa814e6",
2024-08-26 19:51:11 +09:00
"version_major": 2,
"version_minor": 0
},
"text/plain": [
2024-09-25 08:52:30 +09:00
"Saving the dataset (0/1 shards): 0%| | 0/6260 [00:00<?, ? examples/s]"
2024-08-26 19:51:11 +09:00
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
2024-09-25 08:52:30 +09:00
"model_id": "7b49ec520b674b39b34a8c28ff480716",
2024-08-26 19:51:11 +09:00
"version_major": 2,
"version_minor": 0
},
"text/plain": [
2024-09-25 08:52:30 +09:00
"Saving the dataset (0/1 shards): 0%| | 0/12969 [00:00<?, ? examples/s]"
2024-08-26 19:51:11 +09:00
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
2024-09-25 08:52:30 +09:00
"model_id": "c06c7ee55f174bb5b030983c52adbace",
2024-08-26 19:51:11 +09:00
"version_major": 2,
"version_minor": 0
},
"text/plain": [
2024-09-25 08:52:30 +09:00
"Saving the dataset (0/1 shards): 0%| | 0/2087 [00:00<?, ? examples/s]"
2024-08-26 19:51:11 +09:00
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset saved to 'combined_data'\n"
]
}
],
"source": [
2024-09-25 08:52:30 +09:00
"import pandas as pd\n",
"import os\n",
2024-08-26 19:51:11 +09:00
"import json\n",
"from datasets import Dataset, DatasetDict\n",
"\n",
2024-09-25 08:52:30 +09:00
"group_number = 5\n",
"mode = 'td_unit'\n",
"\n",
"def load_group_data(group_number):\n",
" group_folder = os.path.join('../../data_preprocess/dataset', str(group_number))\n",
" train_file_path = os.path.join(group_folder, 'train.csv')\n",
" valid_file_path = os.path.join(group_folder, 'valid.csv')\n",
" test_file_path = os.path.join(group_folder, 'test.csv')\n",
" \n",
" if not os.path.exists(train_file_path) or not os.path.exists(valid_file_path) or not os.path.exists(test_file_path):\n",
" raise FileNotFoundError(f\"Files for group {group_number} not found.\")\n",
" \n",
" return pd.read_csv(train_file_path), pd.read_csv(valid_file_path), pd.read_csv(test_file_path)\n",
"\n",
2024-08-26 19:51:11 +09:00
"def process_df(df, mode='only_td'):\n",
" output_list = []\n",
" for idx, row in df.iterrows():\n",
" try:\n",
" if mode == 'only_td':\n",
2024-09-25 08:52:30 +09:00
" input_str = f\"<TD_START>{row['tag_description']}<TD_END>\"\n",
2024-08-26 19:51:11 +09:00
" elif mode == 'tn_td':\n",
2024-09-25 08:52:30 +09:00
" input_str = f\"<TN_START>{row['tag_name']}<TN_END><TD_START>{row['tag_description']}<TD_END>\"\n",
2024-08-26 19:51:11 +09:00
" elif mode == 'tn_td_min_max':\n",
2024-09-25 08:52:30 +09:00
" input_str = f\"<TN_START>{row['tag_name']}<TN_END><TD_START>{row['tag_description']}<TD_END><MIN_START>{row['min']}<MIN_END><MAX_START>{row['max']}<MAX_END>\"\n",
2024-08-26 19:51:11 +09:00
" elif mode == 'td_min_max':\n",
2024-09-25 08:52:30 +09:00
" input_str = f\"<TD_START>{row['tag_description']}<TD_END><MIN_START>{row['min']}<MIN_END><MAX_START>{row['max']}<MAX_END>\"\n",
2024-08-26 19:51:11 +09:00
" elif mode == 'td_unit':\n",
2024-09-25 08:52:30 +09:00
" input_str = f\"<TD_START>{row['tag_description']}<TD_END><UNIT_START>{row['unit']}<UNIT_END>\"\n",
2024-08-26 19:51:11 +09:00
" elif mode == 'tn_td_unit':\n",
2024-09-25 08:52:30 +09:00
" input_str = f\"<TN_START>{row['tag_name']}<TN_END><TD_START>{row['tag_description']}<TD_END><UNIT_START>{row['unit']}<UNIT_END>\"\n",
" elif mode == 'td_min_max_unit':\n",
" input_str = f\"<TD_START>{row['tag_description']}<TD_END><MIN_START>{row['min']}<MIN_END><MAX_START>{row['max']}<MAX_END><UNIT_START>{row['unit']}<UNIT_END>\"\n",
2024-08-26 19:51:11 +09:00
" else:\n",
" raise ValueError(\"Invalid mode specified\")\n",
" \n",
" output_list.append({\n",
" 'translation': {\n",
" 'ships_idx': row['ships_idx'],\n",
" 'input': input_str,\n",
2024-09-25 08:52:30 +09:00
" 'thing_property': f\"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>\",\n",
" 'answer': f\"{row['thing']} {row['property']}\",\n",
2024-08-26 19:51:11 +09:00
" }\n",
" })\n",
" except Exception as e:\n",
2024-09-25 08:52:30 +09:00
" print(f\"Error processing row at index {idx}: {e}\")\n",
2024-08-26 19:51:11 +09:00
" return output_list\n",
"\n",
"# Read the three CSV splits for the configured group.\n",
"train_data, valid_data, test_data = load_group_data(group_number)\n",
"\n",
"# Record which mode and fold produced this dataset.\n",
"combined_dict = {\"mode\": mode, \"fold_group\": group_number}\n",
"with open(\"mode.json\", \"w\") as mode_file:\n",
"    json.dump(combined_dict, mode_file)\n",
"\n",
"# Turn every split into translation records; key order is train, test, validation.\n",
"split_frames = [('train', train_data), ('test', test_data), ('validation', valid_data)]\n",
"combined_data = DatasetDict({\n",
"    split_name: Dataset.from_list(process_df(frame, mode=mode))\n",
"    for split_name, frame in split_frames\n",
"})\n",
"\n",
"output_dir = f\"combined_data/{mode}/{group_number}\"\n",
"combined_data.save_to_disk(output_dir)\n",
"print(\"Dataset saved to 'combined_data'\")\n"
2024-08-26 19:51:11 +09:00
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}