{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Process EPUB HTML files into a single HTML document\n",
    "\n",
    "Goal: I want to merge the HTML files from an EPUB into a single HTML document. I also want working links, so I need to ensure that the hrefs and the relevant tag ids are defined so that they resolve within the merged document."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Investigate the links and hrefs of each HTML document\n",
    "\n",
    "I have already unzipped an EPUB document into the folder `epub` in the same directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collect all html files in outer and inner directories\n",
    "directory = './epub'\n",
    "inner_directory = './epub/text'\n",
    "\n",
    "# glob returns matches in arbitrary, filesystem-dependent order, so sort\n",
    "# explicitly to get the same document order on every machine.\n",
    "# the ordering depends heavily on what you have: here the outer files are\n",
    "# taken in reverse name order and the inner files in ascending name order\n",
    "outer_html_files = sorted(glob.glob(os.path.join(directory, '*.xhtml')), reverse=True)\n",
    "inner_html_files = sorted(glob.glob(os.path.join(inner_directory, '*.html')))\n",
    "\n",
    "# outer files come first, followed by the inner files\n",
    "html_files = outer_html_files + inner_html_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let us look at a sample html\n",
    "sample_index = 2  # position in html_files of the document to inspect\n",
    "\n",
    "# fail with a clear message if the epub folder is missing or too small\n",
    "assert len(html_files) > sample_index, (\n",
    "    f'expected at least {sample_index + 1} html files, found {len(html_files)}'\n",
    ")\n",
    "\n",
    "with open(html_files[sample_index], 'r', encoding='utf-8') as file:\n",
    "    soup = BeautifulSoup(file, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[
\n", "\n", " \n", " \n", " Cover\n", " \n", " \n", "
\n", "\n", " \n", " \n", " Title Page\n", " \n", " \n", "
\n", "\n", "
\n", "\n", " \n", " \n", " Cover\n", " \n", " \n", "
\n", "\n", " \n", " \n", " Title Page\n", " \n", " \n", "
\n", "\n", "
\n", "