From 5c45d95e482ca65d4524e8e03af6fb8da252b7b0 Mon Sep 17 00:00:00 2001 From: Richard Wong Date: Sun, 28 Apr 2024 20:48:46 +0900 Subject: [PATCH] Feat: implemented html merging with working links --- .gitignore | 3 + bs4_approach.ipynb | 1135 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1138 insertions(+) create mode 100644 .gitignore create mode 100644 bs4_approach.ipynb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b5e6dc8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +epub/ +output.html + diff --git a/bs4_approach.ipynb b/bs4_approach.ipynb new file mode 100644 index 0000000..0588388 --- /dev/null +++ b/bs4_approach.ipynb @@ -0,0 +1,1135 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# process epub htmls into a single html \n", + "\n", + "Goal: I want to merge html files from an epub into a single html document. I also want working links, so I need to ensure that the hrefs and relevant tag ids are defined properly to work within the document" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import glob\n", + "from bs4 import BeautifulSoup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# investigate the links and hrefs of each html document\n", + "\n", + "I have already unzipped an epub document into the folder `epub` in the same directory" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# collect all html files in outer and inner directories\n", + "directory = './epub'\n", + "outer_html_files = glob.glob(os.path.join(directory, '*.xhtml'))\n", + "\n", + "inner_directory = './epub/text'\n", + "inner_html_files = glob.glob(os.path.join(inner_directory, '*.html'))\n", + "\n", + "# the ordering depends heavily on what you have\n", + "outer_html_files = outer_html_files[::-1]\n", + "inner_html_files = 
sorted(inner_html_files)\n", + "html_files = outer_html_files + inner_html_files" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# let us look at a sample html\n", + "with open(html_files[2], 'r', encoding='utf-8') as file:\n", + " soup = BeautifulSoup(file, 'html.parser')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[\n", + "
Frank Herbert's Dune Saga Collection: Books 1 - 6
\n", + "
Dune
\n", + "
Dune Messiah
\n", + "
Children of Dune
\n", + "
God Emperor of Dune
\n", + "
Heretics of Dune
\n", + "
Chapterhouse: Dune
\n", + "
Frank Herbert
\n", + " ]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get all tags that carry an id attribute\n", + "soup.find_all(id=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'0-9d088d4ffd7e4135ab987c63f18099d5'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get the id of the body\n", + "soup.find('body').get('id')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "output_doc = BeautifulSoup()\n", + "output_doc.append(output_doc.new_tag(\"html\"))\n", + "output_doc.html.append(output_doc.new_tag(\"body\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Problem: when just appending the body, all the body tags get consolidated into one large outer body tag, so the ids disappear.\n", + "\n", + "The solution is to create a new div for each html file, then append the body to the div and then append the div into the output document" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# read 2 files (indices 3 and 4): open the path yielded by the loop, not a fixed index\n", + "for html_path in html_files[3:4+1]:\n", + "    with open(html_path, 'r', encoding='utf-8') as html_file:\n", + "        soup = BeautifulSoup(html_file, 'html.parser')\n", + "    body_content = soup.find('body')\n", + "    # the trick to preserve the body tag is to hide the body in a div\n", + "    div = output_doc.new_tag('div', id=(body_content.get('id')))\n", + "    div.append(body_content.extract())\n", + "    output_doc.append(div)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " \n", + " \n", + "\n", + "
\n", + " \n", + "

\n", + " Table of Contents\n", + "

\n", + "

\n", + " \n", + " \n", + " Cover\n", + " \n", + " \n", + "

\n", + "

\n", + " \n", + " \n", + " Title Page\n", + " \n", + " \n", + "

\n", + "

\n", + "

\n", + "
\n", + " \n", + " Dune\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
\n", + " \n", + "

\n", + " Table of Contents\n", + "

\n", + "

\n", + " \n", + " \n", + " Cover\n", + " \n", + " \n", + "

\n", + "

\n", + " \n", + " \n", + " Title Page\n", + " \n", + " \n", + "

\n", + "

\n", + "

\n", + "
\n", + " \n", + " Dune\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "\n" + ] + } + ], + "source": [ + "print(output_doc.prettify())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice how the `body` is preserved from each html file\n", + "\n", + "The available id with the original id name allows hrefs to reach this section of the document.\n", + "\n", + "However, notice that the hrefs have the name \"partX.html#id\".\n", + "\n", + "Let's try to find all the href variants" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "output_doc = BeautifulSoup()\n", + "output_doc.append(output_doc.new_tag(\"html\"))\n", + "output_doc.html.append(output_doc.new_tag(\"body\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "for file in html_files:\n", + "    with open(file, 'r', encoding='utf-8') as html_file:  # explicit utf-8, same as the earlier cells; do not rely on the locale default\n", + "        soup = BeautifulSoup(html_file, \"html.parser\")\n", + "    body_content = soup.find('body')\n", + "    # the trick to preserve the id is to hide the body in a div with the same id\n", + "    div = output_doc.new_tag('div', id=body_content.get('id'))\n", + "    div.append(body_content.extract())\n", + "    output_doc.append(div)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "href_list = [ link.get('href') for link in output_doc.find_all('a')]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['text/part0000.html#0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0001.html#UGI0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0002.html#x9781101157879_EPUB',\n", + " 'text/part0003_split_000.html#x9781101157879_EPUB-1',\n", + " 'text/part0004.html#3Q280-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0005.html#4OIQ0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0006.html#5N3C0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 
'text/part0007.html#6LJU0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0008.html#7K4G0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0009.html#8IL20-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0010.html#9H5K0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0011.html#AFM60-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0012.html#BE6O0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0013.html#CCNA0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0014.html#DB7S0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0015.html#E9OE0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0016.html#F8900-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0017.html#G6PI0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0018.html#H5A40-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0019.html#I3QM0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0020.html#J2B80-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0021.html#K0RQ0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0022.html#KVCC0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0023.html#LTSU0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0024.html#MSDG0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0025.html#NQU20-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0026.html#OPEK0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0027.html#PNV60-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0028.html#QMFO0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0029.html#RL0A0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0030.html#SJGS0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0031.html#TI1E0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0032.html#UGI00-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0033.html#VF2I0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0034.html#10DJ40-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0035.html#11C3M0-9d088d4ffd7e4135ab987c63f18099d5',\n", 
+ " 'text/part0036.html#12AK80-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0037.html#1394Q0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0038.html#147LC0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0039.html#1565U0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0040.html#164MG0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0041.html#173720-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0042.html#181NK0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0043.html#190860-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0044.html#19UOO0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0045.html#1AT9A0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0046.html#1BRPS0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0047.html#1CQAE0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0048.html#1DOR00-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0049.html#1ENBI0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0050.html#1FLS40-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0051.html#1GKCM0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0052.html#1HIT80-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0053.html#1IHDQ0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0054.html#1JFUC0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0055.html#1KEEU0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0056.html#1LCVG0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0057.html#1MBG20-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0058.html#1NA0K0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0059.html#1O8H60-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0061.html#1Q5IA0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0062.html#1R42S0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0063.html#1S2JE0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0064.html#1T1400-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 
'text/part0065.html#1TVKI0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0066.html#1UU540-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0067.html#1VSLM0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0068.html#20R680-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0069.html#21PMQ0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0070.html#x9781101157879_EPUB-4',\n", + " 'text/part0071.html#x9781101157879_EPUB-2',\n", + " 'text/part0072.html#24L8G0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0073.html#x9781101157879_EPUB-3',\n", + " 'text/part0074.html#x9781101157879_EPUB-5',\n", + " 'text/part0075.html#x9781101157879_EPUB-6',\n", + " 'text/part0076.html#x9781101157879_EPUB-7',\n", + " 'text/part0077.html#x9781101157879_EPUB-8',\n", + " 'text/part0078.html#x9781101157879_EPUB-9',\n", + " 'text/part0079.html#x9781101157879_EPUB-10',\n", + " 'text/part0080.html#x9781101157879_EPUB-11',\n", + " 'text/part0081.html#x9781101157879_EPUB-12',\n", + " 'text/part0082.html#x9781101157879_EPUB-13',\n", + " 'text/part0083.html#x9781101157879_EPUB-14',\n", + " 'text/part0084.html#x9781101157879_EPUB-15',\n", + " 'text/part0085.html#x9781101157879_EPUB-16',\n", + " 'text/part0086.html#x9781101157879_EPUB-17',\n", + " 'text/part0087.html#x9781101157879_EPUB-18',\n", + " 'text/part0088.html#x9781101157879_EPUB-19',\n", + " 'text/part0089.html#x9781101157879_EPUB-20',\n", + " 'text/part0090.html#x9781101157879_EPUB-21',\n", + " 'text/part0091.html#x9781101157879_EPUB-22',\n", + " 'text/part0092.html#x9781101157879_EPUB-23',\n", + " 'text/part0093.html#x9781101157879_EPUB-24',\n", + " 'text/part0094.html#x9781101157879_EPUB-25',\n", + " 'text/part0095.html#x9781101157879_EPUB-26',\n", + " 'text/part0096.html#x9781101157879_EPUB-27',\n", + " 'text/part0097.html#x9781101157879_EPUB-28',\n", + " 'text/part0098.html#x9781101157879_EPUB-29',\n", + " 'text/part0099.html#x9781101157879_EPUB-30',\n", + " 
'text/part0101.html#30A8Q0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0102.html#x9781440630514_EPUB-2',\n", + " 'text/part0103.html#x9781440630514_EPUB-3',\n", + " 'text/part0104.html#x9781440630514_EPUB-4',\n", + " 'text/part0105.html#344B20-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0106.html#x9781440630514_EPUB-5',\n", + " 'text/part0107.html#x9781440630514_EPUB-6',\n", + " 'text/part0108.html#x9781440630514_EPUB-7',\n", + " 'text/part0109.html#x9781440630514_EPUB-8',\n", + " 'text/part0110.html#x9781440630514_EPUB-9',\n", + " 'text/part0111.html#x9781440630514_EPUB-10',\n", + " 'text/part0112.html#x9781440630514_EPUB-11',\n", + " 'text/part0113.html#x9781440630514_EPUB-12',\n", + " 'text/part0114.html#x9781440630514_EPUB-13',\n", + " 'text/part0115.html#x9781440630514_EPUB-14',\n", + " 'text/part0116.html#x9781440630514_EPUB-15',\n", + " 'text/part0117.html#x9781440630514_EPUB-16',\n", + " 'text/part0118.html#x9781440630514_EPUB-17',\n", + " 'text/part0119.html#x9781440630514_EPUB-18',\n", + " 'text/part0120.html#x9781440630514_EPUB-19',\n", + " 'text/part0121.html#x9781440630514_EPUB-20',\n", + " 'text/part0122.html#x9781440630514_EPUB-21',\n", + " 'text/part0123.html#x9781440630514_EPUB-22',\n", + " 'text/part0124.html#x9781440630514_EPUB-23',\n", + " 'text/part0125.html#x9781440630514_EPUB-24',\n", + " 'text/part0126.html#x9781440630514_EPUB-25',\n", + " 'text/part0127.html#x9781440630514_EPUB-26',\n", + " 'text/part0128.html#x9781440630514_EPUB-27',\n", + " 'text/part0129.html#x9781440630514_EPUB-28',\n", + " 'text/part0130.html#x9781440630514_EPUB-29',\n", + " 'text/part0131.html#x9781440630514_EPUB-30',\n", + " 'text/part0132.html#x9781440630514_EPUB-31',\n", + " 'text/part0133.html#x9781440630514_EPUB-32',\n", + " 'text/part0134.html#x9781440630514_EPUB-33',\n", + " 'text/part0135.html#x9781440630514_EPUB-34',\n", + " 'text/part0136.html#x9781440630514_EPUB-35',\n", + " 'text/part0137.html#x9781440630514_EPUB-36',\n", + " 
'text/part0138.html#x9781440630514_EPUB-37',\n", + " 'text/part0139.html#x9781440630514_EPUB-38',\n", + " 'text/part0140.html#x9781440630514_EPUB-39',\n", + " 'text/part0141.html#x9781440630514_EPUB-40',\n", + " 'text/part0142.html#x9781440630514_EPUB-41',\n", + " 'text/part0143.html#x9781440630514_EPUB-42',\n", + " 'text/part0144.html#x9781440630514_EPUB-43',\n", + " 'text/part0145.html#x9781440630514_EPUB-44',\n", + " 'text/part0146.html#x9781440630514_EPUB-45',\n", + " 'text/part0147.html#x9781440630514_EPUB-46',\n", + " 'text/part0148.html#x9781440630514_EPUB-47',\n", + " 'text/part0149.html#x9781440630514_EPUB-48',\n", + " 'text/part0150.html#x9781440630514_EPUB-49',\n", + " 'text/part0151.html#x9781440630514_EPUB-50',\n", + " 'text/part0152.html#x9781440630514_EPUB-51',\n", + " 'text/part0153.html#x9781440630514_EPUB-52',\n", + " 'text/part0154.html#x9781440630514_EPUB-53',\n", + " 'text/part0155.html#x9781440630514_EPUB-54',\n", + " 'text/part0156.html#x9781440630514_EPUB-55',\n", + " 'text/part0157.html#x9781440630514_EPUB-56',\n", + " 'text/part0158.html#x9781440630514_EPUB-57',\n", + " 'text/part0159.html#x9781440630514_EPUB-58',\n", + " 'text/part0160.html#x9781440630514_EPUB-59',\n", + " 'text/part0161.html#x9781440630514_EPUB-60',\n", + " 'text/part0162.html#x9781440630514_EPUB-61',\n", + " 'text/part0163.html#x9781440630514_EPUB-62',\n", + " 'text/part0164.html#x9781440630514_EPUB-63',\n", + " 'text/part0165.html#x9781440630514_EPUB-64',\n", + " 'text/part0166.html#x9781440630514_EPUB-65',\n", + " 'text/part0167.html#x9781440630514_EPUB-66',\n", + " 'text/part0168.html#x9781440630514_EPUB-67',\n", + " 'text/part0169.html#x9781440630514_EPUB-68',\n", + " 'text/part0170.html#x9781440630514_EPUB-69',\n", + " 'text/part0171.html#x9781440630514_EPUB-70',\n", + " 'text/part0172.html#5410O0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0173.html#x9781440631979_EPUB-2',\n", + " 'text/part0174.html#x9781440631979_EPUB-3',\n", + " 
'text/part0175.html#x9781440631979_EPUB-4',\n", + " 'text/part0176.html#57R300-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0177.html#x9781440631979_EPUB-5',\n", + " 'text/part0179.html#x9781440631979_EPUB-7',\n", + " 'text/part0182.html#x9781440631979_EPUB-10',\n", + " 'text/part0183.html#x9781440631979_EPUB-11',\n", + " 'text/part0184.html#x9781440631979_EPUB-12',\n", + " 'text/part0185.html#x9781440631979_EPUB-13',\n", + " 'text/part0186.html#x9781440631979_EPUB-14',\n", + " 'text/part0187_split_000.html#x9781440631979_EPUB-15',\n", + " 'text/part0188.html#x9781440631979_EPUB-16',\n", + " 'text/part0189_split_000.html#x9781440631979_EPUB-17',\n", + " 'text/part0190.html#x9781440631979_EPUB-18',\n", + " 'text/part0191.html#x9781440631979_EPUB-19',\n", + " 'text/part0192.html#x9781440631979_EPUB-20',\n", + " 'text/part0193.html#x9781440631979_EPUB-21',\n", + " 'text/part0194.html#x9781440631979_EPUB-22',\n", + " 'text/part0195.html#x9781440631979_EPUB-23',\n", + " 'text/part0196_split_000.html#x9781440631979_EPUB-24',\n", + " 'text/part0197.html#x9781440631979_EPUB-25',\n", + " 'text/part0198.html#x9781440631979_EPUB-26',\n", + " 'text/part0199.html#x9781440631979_EPUB-27',\n", + " 'text/part0200.html#x9781440631979_EPUB-28',\n", + " 'text/part0201.html#x9781440631979_EPUB-29',\n", + " 'text/part0202.html#x9781440631979_EPUB-30',\n", + " 'text/part0203.html#x9781440631979_EPUB-31',\n", + " 'text/part0204.html#x9781440631979_EPUB-32',\n", + " 'text/part0205.html#x9781440631979_EPUB-33',\n", + " 'text/part0206.html#x9781440631979_EPUB-34',\n", + " 'text/part0207.html#x9781440631979_EPUB-35',\n", + " 'text/part0208.html#x9781440631979_EPUB-36',\n", + " 'text/part0209.html#x9781440631979_EPUB-37',\n", + " 'text/part0210.html#x9781440631979_EPUB-38',\n", + " 'text/part0211.html#x9781440631979_EPUB-39',\n", + " 'text/part0212.html#x9781440631979_EPUB-40',\n", + " 'text/part0213.html#x9781440631979_EPUB-41',\n", + " 
'text/part0214.html#x9781440631979_EPUB-42',\n", + " 'text/part0215.html#x9781440631979_EPUB-43',\n", + " 'text/part0216.html#x9781440631979_EPUB-44',\n", + " 'text/part0217.html#x9781440631979_EPUB-45',\n", + " 'text/part0218.html#x9781440631979_EPUB-46',\n", + " 'text/part0219.html#x9781440631979_EPUB-47',\n", + " 'text/part0220.html#x9781440631979_EPUB-48',\n", + " 'text/part0221.html#x9781440631979_EPUB-49',\n", + " 'text/part0222.html#x9781440631979_EPUB-50',\n", + " 'text/part0223.html#x9781440631979_EPUB-51',\n", + " 'text/part0224.html#x9781440631979_EPUB-52',\n", + " 'text/part0225.html#x9781440631979_EPUB-53',\n", + " 'text/part0226.html#x9781440631979_EPUB-54',\n", + " 'text/part0227.html#x9781440631979_EPUB-55',\n", + " 'text/part0228.html#x9781440631979_EPUB-56',\n", + " 'text/part0229.html#x9781440631979_EPUB-57',\n", + " 'text/part0230.html#x9781440631979_EPUB-58',\n", + " 'text/part0232.html#6T82G0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0233.html#x9781440619649_EPUB-2',\n", + " 'text/part0234.html#x9781440619649_EPUB-3',\n", + " 'text/part0235.html#703K60-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0236.html#x9781440619649_EPUB-4',\n", + " 'text/part0237.html#x9781440619649_EPUB-5',\n", + " 'text/part0238.html#x9781440619649_EPUB-6',\n", + " 'text/part0239.html#x9781440619649_EPUB-7',\n", + " 'text/part0240.html#x9781440619649_EPUB-8',\n", + " 'text/part0241.html#x9781440619649_EPUB-9',\n", + " 'text/part0242.html#x9781440619649_EPUB-10',\n", + " 'text/part0243.html#x9781440619649_EPUB-11',\n", + " 'text/part0244.html#x9781440619649_EPUB-12',\n", + " 'text/part0245.html#x9781440619649_EPUB-13',\n", + " 'text/part0246.html#x9781440619649_EPUB-14',\n", + " 'text/part0247.html#x9781440619649_EPUB-15',\n", + " 'text/part0248.html#x9781440619649_EPUB-16',\n", + " 'text/part0249.html#x9781440619649_EPUB-17',\n", + " 'text/part0250.html#x9781440619649_EPUB-18',\n", + " 'text/part0251.html#x9781440619649_EPUB-19',\n", + " 
'text/part0252.html#x9781440619649_EPUB-20',\n", + " 'text/part0253.html#x9781440619649_EPUB-21',\n", + " 'text/part0254.html#x9781440619649_EPUB-22',\n", + " 'text/part0255.html#x9781440619649_EPUB-23',\n", + " 'text/part0256.html#x9781440619649_EPUB-24',\n", + " 'text/part0257.html#x9781440619649_EPUB-25',\n", + " 'text/part0258.html#x9781440619649_EPUB-26',\n", + " 'text/part0259.html#x9781440619649_EPUB-27',\n", + " 'text/part0260.html#x9781440619649_EPUB-28',\n", + " 'text/part0261.html#x9781440619649_EPUB-29',\n", + " 'text/part0262.html#x9781440619649_EPUB-30',\n", + " 'text/part0263.html#x9781440619649_EPUB-31',\n", + " 'text/part0264.html#x9781440619649_EPUB-32',\n", + " 'text/part0265.html#x9781440619649_EPUB-33',\n", + " 'text/part0266.html#x9781440619649_EPUB-34',\n", + " 'text/part0267.html#x9781440619649_EPUB-35',\n", + " 'text/part0268.html#x9781440619649_EPUB-36',\n", + " 'text/part0269.html#x9781440619649_EPUB-37',\n", + " 'text/part0270.html#x9781440619649_EPUB-38',\n", + " 'text/part0271.html#x9781440619649_EPUB-39',\n", + " 'text/part0272.html#x9781440619649_EPUB-40',\n", + " 'text/part0273.html#x9781440619649_EPUB-41',\n", + " 'text/part0274.html#x9781440619649_EPUB-42',\n", + " 'text/part0275.html#x9781440619649_EPUB-43',\n", + " 'text/part0276.html#x9781440619649_EPUB-44',\n", + " 'text/part0277.html#x9781440619649_EPUB-45',\n", + " 'text/part0278.html#x9781440619649_EPUB-46',\n", + " 'text/part0279.html#x9781440619649_EPUB-47',\n", + " 'text/part0280.html#x9781440619649_EPUB-48',\n", + " 'text/part0281.html#x9781440619649_EPUB-49',\n", + " 'text/part0282.html#x9781440619649_EPUB-50',\n", + " 'text/part0283.html#x9781440619649_EPUB-51',\n", + " 'text/part0284.html#8EQVO0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0285.html#x9781440619236_EPUB-1',\n", + " 'text/part0286.html#x9781440619236_EPUB-2',\n", + " 'text/part0287.html#8HMHE0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'text/part0288.html#x9781440619236_EPUB-3',\n", + " 
'text/part0289.html#x9781440619236_EPUB-4',\n", + " 'text/part0290.html#x9781440619236_EPUB-5',\n", + " 'text/part0291.html#x9781440619236_EPUB-6',\n", + " 'text/part0292.html#x9781440619236_EPUB-7',\n", + " 'text/part0293.html#x9781440619236_EPUB-8',\n", + " 'text/part0294.html#x9781440619236_EPUB-9',\n", + " 'text/part0295.html#x9781440619236_EPUB-10',\n", + " 'text/part0296.html#x9781440619236_EPUB-11',\n", + " 'text/part0297.html#x9781440619236_EPUB-12',\n", + " 'text/part0298.html#x9781440619236_EPUB-13',\n", + " 'text/part0299.html#x9781440619236_EPUB-14',\n", + " 'text/part0300.html#x9781440619236_EPUB-15',\n", + " 'text/part0301.html#x9781440619236_EPUB-16',\n", + " 'text/part0302.html#x9781440619236_EPUB-17',\n", + " 'text/part0303.html#x9781440619236_EPUB-18',\n", + " 'text/part0304.html#x9781440619236_EPUB-19',\n", + " 'text/part0305.html#x9781440619236_EPUB-20',\n", + " 'text/part0306.html#x9781440619236_EPUB-21',\n", + " 'text/part0307.html#x9781440619236_EPUB-22',\n", + " 'text/part0308.html#x9781440619236_EPUB-23',\n", + " 'text/part0309.html#x9781440619236_EPUB-24',\n", + " 'text/part0310.html#x9781440619236_EPUB-25',\n", + " 'text/part0311.html#x9781440619236_EPUB-26',\n", + " 'text/part0312.html#x9781440619236_EPUB-27',\n", + " 'text/part0313.html#x9781440619236_EPUB-28',\n", + " 'text/part0314.html#x9781440619236_EPUB-29',\n", + " 'text/part0315.html#x9781440619236_EPUB-30',\n", + " 'text/part0316.html#x9781440619236_EPUB-31',\n", + " 'text/part0317.html#x9781440619236_EPUB-32',\n", + " 'text/part0318.html#x9781440619236_EPUB-33',\n", + " 'text/part0319.html#x9781440619236_EPUB-34',\n", + " 'text/part0320.html#x9781440619236_EPUB-35',\n", + " 'text/part0321.html#x9781440619236_EPUB-36',\n", + " 'text/part0322.html#x9781440619236_EPUB-37',\n", + " 'text/part0323.html#x9781440619236_EPUB-38',\n", + " 'text/part0324.html#x9781440619236_EPUB-39',\n", + " 'text/part0325.html#x9781440619236_EPUB-40',\n", + " 
'text/part0326.html#x9781440619236_EPUB-41',\n", + " 'text/part0327.html#x9781440619236_EPUB-42',\n", + " 'text/part0328.html#x9781440619236_EPUB-43',\n", + " 'text/part0329.html#x9781440619236_EPUB-44',\n", + " 'text/part0330.html#x9781440619236_EPUB-45',\n", + " 'text/part0331.html#x9781440619236_EPUB-46',\n", + " 'text/part0332.html#x9781440619236_EPUB-47',\n", + " 'text/part0333.html#x9781440619236_EPUB-48',\n", + " 'text/part0334.html#x9781440619236_EPUB-49',\n", + " 'text/part0335.html#x9781440619236_EPUB-50',\n", + " 'text/part0336.html#x9781440619236_EPUB-51',\n", + " 'text/part0337.html#x9781440619236_EPUB-52',\n", + " 'text/part0338.html#A2AU40-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'kindle:embed:0001?mime=image/jpg',\n", + " 'part0000.html#0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0004.html#3Q280-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0069.html#21PMQ0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0101.html#30A8Q0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0172.html#5410O0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0232.html#6T82G0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0284.html#8EQVO0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'http://penguinrandomhouse.com',\n", + " 'part0009.html#8IL20-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0010.html#9H5K0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0011.html#AFM60-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0012.html#BE6O0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0013.html#CCNA0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0014.html#DB7S0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0015.html#E9OE0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0016.html#F8900-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0017.html#G6PI0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0018.html#H5A40-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0019.html#I3QM0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 
'part0020.html#J2B80-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0021.html#K0RQ0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0022.html#KVCC0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0023.html#LTSU0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0024.html#MSDG0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0025.html#NQU20-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0026.html#OPEK0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0027.html#PNV60-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0028.html#QMFO0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0029.html#RL0A0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0030.html#SJGS0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0031.html#TI1E0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0032.html#UGI00-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0033.html#VF2I0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0034.html#10DJ40-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0035.html#11C3M0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0036.html#12AK80-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0037.html#1394Q0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0038.html#147LC0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0039.html#1565U0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0040.html#164MG0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0041.html#173720-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0042.html#181NK0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0043.html#190860-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0044.html#19UOO0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0045.html#1AT9A0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0046.html#1BRPS0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0047.html#1CQAE0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0048.html#1DOR00-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0049.html#1ENBI0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0050.html#1FLS40-9d088d4ffd7e4135ab987c63f18099d5',\n", + 
" 'part0051.html#1GKCM0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0052.html#1HIT80-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0053.html#1IHDQ0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0054.html#1JFUC0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0055.html#1KEEU0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0056.html#1LCVG0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0057.html#1MBG20-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0058.html#1NA0K0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0059.html#1O8H60-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0061.html#1Q5IA0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0062.html#1R42S0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0063.html#1S2JE0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0064.html#1T1400-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0065.html#1TVKI0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0066.html#1UU540-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0067.html#1VSLM0-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'part0068.html#20R680-9d088d4ffd7e4135ab987c63f18099d5',\n", + " None,\n", + " None,\n", + " 'http://penguinrandomhouse.com',\n", + " 'part0070.html#x9781101157879_EPUB-4',\n", + " 'part0071.html#x9781101157879_EPUB-2',\n", + " 'part0073.html#x9781101157879_EPUB-3',\n", + " 'part0074.html#x9781101157879_EPUB-5',\n", + " 'part0075.html#x9781101157879_EPUB-6',\n", + " 'part0076.html#x9781101157879_EPUB-7',\n", + " 'part0077.html#x9781101157879_EPUB-8',\n", + " 'part0078.html#x9781101157879_EPUB-9',\n", + " 'part0079.html#x9781101157879_EPUB-10',\n", + " 'part0080.html#x9781101157879_EPUB-11',\n", + " 'part0081.html#x9781101157879_EPUB-12',\n", + " 'part0082.html#x9781101157879_EPUB-13',\n", + " 'part0083.html#x9781101157879_EPUB-14',\n", + " 'part0084.html#x9781101157879_EPUB-15',\n", + " 'part0085.html#x9781101157879_EPUB-16',\n", + " 'part0086.html#x9781101157879_EPUB-17',\n", + " 'part0087.html#x9781101157879_EPUB-18',\n", + " 
'part0088.html#x9781101157879_EPUB-19',\n", + " 'part0089.html#x9781101157879_EPUB-20',\n", + " 'part0090.html#x9781101157879_EPUB-21',\n", + " 'part0091.html#x9781101157879_EPUB-22',\n", + " 'part0092.html#x9781101157879_EPUB-23',\n", + " 'part0093.html#x9781101157879_EPUB-24',\n", + " 'part0094.html#x9781101157879_EPUB-25',\n", + " 'part0095.html#x9781101157879_EPUB-26',\n", + " 'part0096.html#x9781101157879_EPUB-27',\n", + " 'part0097.html#x9781101157879_EPUB-28',\n", + " 'part0098.html#x9781101157879_EPUB-29',\n", + " 'part0099.html#x9781101157879_EPUB-30',\n", + " 'part0100.html#footnote_1',\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " 'part0073.html#footnote-000-backlink',\n", + " None,\n", + " None,\n", + " 'http://penguinrandomhouse.com',\n", + " None,\n", + " 'part0102.html#x9781440630514_EPUB-2',\n", + " 'part0103.html#x9781440630514_EPUB-3',\n", + " 'part0104.html#x9781440630514_EPUB-4',\n", + " 'part0106.html#x9781440630514_EPUB-5',\n", + " 'part0107.html#x9781440630514_EPUB-6',\n", + " 'part0108.html#x9781440630514_EPUB-7',\n", + " 'part0109.html#x9781440630514_EPUB-8',\n", + " 'part0110.html#x9781440630514_EPUB-9',\n", + " 'part0111.html#x9781440630514_EPUB-10',\n", + " 'part0112.html#x9781440630514_EPUB-11',\n", + " 'part0113.html#x9781440630514_EPUB-12',\n", + " 'part0114.html#x9781440630514_EPUB-13',\n", + " 'part0115.html#x9781440630514_EPUB-14',\n", + " 'part0116.html#x9781440630514_EPUB-15',\n", + " 'part0117.html#x9781440630514_EPUB-16',\n", + " 'part0118.html#x9781440630514_EPUB-17',\n", + " 'part0119.html#x9781440630514_EPUB-18',\n", + " 'part0120.html#x9781440630514_EPUB-19',\n", + " 'part0121.html#x9781440630514_EPUB-20',\n", + " 'part0122.html#x9781440630514_EPUB-21',\n", + " 'part0123.html#x9781440630514_EPUB-22',\n", + " 'part0124.html#x9781440630514_EPUB-23',\n", + " 'part0125.html#x9781440630514_EPUB-24',\n", + " 'part0126.html#x9781440630514_EPUB-25',\n", + " 'part0127.html#x9781440630514_EPUB-26',\n", + " 
'part0128.html#x9781440630514_EPUB-27',\n", + " 'part0129.html#x9781440630514_EPUB-28',\n", + " 'part0130.html#x9781440630514_EPUB-29',\n", + " 'part0131.html#x9781440630514_EPUB-30',\n", + " 'part0132.html#x9781440630514_EPUB-31',\n", + " 'part0133.html#x9781440630514_EPUB-32',\n", + " 'part0134.html#x9781440630514_EPUB-33',\n", + " 'part0135.html#x9781440630514_EPUB-34',\n", + " 'part0136.html#x9781440630514_EPUB-35',\n", + " 'part0137.html#x9781440630514_EPUB-36',\n", + " 'part0138.html#x9781440630514_EPUB-37',\n", + " 'part0139.html#x9781440630514_EPUB-38',\n", + " 'part0140.html#x9781440630514_EPUB-39',\n", + " 'part0141.html#x9781440630514_EPUB-40',\n", + " 'part0142.html#x9781440630514_EPUB-41',\n", + " 'part0143.html#x9781440630514_EPUB-42',\n", + " 'part0144.html#x9781440630514_EPUB-43',\n", + " 'part0145.html#x9781440630514_EPUB-44',\n", + " 'part0146.html#x9781440630514_EPUB-45',\n", + " 'part0147.html#x9781440630514_EPUB-46',\n", + " 'part0148.html#x9781440630514_EPUB-47',\n", + " 'part0149.html#x9781440630514_EPUB-48',\n", + " 'part0150.html#x9781440630514_EPUB-49',\n", + " 'part0151.html#x9781440630514_EPUB-50',\n", + " 'part0152.html#x9781440630514_EPUB-51',\n", + " 'part0153.html#x9781440630514_EPUB-52',\n", + " 'part0154.html#x9781440630514_EPUB-53',\n", + " 'part0155.html#x9781440630514_EPUB-54',\n", + " 'part0156.html#x9781440630514_EPUB-55',\n", + " 'part0157.html#x9781440630514_EPUB-56',\n", + " 'part0158.html#x9781440630514_EPUB-57',\n", + " 'part0159.html#x9781440630514_EPUB-58',\n", + " 'part0160.html#x9781440630514_EPUB-59',\n", + " 'part0161.html#x9781440630514_EPUB-60',\n", + " 'part0162.html#x9781440630514_EPUB-61',\n", + " 'part0163.html#x9781440630514_EPUB-62',\n", + " 'part0164.html#x9781440630514_EPUB-63',\n", + " 'part0165.html#x9781440630514_EPUB-64',\n", + " 'part0166.html#x9781440630514_EPUB-65',\n", + " 'part0167.html#x9781440630514_EPUB-66',\n", + " 'part0168.html#x9781440630514_EPUB-67',\n", + " 
'part0169.html#x9781440630514_EPUB-68',\n", + " 'part0170.html#x9781440630514_EPUB-69',\n", + " 'part0171.html#x9781440630514_EPUB-70',\n", + " None,\n", + " None,\n", + " 'part0173.html#x9781440631979_EPUB-2',\n", + " 'part0174.html#x9781440631979_EPUB-3',\n", + " 'part0175.html#x9781440631979_EPUB-4',\n", + " 'part0177.html#x9781440631979_EPUB-5',\n", + " 'part0179.html#x9781440631979_EPUB-7',\n", + " 'part0182.html#x9781440631979_EPUB-10',\n", + " 'part0183.html#x9781440631979_EPUB-11',\n", + " 'part0184.html#x9781440631979_EPUB-12',\n", + " 'part0185.html#x9781440631979_EPUB-13',\n", + " 'part0186.html#x9781440631979_EPUB-14',\n", + " 'part0187_split_000.html#x9781440631979_EPUB-15',\n", + " 'part0188.html#x9781440631979_EPUB-16',\n", + " 'part0189_split_000.html#x9781440631979_EPUB-17',\n", + " 'part0190.html#x9781440631979_EPUB-18',\n", + " 'part0191.html#x9781440631979_EPUB-19',\n", + " 'part0192.html#x9781440631979_EPUB-20',\n", + " 'part0193.html#x9781440631979_EPUB-21',\n", + " 'part0194.html#x9781440631979_EPUB-22',\n", + " 'part0195.html#x9781440631979_EPUB-23',\n", + " 'part0196_split_000.html#x9781440631979_EPUB-24',\n", + " 'part0197.html#x9781440631979_EPUB-25',\n", + " 'part0198.html#x9781440631979_EPUB-26',\n", + " 'part0199.html#x9781440631979_EPUB-27',\n", + " 'part0200.html#x9781440631979_EPUB-28',\n", + " 'part0201.html#x9781440631979_EPUB-29',\n", + " 'part0202.html#x9781440631979_EPUB-30',\n", + " 'part0203.html#x9781440631979_EPUB-31',\n", + " 'part0204.html#x9781440631979_EPUB-32',\n", + " 'part0205.html#x9781440631979_EPUB-33',\n", + " 'part0206.html#x9781440631979_EPUB-34',\n", + " 'part0207.html#x9781440631979_EPUB-35',\n", + " 'part0208.html#x9781440631979_EPUB-36',\n", + " 'part0209.html#x9781440631979_EPUB-37',\n", + " 'part0210.html#x9781440631979_EPUB-38',\n", + " 'part0211.html#x9781440631979_EPUB-39',\n", + " 'part0212.html#x9781440631979_EPUB-40',\n", + " 'part0213.html#x9781440631979_EPUB-41',\n", + " 
'part0214.html#x9781440631979_EPUB-42',\n", + " 'part0215.html#x9781440631979_EPUB-43',\n", + " 'part0216.html#x9781440631979_EPUB-44',\n", + " 'part0217.html#x9781440631979_EPUB-45',\n", + " 'part0218.html#x9781440631979_EPUB-46',\n", + " 'part0219.html#x9781440631979_EPUB-47',\n", + " 'part0220.html#x9781440631979_EPUB-48',\n", + " 'part0221.html#x9781440631979_EPUB-49',\n", + " 'part0222.html#x9781440631979_EPUB-50',\n", + " 'part0223.html#x9781440631979_EPUB-51',\n", + " 'part0224.html#x9781440631979_EPUB-52',\n", + " 'part0225.html#x9781440631979_EPUB-53',\n", + " 'part0226.html#x9781440631979_EPUB-54',\n", + " 'part0227.html#x9781440631979_EPUB-55',\n", + " 'part0228.html#x9781440631979_EPUB-56',\n", + " 'part0229.html#x9781440631979_EPUB-57',\n", + " 'part0230.html#x9781440631979_EPUB-58',\n", + " None,\n", + " None,\n", + " 'part0233.html#x9781440619649_EPUB-2',\n", + " 'part0234.html#x9781440619649_EPUB-3',\n", + " 'part0236.html#x9781440619649_EPUB-4',\n", + " 'part0237.html#x9781440619649_EPUB-5',\n", + " 'part0238.html#x9781440619649_EPUB-6',\n", + " 'part0239.html#x9781440619649_EPUB-7',\n", + " 'part0240.html#x9781440619649_EPUB-8',\n", + " 'part0241.html#x9781440619649_EPUB-9',\n", + " 'part0242.html#x9781440619649_EPUB-10',\n", + " 'part0243.html#x9781440619649_EPUB-11',\n", + " 'part0244.html#x9781440619649_EPUB-12',\n", + " 'part0245.html#x9781440619649_EPUB-13',\n", + " 'part0246.html#x9781440619649_EPUB-14',\n", + " 'part0247.html#x9781440619649_EPUB-15',\n", + " 'part0248.html#x9781440619649_EPUB-16',\n", + " 'part0249.html#x9781440619649_EPUB-17',\n", + " 'part0250.html#x9781440619649_EPUB-18',\n", + " 'part0251.html#x9781440619649_EPUB-19',\n", + " 'part0252.html#x9781440619649_EPUB-20',\n", + " 'part0253.html#x9781440619649_EPUB-21',\n", + " 'part0254.html#x9781440619649_EPUB-22',\n", + " 'part0255.html#x9781440619649_EPUB-23',\n", + " 'part0256.html#x9781440619649_EPUB-24',\n", + " 'part0257.html#x9781440619649_EPUB-25',\n", + " 
'part0258.html#x9781440619649_EPUB-26',\n", + " 'part0259.html#x9781440619649_EPUB-27',\n", + " 'part0260.html#x9781440619649_EPUB-28',\n", + " 'part0261.html#x9781440619649_EPUB-29',\n", + " 'part0262.html#x9781440619649_EPUB-30',\n", + " 'part0263.html#x9781440619649_EPUB-31',\n", + " 'part0264.html#x9781440619649_EPUB-32',\n", + " 'part0265.html#x9781440619649_EPUB-33',\n", + " 'part0266.html#x9781440619649_EPUB-34',\n", + " 'part0267.html#x9781440619649_EPUB-35',\n", + " 'part0268.html#x9781440619649_EPUB-36',\n", + " 'part0269.html#x9781440619649_EPUB-37',\n", + " 'part0270.html#x9781440619649_EPUB-38',\n", + " 'part0271.html#x9781440619649_EPUB-39',\n", + " 'part0272.html#x9781440619649_EPUB-40',\n", + " 'part0273.html#x9781440619649_EPUB-41',\n", + " 'part0274.html#x9781440619649_EPUB-42',\n", + " 'part0275.html#x9781440619649_EPUB-43',\n", + " 'part0276.html#x9781440619649_EPUB-44',\n", + " 'part0277.html#x9781440619649_EPUB-45',\n", + " 'part0278.html#x9781440619649_EPUB-46',\n", + " 'part0279.html#x9781440619649_EPUB-47',\n", + " 'part0280.html#x9781440619649_EPUB-48',\n", + " 'part0281.html#x9781440619649_EPUB-49',\n", + " 'part0282.html#x9781440619649_EPUB-50',\n", + " 'part0283.html#x9781440619649_EPUB-51',\n", + " None,\n", + " None,\n", + " 'part0285.html#x9781440619236_EPUB-1',\n", + " 'part0286.html#x9781440619236_EPUB-2',\n", + " 'part0288.html#x9781440619236_EPUB-3',\n", + " 'part0289.html#x9781440619236_EPUB-4',\n", + " 'part0290.html#x9781440619236_EPUB-5',\n", + " 'part0291.html#x9781440619236_EPUB-6',\n", + " 'part0292.html#x9781440619236_EPUB-7',\n", + " 'part0293.html#x9781440619236_EPUB-8',\n", + " 'part0294.html#x9781440619236_EPUB-9',\n", + " 'part0295.html#x9781440619236_EPUB-10',\n", + " 'part0296.html#x9781440619236_EPUB-11',\n", + " 'part0297.html#x9781440619236_EPUB-12',\n", + " 'part0298.html#x9781440619236_EPUB-13',\n", + " 'part0299.html#x9781440619236_EPUB-14',\n", + " 'part0300.html#x9781440619236_EPUB-15',\n", + " 
'part0301.html#x9781440619236_EPUB-16',\n", + " 'part0302.html#x9781440619236_EPUB-17',\n", + " 'part0303.html#x9781440619236_EPUB-18',\n", + " 'part0304.html#x9781440619236_EPUB-19',\n", + " 'part0305.html#x9781440619236_EPUB-20',\n", + " 'part0306.html#x9781440619236_EPUB-21',\n", + " 'part0307.html#x9781440619236_EPUB-22',\n", + " 'part0308.html#x9781440619236_EPUB-23',\n", + " 'part0309.html#x9781440619236_EPUB-24',\n", + " 'part0310.html#x9781440619236_EPUB-25',\n", + " 'part0311.html#x9781440619236_EPUB-26',\n", + " 'part0312.html#x9781440619236_EPUB-27',\n", + " 'part0313.html#x9781440619236_EPUB-28',\n", + " 'part0314.html#x9781440619236_EPUB-29',\n", + " 'part0315.html#x9781440619236_EPUB-30',\n", + " 'part0316.html#x9781440619236_EPUB-31',\n", + " 'part0317.html#x9781440619236_EPUB-32',\n", + " 'part0318.html#x9781440619236_EPUB-33',\n", + " 'part0319.html#x9781440619236_EPUB-34',\n", + " 'part0320.html#x9781440619236_EPUB-35',\n", + " 'part0321.html#x9781440619236_EPUB-36',\n", + " 'part0322.html#x9781440619236_EPUB-37',\n", + " 'part0323.html#x9781440619236_EPUB-38',\n", + " 'part0324.html#x9781440619236_EPUB-39',\n", + " 'part0325.html#x9781440619236_EPUB-40',\n", + " 'part0326.html#x9781440619236_EPUB-41',\n", + " 'part0327.html#x9781440619236_EPUB-42',\n", + " 'part0328.html#x9781440619236_EPUB-43',\n", + " 'part0329.html#x9781440619236_EPUB-44',\n", + " 'part0330.html#x9781440619236_EPUB-45',\n", + " 'part0331.html#x9781440619236_EPUB-46',\n", + " 'part0332.html#x9781440619236_EPUB-47',\n", + " 'part0333.html#x9781440619236_EPUB-48',\n", + " 'part0334.html#x9781440619236_EPUB-49',\n", + " 'part0335.html#x9781440619236_EPUB-50',\n", + " 'part0336.html#x9781440619236_EPUB-51',\n", + " 'part0337.html#x9781440619236_EPUB-52',\n", + " 'part0338.html#A2AU40-9d088d4ffd7e4135ab987c63f18099d5',\n", + " 'http://links.penguinrandomhouse.com/type/prhebooklanding/isbn/9780593333020/display/1',\n", + " 
# %%
href_list

# %% [markdown]
# We see that hrefs pointing inside the book begin with either 'text' or 'part'.

# %% [markdown]
# # start to end processing

# %%
# Skeleton of the merged document: <html><head>...</head><body></body></html>.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed.
output_doc = BeautifulSoup("", "html.parser")
html_tag = output_doc.new_tag("html")
output_doc.append(html_tag)

# The stylesheet link lives in <head>, and <head> must sit inside <html>
# (before <body>), not at the root of the soup.
head_tag = output_doc.new_tag("head")
link_tag = output_doc.new_tag("link", rel="stylesheet", type="text/css", href="style.css")
head_tag.append(link_tag)
html_tag.append(head_tag)
html_tag.append(output_doc.new_tag("body"))

# %%
# Merge every source file into the single <body> of the output document.
# A document may only contain one <body>, so each source body becomes a <div>
# that carries the body's attributes (including its id, which links target)
# and receives the body's children.
section_id_by_file = {}  # source file name -> id of the <div> holding its content
for html_path in html_files:
    with open(html_path, "r", encoding="utf-8") as html_file:
        section_soup = BeautifulSoup(html_file, "html.parser")

    body_content = section_soup.find("body")
    if body_content is None:
        # Nothing to merge; report it instead of failing on None.
        print(f"skipping {html_path}: no <body> found")
        continue

    div = output_doc.new_tag("div", attrs=dict(body_content.attrs))
    # Iterate over a copy: appending a node detaches it from its old parent.
    for child in list(body_content.contents):
        div.append(child)
    output_doc.body.append(div)

    body_id = body_content.get("id")
    if body_id:
        section_id_by_file[os.path.basename(html_path)] = body_id

# %%
# Turn links between the original files into links inside the merged document:
# 'part0051.html#some-id' -> '#some-id'. External links are left untouched.
INTERNAL_PREFIXES = ("text", "part")
for link in output_doc.find_all("a"):
    href = link.get("href")
    if not href or not href.startswith(INTERNAL_PREFIXES):
        continue

    target_path, _, fragment = href.partition("#")
    if fragment:
        link["href"] = f"#{fragment}"
        continue

    # No fragment: point at the section that holds the target file, if known.
    # Otherwise leave the href as it was rather than mangling it.
    section_id = section_id_by_file.get(target_path.rsplit("/", 1)[-1])
    if section_id:
        link["href"] = f"#{section_id}"

# %%
# Serialise with str() rather than prettify(): prettify() adds whitespace
# around inline tags, which changes how the book's text renders.
with open("output.html", "w", encoding="utf-8") as out_file:
    out_file.write(str(output_doc))