epub_to_html/bs4_approach.ipynb

1136 lines
52 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# process epub htmls into a single html \n",
"\n",
"Goal: merge the HTML files from an epub into a single HTML document. The links should keep working, so the hrefs and the ids of the tags they point to must be rewritten to resolve within that one document."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import glob\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Investigate the ids and hrefs of each html document\n",
"\n",
"I have already unzipped an epub document into the folder `epub` in the same directory"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Collect every html file from the unzipped epub: the top-level .xhtml\n",
"# files plus the .html files in the inner text directory.\n",
"directory = './epub'\n",
"inner_directory = './epub/text'\n",
"\n",
"# glob returns paths in arbitrary, filesystem-dependent order, so sort\n",
"# explicitly to make the merge order reproducible across machines.\n",
"# NOTE(review): the original reversed the raw glob result; reverse-sorted\n",
"# order is assumed to be the intent - confirm against your epub, since\n",
"# the ordering depends heavily on what you have.\n",
"outer_html_files = sorted(glob.glob(os.path.join(directory, '*.xhtml')), reverse=True)\n",
"inner_html_files = sorted(glob.glob(os.path.join(inner_directory, '*.html')))\n",
"html_files = outer_html_files + inner_html_files"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# parse one sample document (index 2 of the collected files) to inspect it\n",
"sample_path = html_files[2]\n",
"with open(sample_path, mode='r', encoding='utf-8') as sample_file:\n",
"    soup = BeautifulSoup(sample_file, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<body class=\"calibre\" id=\"0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" <div class=\"titlehead\">Frank Herbert's Dune Saga Collection: Books 1 - 6</div>\n",
" <div class=\"titletext\">Dune</div>\n",
" <div class=\"titletext\">Dune Messiah</div>\n",
" <div class=\"titletext\">Children of Dune</div>\n",
" <div class=\"titletext\">God Emperor of Dune</div>\n",
" <div class=\"titletext\">Heretics of Dune</div>\n",
" <div class=\"titletext\">Chapterhouse: Dune</div>\n",
" <div class=\"titleauthor\">Frank Herbert</div>\n",
" </body>]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# list every tag that carries an id attribute\n",
"soup.find_all(attrs={'id': True})"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'0-9d088d4ffd7e4135ab987c63f18099d5'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# read the id attribute off the body tag\n",
"soup.body.get('id')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# start an empty output document holding an html tag with a body inside it\n",
"output_doc = BeautifulSoup()\n",
"html_tag = output_doc.new_tag(\"html\")\n",
"html_tag.append(output_doc.new_tag(\"body\"))\n",
"output_doc.append(html_tag)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Problem: when the body of each file is appended directly, all the body tags get consolidated into one large outer body tag, so their ids disappear.\n",
"\n",
"The solution is to create a new div for each html file, then append the body to the div and then append the div into the output document"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# read 2 files and wrap each body in a div that carries the body's id\n",
"for file in html_files[3:4+1]:\n",
"    # open the file the loop is visiting (not a fixed index), and bind the\n",
"    # handle to its own name so the loop variable is not shadowed\n",
"    with open(file, 'r', encoding='utf-8') as html_file:\n",
"        soup = BeautifulSoup(html_file, 'html.parser')\n",
"    body_content = soup.find('body')\n",
"    if body_content is None:\n",
"        continue  # nothing to merge from a document without a body\n",
"    # the trick to preserve the body tag is to hide the body in a div\n",
"    div = output_doc.new_tag('div', id=body_content.get('id'))\n",
"    div.append(body_content.extract())\n",
"    # attach inside the output body so the result stays one html document\n",
"    output_doc.body.append(div)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<html>\n",
" <body>\n",
" </body>\n",
"</html>\n",
"<div id=\"UGI0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" <body class=\"calibre\" id=\"UGI0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" <h1 class=\"contents-head\" id=\"calibre_pb_0\">\n",
" Table of Contents\n",
" </h1>\n",
" <p class=\"contents-fl\">\n",
" <i class=\"calibre1\">\n",
" <a class=\"calibre2\" href=\"kindle:embed:0001?mime=image/jpg\">\n",
" Cover\n",
" </a>\n",
" </i>\n",
" </p>\n",
" <p class=\"contents-fl\">\n",
" <i class=\"calibre1\">\n",
" <a class=\"calibre2\" href=\"part0000.html#0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" Title Page\n",
" </a>\n",
" </i>\n",
" </p>\n",
" <p class=\"contents-fl\">\n",
" </p>\n",
" <div class=\"contents-fl\">\n",
" <a class=\"calibre2\" href=\"part0004.html#3Q280-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" Dune\n",
" </a>\n",
" </div>\n",
" <div class=\"contents-fl\">\n",
" <a class=\"calibre2\" href=\"part0069.html#21PMQ0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" Dune Messiah\n",
" </a>\n",
" </div>\n",
" <div class=\"contents-fl\">\n",
" <a class=\"calibre2\" href=\"part0101.html#30A8Q0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" Children of Dune\n",
" </a>\n",
" </div>\n",
" <div class=\"contents-fl\">\n",
" <a class=\"calibre2\" href=\"part0172.html#5410O0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" God Emperor of Dune\n",
" </a>\n",
" </div>\n",
" <div class=\"contents-fl\">\n",
" <a class=\"calibre2\" href=\"part0232.html#6T82G0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" Heretics of Dune\n",
" </a>\n",
" </div>\n",
" <div class=\"contents-fl\">\n",
" <a class=\"calibre2\" href=\"part0284.html#8EQVO0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" Chapterhouse: Dune\n",
" </a>\n",
" </div>\n",
" </body>\n",
"</div>\n",
"<div id=\"UGI0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" <body class=\"calibre\" id=\"UGI0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" <h1 class=\"contents-head\" id=\"calibre_pb_0\">\n",
" Table of Contents\n",
" </h1>\n",
" <p class=\"contents-fl\">\n",
" <i class=\"calibre1\">\n",
" <a class=\"calibre2\" href=\"kindle:embed:0001?mime=image/jpg\">\n",
" Cover\n",
" </a>\n",
" </i>\n",
" </p>\n",
" <p class=\"contents-fl\">\n",
" <i class=\"calibre1\">\n",
" <a class=\"calibre2\" href=\"part0000.html#0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" Title Page\n",
" </a>\n",
" </i>\n",
" </p>\n",
" <p class=\"contents-fl\">\n",
" </p>\n",
" <div class=\"contents-fl\">\n",
" <a class=\"calibre2\" href=\"part0004.html#3Q280-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" Dune\n",
" </a>\n",
" </div>\n",
" <div class=\"contents-fl\">\n",
" <a class=\"calibre2\" href=\"part0069.html#21PMQ0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" Dune Messiah\n",
" </a>\n",
" </div>\n",
" <div class=\"contents-fl\">\n",
" <a class=\"calibre2\" href=\"part0101.html#30A8Q0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" Children of Dune\n",
" </a>\n",
" </div>\n",
" <div class=\"contents-fl\">\n",
" <a class=\"calibre2\" href=\"part0172.html#5410O0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" God Emperor of Dune\n",
" </a>\n",
" </div>\n",
" <div class=\"contents-fl\">\n",
" <a class=\"calibre2\" href=\"part0232.html#6T82G0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" Heretics of Dune\n",
" </a>\n",
" </div>\n",
" <div class=\"contents-fl\">\n",
" <a class=\"calibre2\" href=\"part0284.html#8EQVO0-9d088d4ffd7e4135ab987c63f18099d5\">\n",
" Chapterhouse: Dune\n",
" </a>\n",
" </div>\n",
" </body>\n",
"</div>\n",
"\n"
]
}
],
"source": [
"# show the merged document as indented markup\n",
"pretty_markup = output_doc.prettify()\n",
"print(pretty_markup)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notice how the `body` is preserved from each html file\n",
"\n",
"Because each wrapper div keeps the original id, hrefs can still reach that section of the document.\n",
"\n",
"However, notice that the hrefs have the form \"partX.html#id\": they still point at the separate source files rather than at an id inside the merged document.\n",
"\n",
"Let's try to find all the href variants"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# reset the output document to an empty html tag with a body inside it\n",
"output_doc = BeautifulSoup()\n",
"html_tag = output_doc.new_tag(\"html\")\n",
"html_tag.append(output_doc.new_tag(\"body\"))\n",
"output_doc.append(html_tag)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# merge every collected file into the output document\n",
"for file in html_files:\n",
"    # decode as utf-8 explicitly, like the earlier cells, instead of\n",
"    # relying on the platform default encoding\n",
"    with open(file, 'r', encoding='utf-8') as html_file:\n",
"        soup = BeautifulSoup(html_file, \"html.parser\")\n",
"    body_content = soup.find('body')\n",
"    if body_content is None:\n",
"        continue  # nothing to merge from a document without a body\n",
"    # the trick to preserve the id is to hide the body in a div with the same id\n",
"    div = output_doc.new_tag('div', id=body_content.get('id'))\n",
"    div.append(body_content.extract())\n",
"    # attach inside the output body so the result stays one html document\n",
"    output_doc.body.append(div)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# gather the href of every anchor; anchors without an href yield None\n",
"href_list = []\n",
"for anchor in output_doc.find_all('a'):\n",
"    href_list.append(anchor.get('href'))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['text/part0000.html#0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0001.html#UGI0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0002.html#x9781101157879_EPUB',\n",
" 'text/part0003_split_000.html#x9781101157879_EPUB-1',\n",
" 'text/part0004.html#3Q280-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0005.html#4OIQ0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0006.html#5N3C0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0007.html#6LJU0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0008.html#7K4G0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0009.html#8IL20-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0010.html#9H5K0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0011.html#AFM60-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0012.html#BE6O0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0013.html#CCNA0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0014.html#DB7S0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0015.html#E9OE0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0016.html#F8900-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0017.html#G6PI0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0018.html#H5A40-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0019.html#I3QM0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0020.html#J2B80-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0021.html#K0RQ0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0022.html#KVCC0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0023.html#LTSU0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0024.html#MSDG0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0025.html#NQU20-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0026.html#OPEK0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0027.html#PNV60-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0028.html#QMFO0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0029.html#RL0A0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0030.html#SJGS0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0031.html#TI1E0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0032.html#UGI00-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0033.html#VF2I0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0034.html#10DJ40-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0035.html#11C3M0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0036.html#12AK80-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0037.html#1394Q0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0038.html#147LC0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0039.html#1565U0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0040.html#164MG0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0041.html#173720-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0042.html#181NK0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0043.html#190860-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0044.html#19UOO0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0045.html#1AT9A0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0046.html#1BRPS0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0047.html#1CQAE0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0048.html#1DOR00-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0049.html#1ENBI0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0050.html#1FLS40-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0051.html#1GKCM0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0052.html#1HIT80-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0053.html#1IHDQ0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0054.html#1JFUC0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0055.html#1KEEU0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0056.html#1LCVG0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0057.html#1MBG20-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0058.html#1NA0K0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0059.html#1O8H60-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0061.html#1Q5IA0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0062.html#1R42S0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0063.html#1S2JE0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0064.html#1T1400-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0065.html#1TVKI0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0066.html#1UU540-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0067.html#1VSLM0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0068.html#20R680-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0069.html#21PMQ0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0070.html#x9781101157879_EPUB-4',\n",
" 'text/part0071.html#x9781101157879_EPUB-2',\n",
" 'text/part0072.html#24L8G0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0073.html#x9781101157879_EPUB-3',\n",
" 'text/part0074.html#x9781101157879_EPUB-5',\n",
" 'text/part0075.html#x9781101157879_EPUB-6',\n",
" 'text/part0076.html#x9781101157879_EPUB-7',\n",
" 'text/part0077.html#x9781101157879_EPUB-8',\n",
" 'text/part0078.html#x9781101157879_EPUB-9',\n",
" 'text/part0079.html#x9781101157879_EPUB-10',\n",
" 'text/part0080.html#x9781101157879_EPUB-11',\n",
" 'text/part0081.html#x9781101157879_EPUB-12',\n",
" 'text/part0082.html#x9781101157879_EPUB-13',\n",
" 'text/part0083.html#x9781101157879_EPUB-14',\n",
" 'text/part0084.html#x9781101157879_EPUB-15',\n",
" 'text/part0085.html#x9781101157879_EPUB-16',\n",
" 'text/part0086.html#x9781101157879_EPUB-17',\n",
" 'text/part0087.html#x9781101157879_EPUB-18',\n",
" 'text/part0088.html#x9781101157879_EPUB-19',\n",
" 'text/part0089.html#x9781101157879_EPUB-20',\n",
" 'text/part0090.html#x9781101157879_EPUB-21',\n",
" 'text/part0091.html#x9781101157879_EPUB-22',\n",
" 'text/part0092.html#x9781101157879_EPUB-23',\n",
" 'text/part0093.html#x9781101157879_EPUB-24',\n",
" 'text/part0094.html#x9781101157879_EPUB-25',\n",
" 'text/part0095.html#x9781101157879_EPUB-26',\n",
" 'text/part0096.html#x9781101157879_EPUB-27',\n",
" 'text/part0097.html#x9781101157879_EPUB-28',\n",
" 'text/part0098.html#x9781101157879_EPUB-29',\n",
" 'text/part0099.html#x9781101157879_EPUB-30',\n",
" 'text/part0101.html#30A8Q0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0102.html#x9781440630514_EPUB-2',\n",
" 'text/part0103.html#x9781440630514_EPUB-3',\n",
" 'text/part0104.html#x9781440630514_EPUB-4',\n",
" 'text/part0105.html#344B20-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0106.html#x9781440630514_EPUB-5',\n",
" 'text/part0107.html#x9781440630514_EPUB-6',\n",
" 'text/part0108.html#x9781440630514_EPUB-7',\n",
" 'text/part0109.html#x9781440630514_EPUB-8',\n",
" 'text/part0110.html#x9781440630514_EPUB-9',\n",
" 'text/part0111.html#x9781440630514_EPUB-10',\n",
" 'text/part0112.html#x9781440630514_EPUB-11',\n",
" 'text/part0113.html#x9781440630514_EPUB-12',\n",
" 'text/part0114.html#x9781440630514_EPUB-13',\n",
" 'text/part0115.html#x9781440630514_EPUB-14',\n",
" 'text/part0116.html#x9781440630514_EPUB-15',\n",
" 'text/part0117.html#x9781440630514_EPUB-16',\n",
" 'text/part0118.html#x9781440630514_EPUB-17',\n",
" 'text/part0119.html#x9781440630514_EPUB-18',\n",
" 'text/part0120.html#x9781440630514_EPUB-19',\n",
" 'text/part0121.html#x9781440630514_EPUB-20',\n",
" 'text/part0122.html#x9781440630514_EPUB-21',\n",
" 'text/part0123.html#x9781440630514_EPUB-22',\n",
" 'text/part0124.html#x9781440630514_EPUB-23',\n",
" 'text/part0125.html#x9781440630514_EPUB-24',\n",
" 'text/part0126.html#x9781440630514_EPUB-25',\n",
" 'text/part0127.html#x9781440630514_EPUB-26',\n",
" 'text/part0128.html#x9781440630514_EPUB-27',\n",
" 'text/part0129.html#x9781440630514_EPUB-28',\n",
" 'text/part0130.html#x9781440630514_EPUB-29',\n",
" 'text/part0131.html#x9781440630514_EPUB-30',\n",
" 'text/part0132.html#x9781440630514_EPUB-31',\n",
" 'text/part0133.html#x9781440630514_EPUB-32',\n",
" 'text/part0134.html#x9781440630514_EPUB-33',\n",
" 'text/part0135.html#x9781440630514_EPUB-34',\n",
" 'text/part0136.html#x9781440630514_EPUB-35',\n",
" 'text/part0137.html#x9781440630514_EPUB-36',\n",
" 'text/part0138.html#x9781440630514_EPUB-37',\n",
" 'text/part0139.html#x9781440630514_EPUB-38',\n",
" 'text/part0140.html#x9781440630514_EPUB-39',\n",
" 'text/part0141.html#x9781440630514_EPUB-40',\n",
" 'text/part0142.html#x9781440630514_EPUB-41',\n",
" 'text/part0143.html#x9781440630514_EPUB-42',\n",
" 'text/part0144.html#x9781440630514_EPUB-43',\n",
" 'text/part0145.html#x9781440630514_EPUB-44',\n",
" 'text/part0146.html#x9781440630514_EPUB-45',\n",
" 'text/part0147.html#x9781440630514_EPUB-46',\n",
" 'text/part0148.html#x9781440630514_EPUB-47',\n",
" 'text/part0149.html#x9781440630514_EPUB-48',\n",
" 'text/part0150.html#x9781440630514_EPUB-49',\n",
" 'text/part0151.html#x9781440630514_EPUB-50',\n",
" 'text/part0152.html#x9781440630514_EPUB-51',\n",
" 'text/part0153.html#x9781440630514_EPUB-52',\n",
" 'text/part0154.html#x9781440630514_EPUB-53',\n",
" 'text/part0155.html#x9781440630514_EPUB-54',\n",
" 'text/part0156.html#x9781440630514_EPUB-55',\n",
" 'text/part0157.html#x9781440630514_EPUB-56',\n",
" 'text/part0158.html#x9781440630514_EPUB-57',\n",
" 'text/part0159.html#x9781440630514_EPUB-58',\n",
" 'text/part0160.html#x9781440630514_EPUB-59',\n",
" 'text/part0161.html#x9781440630514_EPUB-60',\n",
" 'text/part0162.html#x9781440630514_EPUB-61',\n",
" 'text/part0163.html#x9781440630514_EPUB-62',\n",
" 'text/part0164.html#x9781440630514_EPUB-63',\n",
" 'text/part0165.html#x9781440630514_EPUB-64',\n",
" 'text/part0166.html#x9781440630514_EPUB-65',\n",
" 'text/part0167.html#x9781440630514_EPUB-66',\n",
" 'text/part0168.html#x9781440630514_EPUB-67',\n",
" 'text/part0169.html#x9781440630514_EPUB-68',\n",
" 'text/part0170.html#x9781440630514_EPUB-69',\n",
" 'text/part0171.html#x9781440630514_EPUB-70',\n",
" 'text/part0172.html#5410O0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0173.html#x9781440631979_EPUB-2',\n",
" 'text/part0174.html#x9781440631979_EPUB-3',\n",
" 'text/part0175.html#x9781440631979_EPUB-4',\n",
" 'text/part0176.html#57R300-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0177.html#x9781440631979_EPUB-5',\n",
" 'text/part0179.html#x9781440631979_EPUB-7',\n",
" 'text/part0182.html#x9781440631979_EPUB-10',\n",
" 'text/part0183.html#x9781440631979_EPUB-11',\n",
" 'text/part0184.html#x9781440631979_EPUB-12',\n",
" 'text/part0185.html#x9781440631979_EPUB-13',\n",
" 'text/part0186.html#x9781440631979_EPUB-14',\n",
" 'text/part0187_split_000.html#x9781440631979_EPUB-15',\n",
" 'text/part0188.html#x9781440631979_EPUB-16',\n",
" 'text/part0189_split_000.html#x9781440631979_EPUB-17',\n",
" 'text/part0190.html#x9781440631979_EPUB-18',\n",
" 'text/part0191.html#x9781440631979_EPUB-19',\n",
" 'text/part0192.html#x9781440631979_EPUB-20',\n",
" 'text/part0193.html#x9781440631979_EPUB-21',\n",
" 'text/part0194.html#x9781440631979_EPUB-22',\n",
" 'text/part0195.html#x9781440631979_EPUB-23',\n",
" 'text/part0196_split_000.html#x9781440631979_EPUB-24',\n",
" 'text/part0197.html#x9781440631979_EPUB-25',\n",
" 'text/part0198.html#x9781440631979_EPUB-26',\n",
" 'text/part0199.html#x9781440631979_EPUB-27',\n",
" 'text/part0200.html#x9781440631979_EPUB-28',\n",
" 'text/part0201.html#x9781440631979_EPUB-29',\n",
" 'text/part0202.html#x9781440631979_EPUB-30',\n",
" 'text/part0203.html#x9781440631979_EPUB-31',\n",
" 'text/part0204.html#x9781440631979_EPUB-32',\n",
" 'text/part0205.html#x9781440631979_EPUB-33',\n",
" 'text/part0206.html#x9781440631979_EPUB-34',\n",
" 'text/part0207.html#x9781440631979_EPUB-35',\n",
" 'text/part0208.html#x9781440631979_EPUB-36',\n",
" 'text/part0209.html#x9781440631979_EPUB-37',\n",
" 'text/part0210.html#x9781440631979_EPUB-38',\n",
" 'text/part0211.html#x9781440631979_EPUB-39',\n",
" 'text/part0212.html#x9781440631979_EPUB-40',\n",
" 'text/part0213.html#x9781440631979_EPUB-41',\n",
" 'text/part0214.html#x9781440631979_EPUB-42',\n",
" 'text/part0215.html#x9781440631979_EPUB-43',\n",
" 'text/part0216.html#x9781440631979_EPUB-44',\n",
" 'text/part0217.html#x9781440631979_EPUB-45',\n",
" 'text/part0218.html#x9781440631979_EPUB-46',\n",
" 'text/part0219.html#x9781440631979_EPUB-47',\n",
" 'text/part0220.html#x9781440631979_EPUB-48',\n",
" 'text/part0221.html#x9781440631979_EPUB-49',\n",
" 'text/part0222.html#x9781440631979_EPUB-50',\n",
" 'text/part0223.html#x9781440631979_EPUB-51',\n",
" 'text/part0224.html#x9781440631979_EPUB-52',\n",
" 'text/part0225.html#x9781440631979_EPUB-53',\n",
" 'text/part0226.html#x9781440631979_EPUB-54',\n",
" 'text/part0227.html#x9781440631979_EPUB-55',\n",
" 'text/part0228.html#x9781440631979_EPUB-56',\n",
" 'text/part0229.html#x9781440631979_EPUB-57',\n",
" 'text/part0230.html#x9781440631979_EPUB-58',\n",
" 'text/part0232.html#6T82G0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0233.html#x9781440619649_EPUB-2',\n",
" 'text/part0234.html#x9781440619649_EPUB-3',\n",
" 'text/part0235.html#703K60-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0236.html#x9781440619649_EPUB-4',\n",
" 'text/part0237.html#x9781440619649_EPUB-5',\n",
" 'text/part0238.html#x9781440619649_EPUB-6',\n",
" 'text/part0239.html#x9781440619649_EPUB-7',\n",
" 'text/part0240.html#x9781440619649_EPUB-8',\n",
" 'text/part0241.html#x9781440619649_EPUB-9',\n",
" 'text/part0242.html#x9781440619649_EPUB-10',\n",
" 'text/part0243.html#x9781440619649_EPUB-11',\n",
" 'text/part0244.html#x9781440619649_EPUB-12',\n",
" 'text/part0245.html#x9781440619649_EPUB-13',\n",
" 'text/part0246.html#x9781440619649_EPUB-14',\n",
" 'text/part0247.html#x9781440619649_EPUB-15',\n",
" 'text/part0248.html#x9781440619649_EPUB-16',\n",
" 'text/part0249.html#x9781440619649_EPUB-17',\n",
" 'text/part0250.html#x9781440619649_EPUB-18',\n",
" 'text/part0251.html#x9781440619649_EPUB-19',\n",
" 'text/part0252.html#x9781440619649_EPUB-20',\n",
" 'text/part0253.html#x9781440619649_EPUB-21',\n",
" 'text/part0254.html#x9781440619649_EPUB-22',\n",
" 'text/part0255.html#x9781440619649_EPUB-23',\n",
" 'text/part0256.html#x9781440619649_EPUB-24',\n",
" 'text/part0257.html#x9781440619649_EPUB-25',\n",
" 'text/part0258.html#x9781440619649_EPUB-26',\n",
" 'text/part0259.html#x9781440619649_EPUB-27',\n",
" 'text/part0260.html#x9781440619649_EPUB-28',\n",
" 'text/part0261.html#x9781440619649_EPUB-29',\n",
" 'text/part0262.html#x9781440619649_EPUB-30',\n",
" 'text/part0263.html#x9781440619649_EPUB-31',\n",
" 'text/part0264.html#x9781440619649_EPUB-32',\n",
" 'text/part0265.html#x9781440619649_EPUB-33',\n",
" 'text/part0266.html#x9781440619649_EPUB-34',\n",
" 'text/part0267.html#x9781440619649_EPUB-35',\n",
" 'text/part0268.html#x9781440619649_EPUB-36',\n",
" 'text/part0269.html#x9781440619649_EPUB-37',\n",
" 'text/part0270.html#x9781440619649_EPUB-38',\n",
" 'text/part0271.html#x9781440619649_EPUB-39',\n",
" 'text/part0272.html#x9781440619649_EPUB-40',\n",
" 'text/part0273.html#x9781440619649_EPUB-41',\n",
" 'text/part0274.html#x9781440619649_EPUB-42',\n",
" 'text/part0275.html#x9781440619649_EPUB-43',\n",
" 'text/part0276.html#x9781440619649_EPUB-44',\n",
" 'text/part0277.html#x9781440619649_EPUB-45',\n",
" 'text/part0278.html#x9781440619649_EPUB-46',\n",
" 'text/part0279.html#x9781440619649_EPUB-47',\n",
" 'text/part0280.html#x9781440619649_EPUB-48',\n",
" 'text/part0281.html#x9781440619649_EPUB-49',\n",
" 'text/part0282.html#x9781440619649_EPUB-50',\n",
" 'text/part0283.html#x9781440619649_EPUB-51',\n",
" 'text/part0284.html#8EQVO0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0285.html#x9781440619236_EPUB-1',\n",
" 'text/part0286.html#x9781440619236_EPUB-2',\n",
" 'text/part0287.html#8HMHE0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'text/part0288.html#x9781440619236_EPUB-3',\n",
" 'text/part0289.html#x9781440619236_EPUB-4',\n",
" 'text/part0290.html#x9781440619236_EPUB-5',\n",
" 'text/part0291.html#x9781440619236_EPUB-6',\n",
" 'text/part0292.html#x9781440619236_EPUB-7',\n",
" 'text/part0293.html#x9781440619236_EPUB-8',\n",
" 'text/part0294.html#x9781440619236_EPUB-9',\n",
" 'text/part0295.html#x9781440619236_EPUB-10',\n",
" 'text/part0296.html#x9781440619236_EPUB-11',\n",
" 'text/part0297.html#x9781440619236_EPUB-12',\n",
" 'text/part0298.html#x9781440619236_EPUB-13',\n",
" 'text/part0299.html#x9781440619236_EPUB-14',\n",
" 'text/part0300.html#x9781440619236_EPUB-15',\n",
" 'text/part0301.html#x9781440619236_EPUB-16',\n",
" 'text/part0302.html#x9781440619236_EPUB-17',\n",
" 'text/part0303.html#x9781440619236_EPUB-18',\n",
" 'text/part0304.html#x9781440619236_EPUB-19',\n",
" 'text/part0305.html#x9781440619236_EPUB-20',\n",
" 'text/part0306.html#x9781440619236_EPUB-21',\n",
" 'text/part0307.html#x9781440619236_EPUB-22',\n",
" 'text/part0308.html#x9781440619236_EPUB-23',\n",
" 'text/part0309.html#x9781440619236_EPUB-24',\n",
" 'text/part0310.html#x9781440619236_EPUB-25',\n",
" 'text/part0311.html#x9781440619236_EPUB-26',\n",
" 'text/part0312.html#x9781440619236_EPUB-27',\n",
" 'text/part0313.html#x9781440619236_EPUB-28',\n",
" 'text/part0314.html#x9781440619236_EPUB-29',\n",
" 'text/part0315.html#x9781440619236_EPUB-30',\n",
" 'text/part0316.html#x9781440619236_EPUB-31',\n",
" 'text/part0317.html#x9781440619236_EPUB-32',\n",
" 'text/part0318.html#x9781440619236_EPUB-33',\n",
" 'text/part0319.html#x9781440619236_EPUB-34',\n",
" 'text/part0320.html#x9781440619236_EPUB-35',\n",
" 'text/part0321.html#x9781440619236_EPUB-36',\n",
" 'text/part0322.html#x9781440619236_EPUB-37',\n",
" 'text/part0323.html#x9781440619236_EPUB-38',\n",
" 'text/part0324.html#x9781440619236_EPUB-39',\n",
" 'text/part0325.html#x9781440619236_EPUB-40',\n",
" 'text/part0326.html#x9781440619236_EPUB-41',\n",
" 'text/part0327.html#x9781440619236_EPUB-42',\n",
" 'text/part0328.html#x9781440619236_EPUB-43',\n",
" 'text/part0329.html#x9781440619236_EPUB-44',\n",
" 'text/part0330.html#x9781440619236_EPUB-45',\n",
" 'text/part0331.html#x9781440619236_EPUB-46',\n",
" 'text/part0332.html#x9781440619236_EPUB-47',\n",
" 'text/part0333.html#x9781440619236_EPUB-48',\n",
" 'text/part0334.html#x9781440619236_EPUB-49',\n",
" 'text/part0335.html#x9781440619236_EPUB-50',\n",
" 'text/part0336.html#x9781440619236_EPUB-51',\n",
" 'text/part0337.html#x9781440619236_EPUB-52',\n",
" 'text/part0338.html#A2AU40-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'kindle:embed:0001?mime=image/jpg',\n",
" 'part0000.html#0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0004.html#3Q280-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0069.html#21PMQ0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0101.html#30A8Q0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0172.html#5410O0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0232.html#6T82G0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0284.html#8EQVO0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'http://penguinrandomhouse.com',\n",
" 'part0009.html#8IL20-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0010.html#9H5K0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0011.html#AFM60-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0012.html#BE6O0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0013.html#CCNA0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0014.html#DB7S0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0015.html#E9OE0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0016.html#F8900-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0017.html#G6PI0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0018.html#H5A40-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0019.html#I3QM0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0020.html#J2B80-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0021.html#K0RQ0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0022.html#KVCC0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0023.html#LTSU0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0024.html#MSDG0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0025.html#NQU20-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0026.html#OPEK0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0027.html#PNV60-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0028.html#QMFO0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0029.html#RL0A0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0030.html#SJGS0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0031.html#TI1E0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0032.html#UGI00-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0033.html#VF2I0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0034.html#10DJ40-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0035.html#11C3M0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0036.html#12AK80-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0037.html#1394Q0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0038.html#147LC0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0039.html#1565U0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0040.html#164MG0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0041.html#173720-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0042.html#181NK0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0043.html#190860-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0044.html#19UOO0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0045.html#1AT9A0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0046.html#1BRPS0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0047.html#1CQAE0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0048.html#1DOR00-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0049.html#1ENBI0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0050.html#1FLS40-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0051.html#1GKCM0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0052.html#1HIT80-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0053.html#1IHDQ0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0054.html#1JFUC0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0055.html#1KEEU0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0056.html#1LCVG0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0057.html#1MBG20-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0058.html#1NA0K0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0059.html#1O8H60-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0061.html#1Q5IA0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0062.html#1R42S0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0063.html#1S2JE0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0064.html#1T1400-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0065.html#1TVKI0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0066.html#1UU540-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0067.html#1VSLM0-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'part0068.html#20R680-9d088d4ffd7e4135ab987c63f18099d5',\n",
" None,\n",
" None,\n",
" 'http://penguinrandomhouse.com',\n",
" 'part0070.html#x9781101157879_EPUB-4',\n",
" 'part0071.html#x9781101157879_EPUB-2',\n",
" 'part0073.html#x9781101157879_EPUB-3',\n",
" 'part0074.html#x9781101157879_EPUB-5',\n",
" 'part0075.html#x9781101157879_EPUB-6',\n",
" 'part0076.html#x9781101157879_EPUB-7',\n",
" 'part0077.html#x9781101157879_EPUB-8',\n",
" 'part0078.html#x9781101157879_EPUB-9',\n",
" 'part0079.html#x9781101157879_EPUB-10',\n",
" 'part0080.html#x9781101157879_EPUB-11',\n",
" 'part0081.html#x9781101157879_EPUB-12',\n",
" 'part0082.html#x9781101157879_EPUB-13',\n",
" 'part0083.html#x9781101157879_EPUB-14',\n",
" 'part0084.html#x9781101157879_EPUB-15',\n",
" 'part0085.html#x9781101157879_EPUB-16',\n",
" 'part0086.html#x9781101157879_EPUB-17',\n",
" 'part0087.html#x9781101157879_EPUB-18',\n",
" 'part0088.html#x9781101157879_EPUB-19',\n",
" 'part0089.html#x9781101157879_EPUB-20',\n",
" 'part0090.html#x9781101157879_EPUB-21',\n",
" 'part0091.html#x9781101157879_EPUB-22',\n",
" 'part0092.html#x9781101157879_EPUB-23',\n",
" 'part0093.html#x9781101157879_EPUB-24',\n",
" 'part0094.html#x9781101157879_EPUB-25',\n",
" 'part0095.html#x9781101157879_EPUB-26',\n",
" 'part0096.html#x9781101157879_EPUB-27',\n",
" 'part0097.html#x9781101157879_EPUB-28',\n",
" 'part0098.html#x9781101157879_EPUB-29',\n",
" 'part0099.html#x9781101157879_EPUB-30',\n",
" 'part0100.html#footnote_1',\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" 'part0073.html#footnote-000-backlink',\n",
" None,\n",
" None,\n",
" 'http://penguinrandomhouse.com',\n",
" None,\n",
" 'part0102.html#x9781440630514_EPUB-2',\n",
" 'part0103.html#x9781440630514_EPUB-3',\n",
" 'part0104.html#x9781440630514_EPUB-4',\n",
" 'part0106.html#x9781440630514_EPUB-5',\n",
" 'part0107.html#x9781440630514_EPUB-6',\n",
" 'part0108.html#x9781440630514_EPUB-7',\n",
" 'part0109.html#x9781440630514_EPUB-8',\n",
" 'part0110.html#x9781440630514_EPUB-9',\n",
" 'part0111.html#x9781440630514_EPUB-10',\n",
" 'part0112.html#x9781440630514_EPUB-11',\n",
" 'part0113.html#x9781440630514_EPUB-12',\n",
" 'part0114.html#x9781440630514_EPUB-13',\n",
" 'part0115.html#x9781440630514_EPUB-14',\n",
" 'part0116.html#x9781440630514_EPUB-15',\n",
" 'part0117.html#x9781440630514_EPUB-16',\n",
" 'part0118.html#x9781440630514_EPUB-17',\n",
" 'part0119.html#x9781440630514_EPUB-18',\n",
" 'part0120.html#x9781440630514_EPUB-19',\n",
" 'part0121.html#x9781440630514_EPUB-20',\n",
" 'part0122.html#x9781440630514_EPUB-21',\n",
" 'part0123.html#x9781440630514_EPUB-22',\n",
" 'part0124.html#x9781440630514_EPUB-23',\n",
" 'part0125.html#x9781440630514_EPUB-24',\n",
" 'part0126.html#x9781440630514_EPUB-25',\n",
" 'part0127.html#x9781440630514_EPUB-26',\n",
" 'part0128.html#x9781440630514_EPUB-27',\n",
" 'part0129.html#x9781440630514_EPUB-28',\n",
" 'part0130.html#x9781440630514_EPUB-29',\n",
" 'part0131.html#x9781440630514_EPUB-30',\n",
" 'part0132.html#x9781440630514_EPUB-31',\n",
" 'part0133.html#x9781440630514_EPUB-32',\n",
" 'part0134.html#x9781440630514_EPUB-33',\n",
" 'part0135.html#x9781440630514_EPUB-34',\n",
" 'part0136.html#x9781440630514_EPUB-35',\n",
" 'part0137.html#x9781440630514_EPUB-36',\n",
" 'part0138.html#x9781440630514_EPUB-37',\n",
" 'part0139.html#x9781440630514_EPUB-38',\n",
" 'part0140.html#x9781440630514_EPUB-39',\n",
" 'part0141.html#x9781440630514_EPUB-40',\n",
" 'part0142.html#x9781440630514_EPUB-41',\n",
" 'part0143.html#x9781440630514_EPUB-42',\n",
" 'part0144.html#x9781440630514_EPUB-43',\n",
" 'part0145.html#x9781440630514_EPUB-44',\n",
" 'part0146.html#x9781440630514_EPUB-45',\n",
" 'part0147.html#x9781440630514_EPUB-46',\n",
" 'part0148.html#x9781440630514_EPUB-47',\n",
" 'part0149.html#x9781440630514_EPUB-48',\n",
" 'part0150.html#x9781440630514_EPUB-49',\n",
" 'part0151.html#x9781440630514_EPUB-50',\n",
" 'part0152.html#x9781440630514_EPUB-51',\n",
" 'part0153.html#x9781440630514_EPUB-52',\n",
" 'part0154.html#x9781440630514_EPUB-53',\n",
" 'part0155.html#x9781440630514_EPUB-54',\n",
" 'part0156.html#x9781440630514_EPUB-55',\n",
" 'part0157.html#x9781440630514_EPUB-56',\n",
" 'part0158.html#x9781440630514_EPUB-57',\n",
" 'part0159.html#x9781440630514_EPUB-58',\n",
" 'part0160.html#x9781440630514_EPUB-59',\n",
" 'part0161.html#x9781440630514_EPUB-60',\n",
" 'part0162.html#x9781440630514_EPUB-61',\n",
" 'part0163.html#x9781440630514_EPUB-62',\n",
" 'part0164.html#x9781440630514_EPUB-63',\n",
" 'part0165.html#x9781440630514_EPUB-64',\n",
" 'part0166.html#x9781440630514_EPUB-65',\n",
" 'part0167.html#x9781440630514_EPUB-66',\n",
" 'part0168.html#x9781440630514_EPUB-67',\n",
" 'part0169.html#x9781440630514_EPUB-68',\n",
" 'part0170.html#x9781440630514_EPUB-69',\n",
" 'part0171.html#x9781440630514_EPUB-70',\n",
" None,\n",
" None,\n",
" 'part0173.html#x9781440631979_EPUB-2',\n",
" 'part0174.html#x9781440631979_EPUB-3',\n",
" 'part0175.html#x9781440631979_EPUB-4',\n",
" 'part0177.html#x9781440631979_EPUB-5',\n",
" 'part0179.html#x9781440631979_EPUB-7',\n",
" 'part0182.html#x9781440631979_EPUB-10',\n",
" 'part0183.html#x9781440631979_EPUB-11',\n",
" 'part0184.html#x9781440631979_EPUB-12',\n",
" 'part0185.html#x9781440631979_EPUB-13',\n",
" 'part0186.html#x9781440631979_EPUB-14',\n",
" 'part0187_split_000.html#x9781440631979_EPUB-15',\n",
" 'part0188.html#x9781440631979_EPUB-16',\n",
" 'part0189_split_000.html#x9781440631979_EPUB-17',\n",
" 'part0190.html#x9781440631979_EPUB-18',\n",
" 'part0191.html#x9781440631979_EPUB-19',\n",
" 'part0192.html#x9781440631979_EPUB-20',\n",
" 'part0193.html#x9781440631979_EPUB-21',\n",
" 'part0194.html#x9781440631979_EPUB-22',\n",
" 'part0195.html#x9781440631979_EPUB-23',\n",
" 'part0196_split_000.html#x9781440631979_EPUB-24',\n",
" 'part0197.html#x9781440631979_EPUB-25',\n",
" 'part0198.html#x9781440631979_EPUB-26',\n",
" 'part0199.html#x9781440631979_EPUB-27',\n",
" 'part0200.html#x9781440631979_EPUB-28',\n",
" 'part0201.html#x9781440631979_EPUB-29',\n",
" 'part0202.html#x9781440631979_EPUB-30',\n",
" 'part0203.html#x9781440631979_EPUB-31',\n",
" 'part0204.html#x9781440631979_EPUB-32',\n",
" 'part0205.html#x9781440631979_EPUB-33',\n",
" 'part0206.html#x9781440631979_EPUB-34',\n",
" 'part0207.html#x9781440631979_EPUB-35',\n",
" 'part0208.html#x9781440631979_EPUB-36',\n",
" 'part0209.html#x9781440631979_EPUB-37',\n",
" 'part0210.html#x9781440631979_EPUB-38',\n",
" 'part0211.html#x9781440631979_EPUB-39',\n",
" 'part0212.html#x9781440631979_EPUB-40',\n",
" 'part0213.html#x9781440631979_EPUB-41',\n",
" 'part0214.html#x9781440631979_EPUB-42',\n",
" 'part0215.html#x9781440631979_EPUB-43',\n",
" 'part0216.html#x9781440631979_EPUB-44',\n",
" 'part0217.html#x9781440631979_EPUB-45',\n",
" 'part0218.html#x9781440631979_EPUB-46',\n",
" 'part0219.html#x9781440631979_EPUB-47',\n",
" 'part0220.html#x9781440631979_EPUB-48',\n",
" 'part0221.html#x9781440631979_EPUB-49',\n",
" 'part0222.html#x9781440631979_EPUB-50',\n",
" 'part0223.html#x9781440631979_EPUB-51',\n",
" 'part0224.html#x9781440631979_EPUB-52',\n",
" 'part0225.html#x9781440631979_EPUB-53',\n",
" 'part0226.html#x9781440631979_EPUB-54',\n",
" 'part0227.html#x9781440631979_EPUB-55',\n",
" 'part0228.html#x9781440631979_EPUB-56',\n",
" 'part0229.html#x9781440631979_EPUB-57',\n",
" 'part0230.html#x9781440631979_EPUB-58',\n",
" None,\n",
" None,\n",
" 'part0233.html#x9781440619649_EPUB-2',\n",
" 'part0234.html#x9781440619649_EPUB-3',\n",
" 'part0236.html#x9781440619649_EPUB-4',\n",
" 'part0237.html#x9781440619649_EPUB-5',\n",
" 'part0238.html#x9781440619649_EPUB-6',\n",
" 'part0239.html#x9781440619649_EPUB-7',\n",
" 'part0240.html#x9781440619649_EPUB-8',\n",
" 'part0241.html#x9781440619649_EPUB-9',\n",
" 'part0242.html#x9781440619649_EPUB-10',\n",
" 'part0243.html#x9781440619649_EPUB-11',\n",
" 'part0244.html#x9781440619649_EPUB-12',\n",
" 'part0245.html#x9781440619649_EPUB-13',\n",
" 'part0246.html#x9781440619649_EPUB-14',\n",
" 'part0247.html#x9781440619649_EPUB-15',\n",
" 'part0248.html#x9781440619649_EPUB-16',\n",
" 'part0249.html#x9781440619649_EPUB-17',\n",
" 'part0250.html#x9781440619649_EPUB-18',\n",
" 'part0251.html#x9781440619649_EPUB-19',\n",
" 'part0252.html#x9781440619649_EPUB-20',\n",
" 'part0253.html#x9781440619649_EPUB-21',\n",
" 'part0254.html#x9781440619649_EPUB-22',\n",
" 'part0255.html#x9781440619649_EPUB-23',\n",
" 'part0256.html#x9781440619649_EPUB-24',\n",
" 'part0257.html#x9781440619649_EPUB-25',\n",
" 'part0258.html#x9781440619649_EPUB-26',\n",
" 'part0259.html#x9781440619649_EPUB-27',\n",
" 'part0260.html#x9781440619649_EPUB-28',\n",
" 'part0261.html#x9781440619649_EPUB-29',\n",
" 'part0262.html#x9781440619649_EPUB-30',\n",
" 'part0263.html#x9781440619649_EPUB-31',\n",
" 'part0264.html#x9781440619649_EPUB-32',\n",
" 'part0265.html#x9781440619649_EPUB-33',\n",
" 'part0266.html#x9781440619649_EPUB-34',\n",
" 'part0267.html#x9781440619649_EPUB-35',\n",
" 'part0268.html#x9781440619649_EPUB-36',\n",
" 'part0269.html#x9781440619649_EPUB-37',\n",
" 'part0270.html#x9781440619649_EPUB-38',\n",
" 'part0271.html#x9781440619649_EPUB-39',\n",
" 'part0272.html#x9781440619649_EPUB-40',\n",
" 'part0273.html#x9781440619649_EPUB-41',\n",
" 'part0274.html#x9781440619649_EPUB-42',\n",
" 'part0275.html#x9781440619649_EPUB-43',\n",
" 'part0276.html#x9781440619649_EPUB-44',\n",
" 'part0277.html#x9781440619649_EPUB-45',\n",
" 'part0278.html#x9781440619649_EPUB-46',\n",
" 'part0279.html#x9781440619649_EPUB-47',\n",
" 'part0280.html#x9781440619649_EPUB-48',\n",
" 'part0281.html#x9781440619649_EPUB-49',\n",
" 'part0282.html#x9781440619649_EPUB-50',\n",
" 'part0283.html#x9781440619649_EPUB-51',\n",
" None,\n",
" None,\n",
" 'part0285.html#x9781440619236_EPUB-1',\n",
" 'part0286.html#x9781440619236_EPUB-2',\n",
" 'part0288.html#x9781440619236_EPUB-3',\n",
" 'part0289.html#x9781440619236_EPUB-4',\n",
" 'part0290.html#x9781440619236_EPUB-5',\n",
" 'part0291.html#x9781440619236_EPUB-6',\n",
" 'part0292.html#x9781440619236_EPUB-7',\n",
" 'part0293.html#x9781440619236_EPUB-8',\n",
" 'part0294.html#x9781440619236_EPUB-9',\n",
" 'part0295.html#x9781440619236_EPUB-10',\n",
" 'part0296.html#x9781440619236_EPUB-11',\n",
" 'part0297.html#x9781440619236_EPUB-12',\n",
" 'part0298.html#x9781440619236_EPUB-13',\n",
" 'part0299.html#x9781440619236_EPUB-14',\n",
" 'part0300.html#x9781440619236_EPUB-15',\n",
" 'part0301.html#x9781440619236_EPUB-16',\n",
" 'part0302.html#x9781440619236_EPUB-17',\n",
" 'part0303.html#x9781440619236_EPUB-18',\n",
" 'part0304.html#x9781440619236_EPUB-19',\n",
" 'part0305.html#x9781440619236_EPUB-20',\n",
" 'part0306.html#x9781440619236_EPUB-21',\n",
" 'part0307.html#x9781440619236_EPUB-22',\n",
" 'part0308.html#x9781440619236_EPUB-23',\n",
" 'part0309.html#x9781440619236_EPUB-24',\n",
" 'part0310.html#x9781440619236_EPUB-25',\n",
" 'part0311.html#x9781440619236_EPUB-26',\n",
" 'part0312.html#x9781440619236_EPUB-27',\n",
" 'part0313.html#x9781440619236_EPUB-28',\n",
" 'part0314.html#x9781440619236_EPUB-29',\n",
" 'part0315.html#x9781440619236_EPUB-30',\n",
" 'part0316.html#x9781440619236_EPUB-31',\n",
" 'part0317.html#x9781440619236_EPUB-32',\n",
" 'part0318.html#x9781440619236_EPUB-33',\n",
" 'part0319.html#x9781440619236_EPUB-34',\n",
" 'part0320.html#x9781440619236_EPUB-35',\n",
" 'part0321.html#x9781440619236_EPUB-36',\n",
" 'part0322.html#x9781440619236_EPUB-37',\n",
" 'part0323.html#x9781440619236_EPUB-38',\n",
" 'part0324.html#x9781440619236_EPUB-39',\n",
" 'part0325.html#x9781440619236_EPUB-40',\n",
" 'part0326.html#x9781440619236_EPUB-41',\n",
" 'part0327.html#x9781440619236_EPUB-42',\n",
" 'part0328.html#x9781440619236_EPUB-43',\n",
" 'part0329.html#x9781440619236_EPUB-44',\n",
" 'part0330.html#x9781440619236_EPUB-45',\n",
" 'part0331.html#x9781440619236_EPUB-46',\n",
" 'part0332.html#x9781440619236_EPUB-47',\n",
" 'part0333.html#x9781440619236_EPUB-48',\n",
" 'part0334.html#x9781440619236_EPUB-49',\n",
" 'part0335.html#x9781440619236_EPUB-50',\n",
" 'part0336.html#x9781440619236_EPUB-51',\n",
" 'part0337.html#x9781440619236_EPUB-52',\n",
" 'part0338.html#A2AU40-9d088d4ffd7e4135ab987c63f18099d5',\n",
" 'http://links.penguinrandomhouse.com/type/prhebooklanding/isbn/9780593333020/display/1',\n",
" 'http://links.penguinrandomhouse.com/type/prhebooklanding/isbn/9780593333020/display/2']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"href_list"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"we see that valid hrefs begin with either 'text' or 'part'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# start to end processing"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"output_doc = BeautifulSoup()\n",
"output_doc.append(output_doc.new_tag(\"html\"))\n",
"output_doc.html.append(output_doc.new_tag(\"body\"))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Create a new head tag\n",
"head_tag = output_doc.new_tag('head')\n",
"# Create a link tag for the stylesheet\n",
"link_tag = output_doc.new_tag('link', rel='stylesheet', type='text/css', href='style.css')\n",
"# Append the link tag to the head tag\n",
"head_tag.append(link_tag)\n",
"# Insert the head tag into the HTML document\n",
"if output_doc.head:\n",
" output_doc.head.insert_before(head_tag)\n",
"else:\n",
" output_doc.insert(0, head_tag)\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"for file in html_files:\n",
" with open(file, 'r') as html_file:\n",
" soup = BeautifulSoup(html_file, \"html.parser\")\n",
" body_content = soup.find('body')\n",
" # the trick to preserve the id is to hide the body in a div with the same id\n",
" div = output_doc.new_tag('div', id=body_content.get('id'))\n",
" div.append(body_content.extract())\n",
" output_doc.append(div)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# strip the .html file name and preserve the id string, prepend with # to tell it is a href\n",
"for link in output_doc.find_all('a'):\n",
" href = link.get('href')\n",
" if href and href.startswith('text'):\n",
" # Update the link to point to the correct section within the merged document\n",
" index = href.find('#')\n",
" link['href'] = f'{href[index:]}'\n",
"\n",
" if href and href.startswith('part'):\n",
" # Update the link to point to the correct section within the merged document\n",
" index = href.find('#')\n",
" link['href'] = f'{href[index:]}'"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"with open(\"output.html\", \"w\", encoding='utf-8') as file:\n",
" file.write(str(output_doc.prettify()))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}