# Process epub HTML files into a single HTML document

Goal: I want to merge the html files from an epub into a single html document. I also want working links, so I need to ensure that the hrefs and the ids of the relevant tags are defined so that they resolve within the merged document.

In [1]:
import os
import glob
from bs4 import BeautifulSoup

# investigate the ids and hrefs of a single html document

I have already unzipped an epub document into the folder `epub` in the same directory

In [2]:
# collect all html files in outer and inner directories
directory = './epub'
inner_directory = './epub/text'

# glob returns paths in a filesystem-dependent order, so sort explicitly to get
# the same document order on every machine and on every run
# the ordering depends heavily on what you have
# NOTE(review): assumes the outer files are wanted in reverse alphabetical order
# (the original simply reversed the glob result) - adjust for your epub
outer_html_files = sorted(glob.glob(os.path.join(directory, '*.xhtml')), reverse=True)
inner_html_files = sorted(glob.glob(os.path.join(inner_directory, '*.html')))

# outer documents come first, followed by the inner text documents
html_files = outer_html_files + inner_html_files

In [3]:
# parse one of the collected documents so its structure can be inspected
sample_path = html_files[2]
with open(sample_path, mode='r', encoding='utf-8') as sample_file:
    soup = BeautifulSoup(sample_file, features='html.parser')

In [4]:
# list every tag in the sample document that carries an id attribute
soup.find_all(attrs={'id': True})

[<body class="calibre" id="0-9d088d4ffd7e4135ab987c63f18099d5">
 <div class="titlehead">Frank Herbert's Dune Saga Collection: Books 1 - 6</div>
 <div class="titletext">Dune</div>
 <div class="titletext">Dune Messiah</div>
 <div class="titletext">Children of Dune</div>
 <div class="titletext">God Emperor of Dune</div>
 <div class="titletext">Heretics of Dune</div>
 <div class="titletext">Chapterhouse: Dune</div>
 <div class="titleauthor">Frank Herbert</div>
 </body>]

In [5]:
# read the id attribute off the <body> tag
soup.body.get('id')

'0-9d088d4ffd7e4135ab987c63f18099d5'

In [6]:
# start an empty output document with an <html><body> skeleton
# the parser is named explicitly (the same one used to read the input files) so
# the result does not depend on which parsers happen to be installed
output_doc = BeautifulSoup('', 'html.parser')
output_doc.append(output_doc.new_tag("html"))
output_doc.html.append(output_doc.new_tag("body"))

Problem: when the bodies are simply appended, all the body tags get consolidated into one large outer body tag, so the ids disappear.

The solution is to create a new div for each html file, then append the body to the div and then append the div into the output document

In [7]:
# read 2 files
for path in html_files[3:4+1]:
    # open the file of the current iteration; a fixed index here would merge
    # the same document twice
    with open(path, 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
    body_content = soup.find('body')
    # the trick to preserve the body tag is to hide the body in a div
    div = output_doc.new_tag('div', id=(body_content.get('id')))
    # extract() detaches the body from its own document before it is moved
    div.append(body_content.extract())
    output_doc.append(div)



In [8]:
# show the indented markup of the document built so far
pretty_markup = output_doc.prettify()
print(pretty_markup)

<html>
 <body>
 </body>
</html>
<div id="UGI0-9d088d4ffd7e4135ab987c63f18099d5">
 <body class="calibre" id="UGI0-9d088d4ffd7e4135ab987c63f18099d5">
  <h1 class="contents-head" id="calibre_pb_0">
   Table of Contents
  </h1>
  <p class="contents-fl">
   <i class="calibre1">
    <a class="calibre2" href="kindle:embed:0001?mime=image/jpg">
     Cover
    </a>
   </i>
  </p>
  <p class="contents-fl">
   <i class="calibre1">
    <a class="calibre2" href="part0000.html#0-9d088d4ffd7e4135ab987c63f18099d5">
     Title Page
    </a>
   </i>
  </p>
  <p class="contents-fl">
  </p>
  <div class="contents-fl">
   <a class="calibre2" href="part0004.html#3Q280-9d088d4ffd7e4135ab987c63f18099d5">
    Dune
   </a>
  </div>
  <div class="contents-fl">
   <a class="calibre2" href="part0069.html#21PMQ0-9d088d4ffd7e4135ab987c63f18099d5">
    Dune Messiah
   </a>
  </div>
  <div class="contents-fl">
   <a class="calibre2" href="part0101.html#30A8Q0-9d088d4ffd7e4135ab987c63f18099d5">
    Children of Dune
   

Notice how the `body` is preserved from each html file

Because the wrapping div carries the original id, hrefs that refer to that id can reach this section of the document.

However, notice that the hrefs have the name "partX.html#id".

Let's try to find all the href variants

In [9]:
# start again from an empty output document with an <html><body> skeleton
# the parser is named explicitly (the same one used to read the input files) so
# the result does not depend on which parsers happen to be installed
output_doc = BeautifulSoup('', 'html.parser')
output_doc.append(output_doc.new_tag("html"))
output_doc.html.append(output_doc.new_tag("body"))

In [10]:
# merge every collected file into the output document
for path in html_files:
    # read explicitly as UTF-8 rather than with the platform default encoding
    with open(path, 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file, "html.parser")
    body_content = soup.find('body')
    if body_content is None:
        # a file without a <body> has nothing to merge
        continue
    # the trick to preserve the id is to hide the body in a div with the same id
    div = output_doc.new_tag('div')
    body_id = body_content.get('id')
    if body_id is not None:
        # only copy the id when the body has one, so no valueless id attribute is written
        div['id'] = body_id
    div.append(body_content.extract())
    # append inside <body> so the merged content does not end up after </html>
    output_doc.body.append(div)

In [11]:
# gather the href of every anchor in the merged document (None when an anchor has no href)
href_list = []
for anchor in output_doc.find_all('a'):
    href_list.append(anchor.get('href'))

In [12]:
# display the collected hrefs to see which forms occur
href_list

['text/part0000.html#0-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0001.html#UGI0-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0002.html#x9781101157879_EPUB',
 'text/part0003_split_000.html#x9781101157879_EPUB-1',
 'text/part0004.html#3Q280-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0005.html#4OIQ0-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0006.html#5N3C0-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0007.html#6LJU0-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0008.html#7K4G0-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0009.html#8IL20-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0010.html#9H5K0-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0011.html#AFM60-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0012.html#BE6O0-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0013.html#CCNA0-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0014.html#DB7S0-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0015.html#E9OE0-9d088d4ffd7e4135ab987c63f18099d5',
 'text/part0016.html#F8900-9d088d4ffd7e

We see that the hrefs pointing into the book's html files begin with either 'text' or 'part'.

# start to end processing

In [13]:
# start the final output document with an <html><body> skeleton
# the parser is named explicitly (the same one used to read the input files) so
# the result does not depend on which parsers happen to be installed
output_doc = BeautifulSoup('', 'html.parser')
output_doc.append(output_doc.new_tag("html"))
output_doc.html.append(output_doc.new_tag("body"))

In [14]:
# Reuse the existing head tag if there is one, so re-running this cell does not
# stack duplicate heads; otherwise create it as the first child of <html>
# (inserting at position 0 of the document itself would place it before <html>)
head_tag = output_doc.head
if head_tag is None:
    head_tag = output_doc.new_tag('head')
    output_doc.html.insert(0, head_tag)

# Declare the encoding: the merged document is written to disk as UTF-8
if head_tag.find('meta', charset=True) is None:
    head_tag.insert(0, output_doc.new_tag('meta', charset='utf-8'))

# Create a link tag for the stylesheet and append it to the head tag (only once)
if head_tag.find('link', href='style.css') is None:
    link_tag = output_doc.new_tag('link', rel='stylesheet', type='text/css', href='style.css')
    head_tag.append(link_tag)


In [15]:
# merge every collected file into the output document
for path in html_files:
    # read explicitly as UTF-8 rather than with the platform default encoding
    with open(path, 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file, "html.parser")
    body_content = soup.find('body')
    if body_content is None:
        # a file without a <body> has nothing to merge
        continue
    # the trick to preserve the id is to hide the body in a div with the same id
    div = output_doc.new_tag('div')
    body_id = body_content.get('id')
    if body_id is not None:
        # only copy the id when the body has one, so no valueless id attribute is written
        div['id'] = body_id
    div.append(body_content.extract())
    # append inside <body> so the merged content does not end up after </html>
    output_doc.body.append(div)

In [16]:
# hrefs that point at one of the merged html files start with one of these prefixes
INTERNAL_PREFIXES = ('text', 'part')


def to_fragment_href(href):
    """Turn a 'file.html#id' style href into the in-document form '#id'.

    Args:
        href: the href value of an anchor.

    Returns:
        '#' followed by the fragment when href starts with one of
        INTERNAL_PREFIXES and contains a '#'. Any other href is returned
        unchanged; this includes internal hrefs without a fragment, because
        there is no id to jump to and slicing from a missing '#' would leave
        only the last character of the href.
    """
    if not href or not href.startswith(INTERNAL_PREFIXES):
        return href
    _, hash_mark, fragment = href.partition('#')
    if not hash_mark:
        return href
    return f'#{fragment}'


# strip the .html file name and preserve the id string, prepend with # to tell it is a href
for link in output_doc.find_all('a', href=True):
    # Update the link to point to the correct section within the merged document
    link['href'] = to_fragment_href(link['href'])

In [17]:
# write the merged document as UTF-8
# str(output_doc) is used instead of prettify(): prettify() puts every tag and
# string on its own indented line, which adds whitespace around inline elements
# and can change how the text renders
with open("output.html", "w", encoding='utf-8') as output_file:
    output_file.write(str(output_doc))