# Source code for textract.parsers.epub_parser

import posixpath
import zipfile

from bs4 import BeautifulSoup

from .utils import BaseParser



# [docs] (documentation-link marker left over from the rendered source page)
class Parser(BaseParser):
    """Extract text from epub.

    The archive is read as a zip file: ``META-INF/container.xml`` names the
    OPF package document(s); each OPF ``<spine>`` lists ``itemref`` elements
    whose ``idref`` points at a ``<manifest>`` ``item``; that item's ``href``
    is the content file. Text is collected from those files in spine order.
    """

    def extract(self, filename, **kwargs):
        """Return the text content of the EPUB at ``filename``.

        Args:
            filename: Path (or file-like object) accepted by
                ``zipfile.ZipFile``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            str: The stripped text of every non-empty ``title``, ``p`` and
            ``h1``-``h4`` element of each spine document whose archive name
            ends with ``"html"``, one per line, each terminated by ``"\\n"``.

        Raises:
            zipfile.BadZipFile: If ``filename`` is not a valid zip archive.
            KeyError: If ``META-INF/container.xml`` or a referenced OPF file
                is missing from the archive.
        """
        html_content_tags = ["title", "p", "h1", "h2", "h3", "h4"]
        lines = []
        # The context manager guarantees the archive is closed even when
        # parsing fails part-way through.
        with zipfile.ZipFile(filename) as book:
            for text_name in self.__epub_sections(book):
                # A falsy name means a manifest href had no matching archive
                # member; skip it instead of failing on ``None.endswith``.
                if not text_name or not text_name.endswith("html"):
                    continue
                # BeautifulSoup reads the whole handle during construction,
                # so the member can be closed right after parsing.
                with book.open(text_name) as section:
                    soup = BeautifulSoup(section, features="lxml")
                for child in soup.find_all(html_content_tags):
                    inner_text = child.text.strip() if child.text else ""
                    if inner_text:
                        lines.append(inner_text)
        return "".join(line + "\n" for line in lines)

    def __epub_sections(self, book):
        """Return the archive names of the spine documents, in spine order."""
        opf_paths = self.__get_opf_paths(book)
        return self.__get_item_paths(book, opf_paths)

    def __get_opf_paths(self, book):
        """Return the ``full-path`` of every ``rootfile`` in container.xml.

        Returns an empty list when the container has no ``rootfiles``
        element; ``rootfile`` entries without a ``full-path`` are skipped.
        """
        with book.open("META-INF/container.xml") as meta_inf:
            meta_soup = BeautifulSoup(meta_inf, features="xml")
        if not meta_soup.rootfiles:
            return []
        return [
            f.get("full-path")  # type: ignore[attr-defined]
            for f in meta_soup.rootfiles.find_all("rootfile")
            if f.get("full-path")  # type: ignore[attr-defined]
        ]

    def __get_item_paths(self, book, opf_paths):
        """Resolve each OPF spine entry to the name of an archive member.

        Args:
            book: The open ``zipfile.ZipFile``.
            opf_paths: Archive names of the OPF package documents.

        Returns:
            list[str]: Archive names in spine order. Spine entries whose
            manifest item or file cannot be found are omitted.
        """
        item_paths = []
        for opf_path in opf_paths:
            with book.open(opf_path) as opf_file:
                opf_soup = BeautifulSoup(opf_file, "xml")
            if not opf_soup.spine:
                continue
            for epub_item in opf_soup.spine.find_all("itemref"):
                idref = epub_item.get("idref")  # type: ignore[attr-defined]
                if not idref:
                    continue
                item = self.__get_item(opf_soup, idref)
                if not item:
                    continue
                href = item.get("href")  # type: ignore[attr-defined]
                if not href:
                    continue
                full_path = self.__get_full_item_path(book, href, opf_path)
                # Never hand ``None`` to callers: they treat every entry as
                # a string.
                if full_path is not None:
                    item_paths.append(full_path)
        return item_paths

    def __get_item(self, opf_soup, item_id):
        """Return the first manifest ``item`` whose ``id`` equals ``item_id``.

        Returns ``None`` when there is no manifest or no matching item.
        """
        if not opf_soup.manifest:
            return None
        for item in opf_soup.manifest.find_all("item"):
            if item.get("id") == item_id:  # type: ignore[attr-defined]
                return item
        return None

    def __get_full_item_path(self, book, partial_path, opf_path=None):
        """Map a manifest ``href`` to the name of a member of ``book``.

        Candidates are tried from most to least specific:

        1. ``partial_path`` resolved against the directory of ``opf_path``
           (only when ``opf_path`` is given), which also handles ``../``.
        2. ``partial_path`` taken as a full archive name.
        3. The first member ending with ``"/" + partial_path`` so that
           ``ch1.html`` does not match ``notch1.html``.
        4. The first member that merely ends with ``partial_path``.

        Args:
            book: The open ``zipfile.ZipFile``.
            partial_path: The ``href`` value from the OPF manifest.
            opf_path: Archive name of the OPF file that declared the href.

        Returns:
            str | None: The matching archive name, or ``None`` if no member
            matches.
        """
        names = book.namelist()

        if opf_path is not None:
            resolved = posixpath.normpath(
                posixpath.join(posixpath.dirname(opf_path), partial_path)
            )
            if resolved in names:
                return resolved

        if partial_path in names:
            return partial_path

        boundary_suffix = "/" + partial_path
        for filename in names:
            if filename.endswith(boundary_suffix):
                return filename

        # Last resort: plain suffix match.
        for filename in names:
            if filename.endswith(partial_path):
                return filename
        return None