Source code for textract.parsers.epub_parser
from ebooklib import epub, ITEM_DOCUMENT
from bs4 import BeautifulSoup
from .utils import BaseParser
class Parser(BaseParser):
    """Extract text from epub using python epub library
    """

    # Tag names whose text content is collected from every spine document.
    _TEXT_TAGS = ['title', 'p', 'div', 'h1', 'h2', 'h3', 'h4']

    def extract(self, filename, **kwargs):
        """Return the text of the EPUB at ``filename``.

        The book's spine is walked in order; each referenced item is parsed
        with BeautifulSoup (``lxml`` parser) and the text of every tag listed
        in ``_TEXT_TAGS`` is emitted, followed by a newline.

        ``**kwargs`` is accepted but not used by this method.

        Returns a single string; it is empty when nothing matched.
        """
        book = epub.read_epub(filename)
        pieces = []
        for item_id, _ in book.spine:
            item = book.get_item_with_id(item_id)
            if item is None:
                # The spine references an id that has no matching item in
                # the book; skip it instead of failing on ``item.content``.
                continue
            soup = BeautifulSoup(item.content, 'lxml')
            # NOTE(review): find_all also returns nested matches, so text in
            # a <p> inside a <div> is emitted for both tags. Kept as-is to
            # preserve the existing output.
            pieces.extend(
                tag.text + '\n' for tag in soup.find_all(self._TEXT_TAGS)
            )
        # Join once at the end rather than growing a string per tag.
        return ''.join(pieces)