Source code for textract.parsers.epub_parser

from ebooklib import epub, ITEM_DOCUMENT
from bs4 import BeautifulSoup

from .utils import BaseParser


class Parser(BaseParser):
    """Extract text from epub using python epub library.
    """

    # Tag names whose text is collected from every spine document.
    _TEXT_TAGS = ['title', 'p', 'div', 'h1', 'h2', 'h3', 'h4']

    def extract(self, filename, **kwargs):
        """Return the text of the epub file at ``filename``.

        The book's spine is walked in order; each referenced item is parsed
        as markup with BeautifulSoup's ``lxml`` parser, and the text of every
        element named in ``_TEXT_TAGS`` is emitted followed by a newline.

        :param filename: path of the epub file, handed to ``epub.read_epub``.
        :param kwargs: accepted for interface compatibility; not used here.
        :returns: the collected text as a single string (``''`` when no
            matching element is found).
        """
        book = epub.read_epub(filename)

        # Gather fragments in a list and join once at the end rather than
        # growing a string with repeated concatenation.
        parts = []
        for item_id, _ in book.spine:
            item = book.get_item_with_id(item_id)
            if item is None:
                # The spine references an id that has no matching item in
                # the book; skip it rather than fail on ``item.content``.
                continue

            soup = BeautifulSoup(item.content, 'lxml')
            # NOTE(review): find_all yields nested matches as well (e.g. a
            # <p> inside a <div>), so the same text can be emitted more than
            # once. Kept as-is to preserve the existing output.
            for child in soup.find_all(self._TEXT_TAGS):
                parts.append(child.text + '\n')

        return ''.join(parts)