# Source code for textract.parsers.epub_parser

from ebooklib import epub, ITEM_DOCUMENT
from bs4 import BeautifulSoup

from .utils import BaseParser


class Parser(BaseParser):
    """Extract text from epub files using the ebooklib library."""

    def extract(self, filename, **kwargs):
        """Return the text of every document item in an epub file.

        The book is opened with ``epub.read_epub``. Each item whose type is
        ``ITEM_DOCUMENT`` is parsed with BeautifulSoup, and the text of all
        such items is concatenated, in the order ``book.get_items()`` yields
        them, with no separator added between items.

        :param filename: epub file to read; passed straight to
            ``epub.read_epub``.
        :param kwargs: accepted for interface compatibility; not used here.
        :returns: the concatenated text, or an empty string when the book
            contains no document items.
        """
        book = epub.read_epub(filename)
        # Name the parser explicitly: leaving it out makes BeautifulSoup pick
        # whichever parser happens to be installed and emit a warning.
        # NOTE(review): "lxml" is expected to be what BeautifulSoup was
        # already choosing, since ebooklib itself depends on lxml -- confirm.
        return "".join(
            BeautifulSoup(item.content, "lxml").text
            for item in book.get_items()
            if item.get_type() == ITEM_DOCUMENT
        )