Source code for textract.parsers.epub_parser
from ebooklib import epub, ITEM_DOCUMENT
from bs4 import BeautifulSoup
from .utils import BaseParser
class Parser(BaseParser):
    """Extract text from epub using python epub library
    """

    def extract(self, filename, **kwargs):
        """Return the concatenated text of every document item in an EPUB.

        :param filename: path of the EPUB file; passed straight to
            ``epub.read_epub``.
        :param kwargs: accepted but not used by this method.
        :returns: the ``.text`` of each item whose type is
            ``ITEM_DOCUMENT``, joined with no separator in the order
            ``book.get_items()`` yields them. An empty string when the
            book has no such items.
        """
        book = epub.read_epub(filename)
        # NOTE(review): no parser is named in the BeautifulSoup call, so
        # bs4 picks whichever parser is installed and the extracted text
        # may vary between environments -- consider pinning one.
        # Joining once avoids rebuilding the result string per item.
        return "".join(
            BeautifulSoup(item.content).text
            for item in book.get_items()
            if item.get_type() == ITEM_DOCUMENT
        )