Source code for textract.parsers.html_parser

import re

from bs4 import BeautifulSoup, Comment

from .utils import BaseParser


class Parser(BaseParser):
    """Extract text from html file using beautifulsoup4. Filter text to
    only show the visible parts of the page. Inspiration from `here
    <http://stackoverflow.com/a/1983219/564709>`_.
    """

    # Names of parent nodes whose text is not part of the rendered page
    # body. '[document]' is the name bs4 gives the root document object,
    # which is the parent of top-level nodes such as the doctype.
    _disallowed_parents = {
        'style',
        'script',
        '[document]',
        'head',
        'title',
    }

    def _visible(self, element):
        """Used to filter text elements that have invisible text on the page.

        :param element: a text node yielded by ``soup.find_all(text=True)``.
        :returns: ``False`` when the node's parent is listed in
            ``_disallowed_parents`` or the node is an HTML comment,
            ``True`` otherwise.
        """
        if element.parent.name in self._disallowed_parents:
            return False
        # bs4 stores a comment's text without the ``<!--``/``-->``
        # delimiters, so matching the delimiters with a regex never fires;
        # check the node type instead.
        return not isinstance(element, Comment)

    def extract(self, filename, **kwargs):
        """Return the visible text of the html file at ``filename``.

        :param filename: path of the html file to read.
        :param kwargs: accepted for interface compatibility; not used by
            this method.
        :returns: the visible text nodes joined by blank lines.
        """
        # Hand BeautifulSoup raw bytes so it detects the document's
        # encoding itself rather than relying on the locale's default
        # text encoding, which may not match the file.
        with open(filename, 'rb') as stream:
            soup = BeautifulSoup(stream)

        # soup.get_text method is nice, but it also returns all the
        # embedded javascript which isn't terribly useful for this use
        # case. inspiration from http://stackoverflow.com/a/1983219/564709
        texts = soup.find_all(text=True)
        return '\n\n'.join(text for text in texts if self._visible(text))