Source code for textract.parsers.html_parser

import re

from bs4 import BeautifulSoup
from bs4 import Comment

# Names of parent nodes whose text children are never treated as visible
# page text by ``_visible``. '[document]' is the name bs4 gives to the
# root ``BeautifulSoup`` object itself.
disallowed_parents = {
    '[document]',
    'head',
    'title',
    'script',
    'style',
}


def _visible(element):
    """Return True when a text node should be kept as visible page text.

    Used as the predicate that filters the text nodes returned by
    ``soup.find_all(text=True)``.

    :param element: a bs4 text node (``NavigableString`` or a subclass).
    :returns: ``False`` if the node's parent is named in
        ``disallowed_parents`` or the node is an HTML comment; ``True``
        otherwise.
    """
    if element.parent.name in disallowed_parents:
        return False
    # bs4 stores a comment's inner text *without* the ``<!--`` / ``-->``
    # delimiters, so a comment has to be recognised by its node type; a
    # regex on the string value never sees the markers.
    if isinstance(element, Comment):
        return False
    # Legacy check, kept so that text which literally looks like a
    # one-line comment is still dropped exactly as before.
    if re.match(u'<!--.*-->', element):
        return False
    return True


def extract(filename, **kwargs):
    """Extract the visible text from an html file using beautifulsoup4.

    Text nodes are filtered with ``_visible`` so that only the visible
    parts of the page are returned. Inspiration from `here
    <http://stackoverflow.com/a/1983219/564709>`_.

    :param filename: path of the html file to read.
    :param kwargs: accepted but not used by this function.
    :returns: the kept text nodes, each UTF-8 encoded and joined by two
        newlines, as a single byte string.
    """
    # Hand bs4 the raw bytes so it can work out the document encoding
    # itself instead of depending on the platform's default text decoding.
    with open(filename, 'rb') as stream:
        soup = BeautifulSoup(stream)

    # soup.get_text method is nice, but it also returns all the
    # embedded javascript which isn't terribly useful for this use
    # case. inspiration from http://stackoverflow.com/a/1983219/564709
    nodes = soup.find_all(text=True)
    encoded = [node.encode('utf-8') for node in nodes if _visible(node)]

    # The pieces are bytes, so the separator has to be bytes as well; a
    # text separator raises TypeError on Python 3.
    return b'\n\n'.join(encoded)