# Source code for textract.parsers.html_parser

import re

from bs4 import BeautifulSoup, Comment

# Names of parent nodes whose text children are never shown to a reader;
# `_visible` rejects any text node whose parent name appears here.
disallowed_parents = {
    '[document]',
    'head',
    'script',
    'style',
    'title',
}


def _visible(element):
    """Used to filter text elements that have invisible text on the page.

    ``element`` is a text node such as those returned by
    ``soup.find_all(text=True)``. It is reported as invisible when its
    parent's name is listed in ``disallowed_parents`` or when it is an HTML
    comment; otherwise it is reported as visible.

    :param element: text node to test.
    :returns: ``True`` if the text should be kept, ``False`` otherwise.
    """
    parent = element.parent
    # A node without a parent has no parent name to check; skip this test
    # instead of raising AttributeError.
    if parent is not None and parent.name in disallowed_parents:
        return False
    # bs4 keeps only the comment body in a Comment node (the ``<!--`` and
    # ``-->`` delimiters are not part of the string), so the regex below
    # cannot recognise parsed comments; the type check does.
    if isinstance(element, Comment):
        return False
    # Fallback for strings that still carry the delimiters. DOTALL lets the
    # pattern match comments that span several lines.
    if re.match(u'<!--.*-->', element, re.DOTALL):
        return False
    return True


def extract(filename, **kwargs):
    """Extract text from html file using beautifulsoup4.

    Filter text to only show the visible parts of the page. Inspiration
    from `here <http://stackoverflow.com/a/1983219/564709>`_.

    :param filename: path of the HTML file to read.
    :param kwargs: accepted for interface compatibility; not used here.
    :returns: the visible text nodes, separated by blank lines and encoded
        as UTF-8.
    """
    # Read raw bytes so BeautifulSoup can detect the document's encoding
    # itself rather than relying on the platform's default text encoding.
    with open(filename, 'rb') as stream:
        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed, so results vary between environments.
        soup = BeautifulSoup(stream, 'html.parser')

    # soup.get_text method is nice, but it also returns all the
    # embedded javascript which isn't terribly useful for this use
    # case. inspiration from http://stackoverflow.com/a/1983219/564709
    texts = soup.find_all(text=True)
    visible_texts = [text for text in texts if _visible(text)]

    # Join as text and encode once. Joining already-encoded pieces with a
    # str separator raises TypeError on Python 3; this form yields the same
    # bytes on Python 2.
    return u'\n\n'.join(visible_texts).encode('utf-8')