Source code for textract.parsers.html_parser
import re

from bs4 import BeautifulSoup, Comment

from .utils import BaseParser
class Parser(BaseParser):
    """Extract text from html file using beautifulsoup4. Filter text to
    only show the visible parts of the page. Inspiration from `here
    <http://stackoverflow.com/a/1983219/564709>`_.
    """

    # Text nodes whose direct parent has one of these names are never
    # rendered as page content. '[document]' is the name bs4 gives the
    # top-level soup object, so it covers strings that sit outside any
    # tag (e.g. the doctype declaration).
    _disallowed_parents = {
        'style',
        'script',
        '[document]',
        'head',
        'title',
    }

    def _visible(self, element):
        """Return True when ``element`` is text a reader would see.

        ``element`` is a string node produced by
        ``soup.find_all(text=True)``. It is considered invisible when it
        is an HTML comment or when its direct parent is one of
        ``_disallowed_parents``.
        """
        # bs4 represents ``<!-- ... -->`` as a Comment node whose string
        # value is only the inner text (no delimiters), so matching the
        # node against a '<!--.*-->' pattern never detects it; check the
        # node type instead.
        if isinstance(element, Comment):
            return False
        return element.parent.name not in self._disallowed_parents

    def extract(self, filename, **kwargs):
        """Return the visible text of the html file at ``filename``.

        The visible text fragments are joined with blank lines
        (``'\\n\\n'``). ``kwargs`` is accepted but not used here --
        presumably to match the signature expected by ``BaseParser``;
        confirm against the base class.
        """
        # Read raw bytes so BeautifulSoup can detect the document's own
        # encoding instead of relying on the platform default codec.
        with open(filename, 'rb') as stream:
            # Name the parser explicitly: without it bs4 picks whichever
            # parser happens to be installed (and warns about it), which
            # makes the output environment-dependent.
            soup = BeautifulSoup(stream, 'html.parser')

        # soup.get_text method is nice, but it also returns all the
        # embedded javascript which isn't terribly useful for this use
        # case. inspiration from http://stackoverflow.com/a/1983219/564709
        texts = [
            text for text in soup.find_all(text=True) if self._visible(text)
        ]
        return '\n\n'.join(texts)