Source code for textract.parsers.pdf_parser
from ..exceptions import UnknownMethod, ShellError
from .utils import ShellParser
class Parser(ShellParser):
    """Extract text from pdf files using either the ``pdftotext`` method
    (default) or the ``pdfminer`` method.
    """

    def extract(self, filename, method='', **kwargs):
        """Extract the text of ``filename`` using the requested ``method``.

        ``method`` selects the backend:

        * ``''`` (default) -- try ``pdftotext``; if that fails with a
          ``ShellError`` reporting that the tool is not installed, fall
          back to ``pdfminer``.
        * ``'pdftotext'`` -- use ``pdftotext`` only; any ``ShellError``
          propagates to the caller.
        * ``'pdfminer'`` -- use ``pdfminer`` only.

        Any other value raises ``UnknownMethod``. Extra keyword arguments
        are accepted but not used by this method.
        """
        if method == 'pdfminer':
            return self.extract_pdfminer(filename)
        if method not in ('', 'pdftotext'):
            raise UnknownMethod(method)

        try:
            return self.extract_pdftotext(filename)
        except ShellError as ex:
            # If pdftotext isn't installed and the pdftotext method
            # wasn't explicitly requested, gracefully fall back to
            # pdfminer instead.
            if method == '' and ex.is_uninstalled():
                return self.extract_pdfminer(filename)
            # Bare ``raise`` re-raises the active exception without
            # touching its traceback.
            raise

    def extract_pdftotext(self, filename):
        """Extract text from pdfs using the pdftotext command line utility.

        Returns the first element of the pair returned by ``self.run``
        (the captured standard output); the second element is discarded.
        """
        # NOTE(review): ``filename`` is interpolated between double quotes.
        # If ``self.run`` hands this string to a shell, names containing
        # ``"``, ``$`` or backticks are not handled safely -- confirm
        # against ShellParser.run before accepting untrusted file names.
        command = 'pdftotext "%(filename)s" -' % {'filename': filename}
        stdout, _ = self.run(command)
        return stdout

    def extract_pdfminer(self, filename):
        """Extract text from pdfs using pdfminer's ``pdf2txt.py`` script.

        Returns the first element of the pair returned by ``self.run``
        (the captured standard output); the second element is discarded.
        """
        # NOTE(review): same quoting caveat as ``extract_pdftotext`` --
        # the file name is only wrapped in double quotes, not escaped.
        command = 'pdf2txt.py "%(filename)s"' % {'filename': filename}
        stdout, _ = self.run(command)
        return stdout