Source code for textract.parsers.pdf_parser
try:
    from shlex import quote as _shell_quote  # Python 3.3+
except ImportError:  # Python 2
    from pipes import quote as _shell_quote

from ..shell import run
from ..exceptions import UnknownMethod, ShellError
def extract(filename, method='', **kwargs):
    """Extract text from pdf files using ``method``.

    :param filename: path of the PDF file to read.
    :param method: ``'pdftotext'``, ``'pdfminer'``, or ``''`` (the
        default) to try pdftotext first and fall back to pdfminer when
        pdftotext is not installed.
    :param kwargs: accepted for interface compatibility; not used here.
    :returns: the result of the selected ``extract_*`` helper.
    :raises UnknownMethod: if ``method`` is not one of the values above.
    :raises ShellError: if the pdftotext command fails and the pdfminer
        fallback does not apply.
    """
    if method == 'pdfminer':
        return extract_pdfminer(filename)
    if method != '' and method != 'pdftotext':
        raise UnknownMethod(method)

    try:
        return extract_pdftotext(filename)
    except ShellError as e:
        # Only fall back to pdfminer when the caller did not explicitly
        # ask for pdftotext and the failure is that pdftotext is missing.
        if method == '' and e.is_uninstalled():
            return extract_pdfminer(filename)
        # Bare ``raise`` keeps the original traceback intact.
        raise
def extract_pdftotext(filename):
    """Extract text from pdfs using the pdftotext command line utility.

    :param filename: path of the PDF file to convert.
    :returns: the first element (stdout) of the pair returned by ``run``.
    """
    # Quote the path so that spaces or shell metacharacters in it reach
    # pdftotext as a single argument. The '%s' formatting coerces
    # non-string paths the same way the original command template did.
    quoted = _shell_quote('%s' % (filename,))
    # The trailing ``-`` is passed to pdftotext as its output-file argument.
    stdout, _ = run('pdftotext %s -' % quoted)
    return stdout
def extract_pdfminer(filename):
    """Extract text from pdfs using pdfminer.

    Runs the ``pdf2txt.py`` command on ``filename``.

    :param filename: path of the PDF file to convert.
    :returns: the first element (stdout) of the pair returned by ``run``.
    """
    # Quote the path so that spaces or shell metacharacters in it reach
    # pdf2txt.py as a single argument. The '%s' formatting coerces
    # non-string paths the same way the original command template did.
    quoted = _shell_quote('%s' % (filename,))
    stdout, _ = run('pdf2txt.py %s' % quoted)
    return stdout