Source code for textract.parsers.pdf_parser

try:
    from shlex import quote  # Python 3.3+
except ImportError:
    from pipes import quote  # Python 2 fallback

from ..shell import run
from ..exceptions import UnknownMethod, ShellError


def extract(filename, method='', **kwargs):
    """Extract text from pdf files using ``method``.

    :param filename: path of the pdf file; passed unchanged to the selected
        backend function.
    :param method: ``'pdftotext'``, ``'pdfminer'``, or ``''`` (the default),
        which tries pdftotext first and falls back to pdfminer when
        pdftotext is reported as not installed.
    :param kwargs: accepted for interface compatibility; not used here.
    :returns: the value returned by the selected ``extract_*`` function.
    :raises UnknownMethod: if ``method`` is not one of the values above.
    :raises ShellError: propagated from the pdftotext backend, except when
        the automatic fallback to pdfminer applies.
    """
    if method == '' or method == 'pdftotext':
        try:
            return extract_pdftotext(filename)
        except ShellError as e:
            # If pdftotext isn't installed and the pdftotext method wasn't
            # explicitly requested, gracefully fall back to pdfminer.
            if method == '' and e.is_uninstalled():
                return extract_pdfminer(filename)
            # Bare ``raise`` re-raises the same exception while keeping the
            # original traceback intact.
            raise
    elif method == 'pdfminer':
        return extract_pdfminer(filename)
    else:
        raise UnknownMethod(method)
def extract_pdftotext(filename):
    """Extract text from pdfs using the pdftotext command line utility.

    :param filename: path of the pdf file to convert.
    :returns: the first element (stdout) of the pair returned by ``run``;
        the second element (stderr) is discarded.
    """
    # Quote the filename so paths containing spaces or shell metacharacters
    # reach pdftotext as a single argument instead of being split or
    # interpreted.
    # NOTE(review): assumes ``run`` hands this string to a shell (or to
    # ``shlex.split``) -- confirm against ``..shell.run``.
    # The trailing ``-`` asks pdftotext to write the text to stdout.
    stdout, _stderr = run('pdftotext %s -' % quote(filename))
    return stdout
def extract_pdfminer(filename):
    """Extract text from pdfs using pdfminer.

    Invokes pdfminer's ``pdf2txt.py`` command line script on ``filename``.

    :param filename: path of the pdf file to convert.
    :returns: the first element (stdout) of the pair returned by ``run``;
        the second element (stderr) is discarded.
    """
    # Quote the filename so paths containing spaces or shell metacharacters
    # reach pdf2txt.py as a single argument instead of being split or
    # interpreted.
    # NOTE(review): assumes ``run`` hands this string to a shell (or to
    # ``shlex.split``) -- confirm against ``..shell.run``.
    stdout, _stderr = run('pdf2txt.py %s' % quote(filename))
    return stdout