# Source code for textract.parsers.pdf_parser

from ..exceptions import UnknownMethod, ShellError

from .utils import ShellParser


class Parser(ShellParser):
    """Extract text from pdf files using either the ``pdftotext`` method
    (default) or the ``pdfminer`` method.
    """

    def extract(self, filename, method='', **kwargs):
        """Extract text from ``filename`` using the requested ``method``.

        :param filename: path of the pdf file; forwarded unchanged to the
            selected ``extract_*`` method.
        :param method: ``''`` (default) or ``'pdftotext'`` to use
            ``pdftotext``; ``'pdfminer'`` to use ``pdf2txt.py``.
        :param kwargs: accepted for interface compatibility; not used by
            this method.
        :returns: the value returned by the selected ``extract_*`` method.
        :raises UnknownMethod: if ``method`` is none of the values above.
        :raises ShellError: re-raised when ``pdftotext`` fails and the
            pdfminer fallback does not apply.
        """
        if method == '' or method == 'pdftotext':
            try:
                return self.extract_pdftotext(filename)
            except ShellError as ex:
                # If pdftotext isn't installed and the pdftotext method
                # wasn't explicitly requested, gracefully fall back to
                # pdfminer instead.
                if method == '' and ex.is_uninstalled():
                    return self.extract_pdfminer(filename)
                # Bare ``raise`` re-raises the active exception with its
                # original traceback intact.
                raise

        elif method == 'pdfminer':
            return self.extract_pdfminer(filename)
        else:
            raise UnknownMethod(method)

    def extract_pdftotext(self, filename):
        """Extract text from pdfs using the pdftotext command line utility.

        Returns the first element of the pair returned by ``self.run``.
        """
        # The trailing "-" is passed to pdftotext as its output argument.
        # NOTE(review): filename is placed inside double quotes in a command
        # string; if ``self.run`` hands this to a shell, a filename
        # containing '"', '$' or backticks would break the command --
        # confirm against ShellParser.run.
        stdout, _ = self.run('pdftotext "%s" -' % (filename,))
        return stdout

    def extract_pdfminer(self, filename):
        """Extract text from pdfs using pdfminer (the ``pdf2txt.py`` script).

        Returns the first element of the pair returned by ``self.run``.
        """
        # NOTE(review): same quoting caveat as extract_pdftotext -- confirm
        # how ShellParser.run executes the command string.
        stdout, _ = self.run('pdf2txt.py "%s"' % (filename,))
        return stdout