Source code for textract.parsers.utils

"""This module includes a bunch of convenient base classes that are
reused in many of the other parser modules.
"""

from __future__ import annotations

import errno
import os
import subprocess
import tempfile

import chardet

from textract import exceptions



[docs]
class BaseParser:
    """The :class:`.BaseParser` abstracts out some common functionality
    that is used across all document Parsers. In particular, it has
    the responsibility of handling all unicode and byte-encoding.

    Child classes implement :meth:`.BaseParser.extract` to pull raw
    text out of a file. :meth:`.BaseParser.process` then passes that
    result through :meth:`.BaseParser.decode` (bytes to unicode) and
    :meth:`.BaseParser.encode` (unicode to the requested output
    encoding).
    """


[docs]
    def extract(self, filename, **kwargs) -> bytes | str:
        """Extract the raw text of ``filename``.

        This is the hook that every concrete parser has to provide; the
        base implementation only raises :exc:`NotImplementedError`.
        Implementations may hand back either a byte string or unicode.
        """
        message = "must be overwritten by child classes"
        raise NotImplementedError(message)



[docs]
    def encode(self, text, encoding):
        """Return ``text`` as a byte string in the given ``encoding``.

        Code points that the target encoding cannot represent are
        silently dropped instead of raising an error.
        """
        encoded = text.encode(encoding, errors="ignore")
        return encoded



[docs]
    def process(self, filename, input_encoding, output_encoding="utf8", **kwargs):
        """Process ``filename`` and encode byte-string with ``encoding``. This
        method is called by :func:`textract.parsers.process` and wraps
        the :meth:`.BaseParser.extract` method in `a delicious unicode
        sandwich <http://nedbatchelder.com/text/unipain.html>`_.

        """
        # make a "unicode sandwich" to handle dealing with unknown
        # input byte strings and converting them to a predictable
        # output encoding
        # http://nedbatchelder.com/text/unipain/unipain.html#35
        byte_string = self.extract(filename, **kwargs)
        unicode_string = self.decode(byte_string, input_encoding)
        return self.encode(unicode_string, output_encoding)



[docs]
    def decode(self, text, input_encoding=None):
        """Return ``text`` as unicode, decoding byte strings when needed.

        Unicode input is returned untouched and empty input yields an
        empty string. Byte strings are decoded with ``input_encoding``
        when one is given; otherwise the encoding is guessed with the
        `chardet <https://github.com/chardet/chardet>`_ package, and
        undecodable bytes are replaced rather than raising.
        """
        # a subclass may already have produced unicode: pass it through
        if isinstance(text, str):
            return text

        # nothing to decode for empty input
        if not text:
            return ""

        # no explicit encoding: let chardet guess, but only trust a
        # confident guess and fall back to utf8 otherwise
        if not input_encoding:
            guess = chardet.detect(text)
            if guess["confidence"] > 0.80:
                codec = guess["encoding"]
            else:
                codec = "utf8"
            return text.decode(codec, errors="replace")

        # the caller told us which encoding to use
        return text.decode(input_encoding)





[docs]
class ShellParser(BaseParser):
    """The :class:`.ShellParser` extends the :class:`.BaseParser` to make
    it easy to run external programs from the command line with
    `Fabric <http://www.fabfile.org/>`_-like behavior.

    It adds :meth:`.ShellParser.run`, which executes a command and
    captures its output, and :meth:`.ShellParser.temp_filename`, which
    creates a temporary file and returns its path.
    """


[docs]
    def run(self, args):
        """Run the command ``args`` and return the subsequent ``stdout``
        and ``stderr`` as a tuple.

        ``args`` is handed unchanged to :class:`subprocess.Popen` (no
        shell is involved), so it is normally a sequence made of the
        program followed by its arguments.

        :param args: the program and its arguments, in any form
            accepted by :class:`subprocess.Popen`.
        :returns: a ``(stdout, stderr)`` tuple with the captured output
            of the command as byte strings.
        :raises textract.exceptions.ShellError: if the executable
            cannot be found (reported with exit code 127) or if the
            command exits with a non-zero return code.
        :raises OSError: for any other failure to start the process.
        """

        def command_line():
            # Only needed for error reporting, so it is built lazily.
            # ``Popen`` accepts str, bytes and path-like arguments;
            # ``os.fsdecode`` turns each of them into text, so building
            # the message can never mask the real error with a
            # ``TypeError``.
            if isinstance(args, (str, bytes, os.PathLike)):
                return os.fsdecode(args)
            return " ".join(os.fsdecode(arg) for arg in args)

        # run a subprocess and put the stdout and stderr on the pipe object
        try:
            pipe = subprocess.Popen(
                args,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError as e:
            if e.errno == errno.ENOENT:
                # File not found.
                # This is equivalent to getting exitcode 127 from sh
                raise exceptions.ShellError(
                    command_line(),
                    127,
                    b"",
                    b"",
                ) from e
            raise  # Reraise the last exception unmodified

        # pipe.wait() ends up hanging on large files. using
        # pipe.communicate appears to avoid this issue
        stdout, stderr = pipe.communicate()

        # if pipe is busted, raise an error (unlike Fabric)
        if pipe.returncode != 0:
            raise exceptions.ShellError(
                command_line(),
                pipe.returncode,
                stdout,
                stderr,
            )

        return stdout, stderr



[docs]
    def temp_filename(self):
        """Create an empty temporary file and return its path.

        The file is made with :func:`tempfile.mkstemp` and its
        descriptor is closed right away, so only the path is handed
        back. Nothing here removes the file afterwards.
        """
        # TODO: it would be nice to turn this into a context manager so
        # that the temporary file is removed even when an error occurs
        # or the program is terminated.
        descriptor, path = tempfile.mkstemp()
        os.close(descriptor)
        return path