Source code for textract.parsers.utils

"""This module includes a bunch of convenient base classes that are
reused in many of the other parser modules.
"""

from __future__ import annotations

import errno
import os
import subprocess
import tempfile

import chardet

from textract import exceptions


[docs] class BaseParser: """The :class:`.BaseParser` abstracts out some common functionality that is used across all document Parsers. In particular, it has the responsibility of handling all unicode and byte-encoding. """
[docs] def extract(self, filename, **kwargs) -> bytes | str: """This method must be overwritten by child classes to extract raw text from a filename. This method can return either a byte-encoded string or unicode. """ raise NotImplementedError("must be overwritten by child classes")
[docs] def encode(self, text, encoding): """Encode the ``text`` in ``encoding`` byte-encoding. This ignores code points that can't be encoded in byte-strings. """ return text.encode(encoding, "ignore")
[docs] def process(self, filename, input_encoding, output_encoding="utf8", **kwargs): """Process ``filename`` and encode byte-string with ``encoding``. This method is called by :func:`textract.parsers.process` and wraps the :meth:`.BaseParser.extract` method in `a delicious unicode sandwich <http://nedbatchelder.com/text/unipain.html>`_. """ # make a "unicode sandwich" to handle dealing with unknown # input byte strings and converting them to a predictable # output encoding # http://nedbatchelder.com/text/unipain/unipain.html#35 byte_string = self.extract(filename, **kwargs) unicode_string = self.decode(byte_string, input_encoding) return self.encode(unicode_string, output_encoding)
[docs] def decode(self, text, input_encoding=None): """Decode ``text`` using the `chardet <https://github.com/chardet/chardet>`_ package. """ # only decode byte strings into unicode if it hasn't already # been done by a subclass if isinstance(text, str): return text # empty text? nothing to decode if not text: return "" # use the provided encoding if input_encoding: return text.decode(input_encoding) # use chardet to automatically detect the encoding text if no encoding is provided result = chardet.detect(text) encoding = result["encoding"] if result["confidence"] > 0.80 else "utf8" return text.decode(encoding, errors="replace")
class ShellParser(BaseParser):
    """The :class:`.ShellParser` extends the :class:`.BaseParser` to make
    it easy to run external programs from the command line with `Fabric
    <http://www.fabfile.org/>`_-like behavior.
    """

    def run(self, args):
        """Run the command given by ``args`` and return its ``stdout`` and
        ``stderr`` as a tuple.

        If the executable cannot be found, or the command exits with a
        non-zero status, this raises a
        :exc:`textract.exceptions.ShellError`.
        """
        # launch the subprocess with both output streams captured
        try:
            process = subprocess.Popen(
                args,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError as error:
            if error.errno != errno.ENOENT:
                # anything other than "file not found" propagates unmodified
                raise
            # File not found.
            # This is equivalent to getting exitcode 127 from sh
            raise exceptions.ShellError(
                " ".join(args),
                127,
                b"",
                b"",
            )

        # pipe.wait() ends up hanging on large files. using
        # pipe.communicate appears to avoid this issue
        out, err = process.communicate()
        exit_code = process.returncode

        if exit_code == 0:
            return out, err

        # a failed command is an error (unlike Fabric)
        raise exceptions.ShellError(
            " ".join(args),
            exit_code,
            out,
            err,
        )

    def temp_filename(self):
        """Create an empty temporary file and return its path.

        The file is not removed by this method.
        """
        # TODO: it would be nice to get this to behave more like a
        # context so we can make sure these temporary files are
        # removed, regardless of whether an error occurs or the
        # program is terminated.
        descriptor, path = tempfile.mkstemp()
        # only the name is needed; release the OS-level handle right away
        os.close(descriptor)
        return path