# Source code for textract.parsers

"""
Route the request to the appropriate parser based on file type.
"""

import glob
import importlib
import re
from pathlib import Path

from textract import exceptions

# Suffix appended to a file extension to build the name of its parser
# module (``.docx`` -> ``.docx_parser``). The suffix keeps parser modules
# from clashing with globally installed packages (e.g. python's json
# module).
_FILENAME_SUFFIX = "_parser"

# General default encoding name.
DEFAULT_ENCODING = "utf_8"

# Default encoding of the text returned by the process function. It is
# declared at module level so that the process function and the command
# line interface share the same default.
DEFAULT_OUTPUT_ENCODING = "utf_8"

# Alternate spellings of a file extension, mapped to the canonical
# extension whose parser handles them. The empty-string key covers files
# that have no extension at all: they are treated as plain text.
# NOTE(review): ".tff" looks like it may be a misspelling of ".tif" /
# ".tiff"; it is kept as-is because callers may rely on it.
EXTENSION_SYNONYMS = {
    ".jpeg": ".jpg",
    ".tff": ".tiff",
    ".tif": ".tiff",
    ".htm": ".html",
    "": ".txt",
    ".log": ".txt",
    ".tab": ".tsv",
}



# [docs]
def process(
    filename,
    input_encoding=None,
    output_encoding=DEFAULT_OUTPUT_ENCODING,
    extension=None,
    **kwargs,
):
    """Extract the text of ``filename`` using the parser for its file type.

    This is the core entry point for text extraction. The file type is
    taken from ``extension`` when it is given, otherwise from the suffix
    of ``filename``. The matching ``<ext>_parser`` module is imported from
    ``textract.parsers`` and its ``Parser`` does the actual work.

    :param filename: path of the file to extract text from.
    :param input_encoding: passed through to the parser's ``process``.
    :param output_encoding: passed through to the parser's ``process``;
        the extracted text is documented as being returned as a
        byte-string in this encoding.
    :param extension: optional file type override, for files whose name
        carries no (or a misleading) extension. The leading ``.`` is
        optional and the value is case-insensitive.
    :param kwargs: extra options forwarded unchanged to the parser.
    :returns: whatever the selected parser's ``process`` method returns.
    :raises exceptions.MissingFileError: if ``filename`` does not exist.
    :raises exceptions.ExtensionNotSupported: if no parser module can be
        imported for the resolved extension.
    """
    source = Path(filename)
    if not source.exists():
        raise exceptions.MissingFileError(filename)

    # Resolve the file type: an explicit ``extension`` wins over the
    # suffix found in the file name.
    if extension:
        dotted = extension if extension.startswith(".") else "." + extension
        suffix = dotted.lower()
    else:
        suffix = source.suffix.lower()

    # Fold synonymous extensions (e.g. ``.jpeg``) into the canonical one.
    suffix = EXTENSION_SYNONYMS.get(suffix, suffix)

    # The leading dot of the extension makes this a relative import, so
    # the package name has to be supplied. A failed import means the file
    # type has no parser available.
    module_name = f"{suffix}{_FILENAME_SUFFIX}"
    try:
        parser_module = importlib.import_module(module_name, "textract.parsers")
    except ImportError as err:
        raise exceptions.ExtensionNotSupported(suffix) from err

    # Hand the actual extraction over to the file type's parser.
    return parser_module.Parser().process(
        filename, input_encoding, output_encoding, **kwargs
    )



def _get_available_extensions():
    """Get a list of available file extensions to make it easy for
    tab-completion and exception handling.

    Extensions are collected from two places: the ``*_parser.py`` files
    that sit next to this module, and the non-empty keys of
    ``EXTENSION_SYNONYMS``.

    :returns: a sorted list of strings holding every extension both with
        and without its leading ``.`` (e.g. ``"docx"`` and ``".docx"``).
    """
    extensions = []

    # from filenames
    parsers_dir = Path(__file__).parent
    # Match against the bare file name only. Globbing relative to the
    # directory (instead of embedding the absolute path in a glob pattern)
    # keeps discovery working when the install path itself contains glob
    # metacharacters such as "[", "]", "*" or "?".
    ext_re = re.compile(rf"(?P<ext>\w+){re.escape(_FILENAME_SUFFIX)}\.py")
    for parser_path in parsers_dir.glob(f"*{_FILENAME_SUFFIX}.py"):
        if ext_match := ext_re.match(parser_path.name):
            ext = ext_match.group("ext")
            extensions.extend((ext, "." + ext))

    # from relevant synonyms (don't use the '' synonym)
    for ext in EXTENSION_SYNONYMS:
        if ext:
            extensions.extend((ext, ext.replace(".", "", 1)))
    extensions.sort()
    return extensions