# Source code for textract.parsers

"""
Route the request to the appropriate parser based on file type.
"""

import os
import importlib

from .. import exceptions

# Alternate spellings of a file extension, keyed by the synonym and
# mapped to the canonical extension used when looking up a parser
EXTENSION_SYNONYMS = {".jpeg": ".jpg", ".htm": ".html"}

# Encoding that ``process`` falls back to when the caller does not pass
# one. It is defined at module level so that the process function and
# the command line interface share a single default
DEFAULT_ENCODING = "utf_8"


def process(filename, encoding=DEFAULT_ENCODING, **kwargs):
    """Extract the text of ``filename`` with the parser for its extension.

    This is the core function used for extracting text. The lower-cased
    file extension, after being mapped through ``EXTENSION_SYNONYMS``,
    selects a ``<ext>_parser`` module inside the ``textract.parsers``
    package; that module's ``Parser`` class performs the extraction.

    :param filename: path of the file to extract text from.
    :param encoding: encoding forwarded to the parser; the extracted
        text is returned as a byte-string encoded with it.
    :param kwargs: extra keyword arguments, forwarded unchanged to
        ``Parser.process``.
    :returns: the value returned by ``Parser.process`` (the extracted
        text as a byte-string encoded with ``encoding``).
    :raises exceptions.MissingFileError: if ``filename`` does not exist.
    :raises exceptions.ExtensionNotSupported: if ``filename`` has no
        extension, or no parser module exists for its extension.
    """
    # make sure the filename exists
    if not os.path.exists(filename):
        raise exceptions.MissingFileError(filename)

    # get the filename extension, which is something like .docx for
    # example, normalise its case and resolve synonyms (.jpeg -> .jpg)
    _, ext = os.path.splitext(filename)
    ext = ext.lower()
    ext = EXTENSION_SYNONYMS.get(ext, ext)

    # a file without an extension gives us nothing to route on. Bail out
    # here: otherwise the slicing below would turn '_parser' into
    # 'parser' and probe for (and try to import) an unrelated module
    if not ext:
        raise exceptions.ExtensionNotSupported(ext)

    # to avoid conflicts with packages that are installed globally
    # (e.g. python's json module), all extension parser modules have
    # the _parser suffix. ``rel_module`` keeps the leading dot so that
    # importlib treats it as a relative import; ``module_name`` is the
    # same name without the dot, used to locate the file on disk
    rel_module = ext + '_parser'
    module_name = rel_module[1:]

    # if this module name doesn't exist in this directory it isn't
    # currently supported
    this_dir = os.path.dirname(os.path.abspath(__file__))
    if not os.path.exists(os.path.join(this_dir, module_name + '.py')):
        raise exceptions.ExtensionNotSupported(ext)

    # do the extraction. This is a relative import so the name of the
    # package is necessary
    filetype_module = importlib.import_module(rel_module, 'textract.parsers')
    parser = filetype_module.Parser()
    return parser.process(filename, encoding, **kwargs)