Source code for textract.parsers.json_parser

import json


def extract(filename, **kwargs):
    """Extract all of the string values of a json file (no keys as those
    are, in some sense, markup). This is useful for parsing content
    from mongodb dumps, for example.

    :param filename: path of the JSON file to read.
    :param kwargs: accepted for call-signature compatibility; not used
        by this function.
    :returns: the string values found in the document, as produced by
        ``get_text`` (each container element is followed by one space).
    :raises IOError: if the file cannot be opened (``OSError`` on Python 3).
    :raises ValueError: if the file does not contain valid JSON.
    """
    # Use a context manager so the file handle is closed even when
    # json.load raises on malformed input.
    with open(filename, 'r') as f:
        deserialized_json = json.load(f)
    return get_text(deserialized_json)


# ``basestring`` only exists on Python 2 (where it covers both ``str`` and
# ``unicode``); on Python 3 every text value decoded by ``json`` is a ``str``.
try:
    _STRING_TYPES = (basestring,)
except NameError:
    _STRING_TYPES = (str,)


def get_text(deserialized_json):
    """Recursively get text from subcomponents of a deserialized json. To
    enforce the same order on the documents, make sure to read keys of
    deserialized_json in a consistent (alphabetical) order.

    :param deserialized_json: a value as returned by ``json.load`` -- a
        dict, a list, a string, or any other scalar.
    :returns: for a string, the string itself; for a dict or list, the
        text of every child, each followed by a single space (dict
        children are visited in sorted key order); for anything else
        (numbers, booleans, ``None``), the empty string.
    """
    if isinstance(deserialized_json, dict):
        # Sorted keys make the output independent of dict ordering.
        return ''.join(
            get_text(deserialized_json[key]) + ' '
            for key in sorted(deserialized_json)
        )

    if isinstance(deserialized_json, list):
        return ''.join(get_text(item) + ' ' for item in deserialized_json)

    if isinstance(deserialized_json, _STRING_TYPES):
        return deserialized_json

    # Non-text leaves contribute no text.
    return ''