Source code for textract.parsers.json_parser

import json

from .utils import BaseParser


try:
    # Python 2: ``basestring`` is the common base of ``str`` and ``unicode``.
    _STRING_TYPES = basestring  # noqa: F821
except NameError:
    # Python 3: ``basestring`` was removed; ``str`` is the only text type.
    _STRING_TYPES = str


class Parser(BaseParser):
    """Extract all of the string values of a json file (no keys as those
    are, in some sense, markup). This is useful for parsing content
    from mongodb dumps, for example.
    """

    def extract(self, filename, **kwargs):
        """Load ``filename`` as JSON and return its string values as text.

        :param filename: path of the JSON file to read.
        :param kwargs: accepted for interface compatibility; not used here.
        :returns: the result of :meth:`get_text` on the deserialized
            document.
        """
        with open(filename, 'r') as raw:
            deserialized_json = json.load(raw)
        return self.get_text(deserialized_json)

    def get_text(self, deserialized_json):
        """Recursively get text from subcomponents of a deserialized json.

        To enforce the same order on the documents, make sure to read
        keys of deserialized_json in a consistent (alphabetical) order.

        :param deserialized_json: a value as produced by ``json.load``.
        :returns: for a dict or list, the text of every element, each
            followed by a single space (dict values are visited in sorted
            key order); for a string, the string itself; for any other
            value, the empty string.
        """
        if isinstance(deserialized_json, dict):
            # Sorted keys make the output independent of dict ordering.
            return ''.join(
                self.get_text(deserialized_json[key]) + ' '
                for key in sorted(deserialized_json)
            )
        if isinstance(deserialized_json, list):
            return ''.join(
                self.get_text(item) + ' ' for item in deserialized_json
            )
        if isinstance(deserialized_json, _STRING_TYPES):
            return deserialized_json
        # Numbers, booleans and null carry no text of their own.
        return ''