import json
from .utils import BaseParser
class Parser(BaseParser):
    """Extract all of the string values of a json file (no keys as those
    are, in some sense, markup). This is useful for parsing content
    from mongodb dumps, for example.
    """

    # Text types on both Python 2 (``str`` and ``unicode``) and Python 3
    # (``str``). Used instead of the ``basestring`` builtin, which does not
    # exist on Python 3 and would raise NameError there.
    _STRING_TYPES = (str, type(u''))

    def get_text(self, deserialized_json):
        """Recursively get text from subcomponents of a deserialized json.

        Dict values are visited in sorted (alphabetical) key order so that
        the same document always produces the same text; the keys
        themselves are never emitted. List items are visited in their
        existing order. The text of every dict value / list item is
        followed by a single space. A string is returned unchanged, and
        any other value (numbers, booleans, ``None``, ...) contributes an
        empty string.
        """
        if isinstance(deserialized_json, dict):
            children = (
                deserialized_json[key] for key in sorted(deserialized_json)
            )
        elif isinstance(deserialized_json, list):
            children = deserialized_json
        elif isinstance(deserialized_json, self._STRING_TYPES):
            return deserialized_json
        else:
            return ''
        # One join instead of repeated ``+=`` keeps the exact same output
        # (including the trailing space after each element) without
        # re-copying the accumulated string on every iteration.
        return ''.join(self.get_text(child) + ' ' for child in children)