# Source code for textract.parsers.odt_parser

import zipfile
import xml.dom.minidom
import StringIO

from .utils import BaseParser

class Parser(BaseParser):
    """Extract text from open document files.

    The document is read as a zip archive whose ``content.xml`` member is
    parsed into a DOM; the text of selected elements is then concatenated.
    """

    def extract(self, filename, **kwargs):
        """Return the text found in the document at ``filename``.

        The archive's ``content.xml`` member is parsed with
        ``xml.dom.minidom`` and stored on ``self.content``; the result of
        :meth:`to_string` on that DOM is returned.

        :param filename: path of the document to read.
        :param kwargs: accepted but not used by this parser.
        :returns: the extracted text as a unicode string.
        :raises zipfile.BadZipfile: if the file is not a valid zip archive.
        :raises KeyError: if the archive has no ``content.xml`` member.
        :raises xml.parsers.expat.ExpatError: if ``content.xml`` is not
            well-formed XML.
        """
        # NOTE: the original comment here credited an external source of
        # inspiration, but its URL is missing from this copy of the file.
        #
        # Let zipfile open the path itself: the archive is then read in
        # binary mode and is closed deterministically by the ``with`` block.
        with zipfile.ZipFile(filename) as archive:
            raw_xml = archive.read("content.xml")
        # Parse the member's *bytes* (not its name) into a DOM tree.
        self.content = xml.dom.minidom.parseString(raw_xml)
        return self.to_string()

    def to_string(self):
        """Convert the parsed document (``self.content``) to a string.

        Every ``text:p`` element is emitted first, then every ``text:h``,
        then every ``text:list``, each followed by a newline.

        Because ``getElementsByTagName`` returns all matching descendants
        and :meth:`text_to_string` recurses into children, text nested in a
        matched element (e.g. a ``text:p`` inside a ``text:list``) appears
        once per matching ancestor. This is kept as-is for backward
        compatibility.

        :returns: the concatenated text as a unicode string (empty if no
            element matches).
        """
        lines = []
        for tag_name in ("text:p", "text:h", "text:list"):
            for element in self.content.getElementsByTagName(tag_name):
                lines.append(self.text_to_string(element) + u"\n")
        # Join once instead of growing a string with ``+=`` in the loop.
        return u"".join(lines)

    def text_to_string(self, element):
        """Return the concatenated text of ``element`` and its descendants.

        Text nodes contribute their value, element nodes are processed
        recursively, and any other node type (comments, processing
        instructions, ...) is ignored.

        :param element: a DOM node exposing ``childNodes``.
        :returns: the collected text as a unicode string.
        """
        parts = []
        for node in element.childNodes:
            if node.nodeType == xml.dom.Node.TEXT_NODE:
                parts.append(node.nodeValue)
            elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
                parts.append(self.text_to_string(node))
        return u"".join(parts)