# Source code for textract.parsers.odt_parser

import sys
import zipfile
import xml.dom.minidom
import StringIO


def extract(filename, **kwargs):
    """Extract text from open document files.

    :param filename: path of the OpenDocument text (.odt) file to read.
    :param kwargs: accepted for interface compatibility; not used here.
    :returns: the document text encoded as ASCII, with every character
        that cannot be represented in ASCII replaced by ``?``.
    """
    # An .odt file is a zip archive, so it has to be opened in binary mode.
    # The parser reads everything it needs inside its constructor, which
    # lets the handle be closed as soon as the object has been built.
    with open(filename, 'rb') as stream:
        odt = OpenDocumentTextFile(stream)
    return odt.toString().encode('ascii', 'replace')


class OpenDocumentTextFile:
    """Plain-text view of the ``content.xml`` stored in an .odt archive.

    Inspiration from
    https://github.com/odoo/odoo/blob/master/addons/document/odt2txt.py
    """

    def __init__(self, filepath):
        """Parse ``content.xml`` out of the given archive.

        :param filepath: anything ``zipfile.ZipFile`` accepts -- a path or a
            seekable binary file-like object.
        """
        # The archive is only needed long enough to pull out content.xml;
        # the context manager releases it even if reading or parsing fails.
        with zipfile.ZipFile(filepath) as archive:
            raw_xml = archive.read("content.xml")
        self.content = xml.dom.minidom.parseString(raw_xml)

    def toString(self):
        """ Converts the document to a string.

        Every ``text:p`` element is emitted first, then every ``text:h``,
        then every ``text:list``; each one is followed by a newline.

        NOTE: the output is grouped by tag name rather than following
        document order, and because ``getElementsByTagName`` searches all
        descendants, a paragraph nested inside a ``text:list`` contributes
        its text both as a ``text:p`` and as part of the enclosing list.
        """
        return u"".join(
            self.textToString(element) + u"\n"
            for tag in ("text:p", "text:h", "text:list")
            for element in self.content.getElementsByTagName(tag)
        )

    def textToString(self, element):
        """Return the concatenated text found beneath ``element``.

        Text nodes contribute their value, child elements are descended
        into recursively, and every other node type (comments, processing
        instructions, ...) is skipped.
        """
        pieces = []
        for node in element.childNodes:
            if node.nodeType == xml.dom.Node.TEXT_NODE:
                pieces.append(node.nodeValue)
            elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
                pieces.append(self.textToString(node))
        return u"".join(pieces)