Source code for textract.parsers.pptx_parser

import pptx


def _text_frame_of(shape):
    """Return the text frame of ``shape``, or ``None`` if it has none.

    Current python-pptx releases spell the attributes ``has_text_frame`` /
    ``text_frame``; older releases used ``has_textframe`` / ``textframe``.
    Both spellings are probed (current one first) so the parser works with
    either library version.
    """
    for flag, attr in (('has_text_frame', 'text_frame'),
                       ('has_textframe', 'textframe')):
        if hasattr(shape, flag):
            return getattr(shape, attr) if getattr(shape, flag) else None
    # Neither spelling is present: treat the shape as carrying no text.
    return None


def extract(filename, **kwargs):
    """Extract text from a pptx file using python-pptx.

    Walks every slide, every shape that has a text frame, every paragraph
    and every run, collecting the text of each run.

    :param filename: passed straight to ``pptx.Presentation``.
    :param kwargs: accepted but not used by this function.
    :returns: the run texts joined by a blank line (``'\\n\\n'``), encoded
        as UTF-8 bytes; empty bytes when no run is found.
    """
    presentation = pptx.Presentation(filename)
    text_runs = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            text_frame = _text_frame_of(shape)
            if text_frame is None:
                continue
            for paragraph in text_frame.paragraphs:
                text_runs.extend(run.text for run in paragraph.runs)
    # Join as text and encode once: joining already-encoded bytes with a
    # str separator raises TypeError on Python 3.
    return u'\n\n'.join(text_runs).encode('utf-8')