Source code for textract.parsers.tesseract

"""
Process an image file using tesseract.
"""
import os

from .utils import ShellParser


class Parser(ShellParser):
    """Extract text from various image file formats using tesseract-ocr"""

    def extract(self, filename, **kwargs):
        """Run tesseract on ``filename`` and return what it recognized.

        Tesseract is told to write to a temporary output base; the
        resulting ``<base>.txt`` is echoed with ``cat`` so the text comes
        back on the command's stdout, and the temporary files are removed
        afterwards whether or not the command succeeded.

        :param filename: path of the image to process. It is shell-quoted,
            so spaces, quotes, ``$`` and backticks in the name are passed
            to tesseract literally.
        :param kwargs: accepted for interface compatibility; not used here.
        :returns: the first element of the pair returned by ``self.run``
            (the command's stdout).
        """
        # Imported locally so the module's import block stays untouched;
        # ``pipes.quote`` is the pre-3.3 spelling of ``shlex.quote``.
        try:
            from shlex import quote
        except ImportError:
            from pipes import quote

        # Tesseract can't output to console directly so you must first
        # create a dummy file to write to, read, and then delete
        temp_filename = self.temp_filename()
        output_filename = temp_filename + '.txt'

        # Every path is quoted individually; %-formatting (rather than
        # str.format) keeps unicode filenames working on Python 2.
        command = 'tesseract %s %s > %s && cat %s' % (
            quote(filename),
            quote(temp_filename),
            quote(os.devnull),
            quote(output_filename),
        )

        try:
            stdout, _ = self.run(command)
        finally:
            # Clean up even when tesseract or cat fails. Mirrors the
            # best-effort semantics of ``rm -f``: a file that is already
            # missing (or cannot be removed) is not an error.
            for path in (temp_filename, output_filename):
                try:
                    os.remove(path)
                except OSError:
                    pass
        return stdout