Source code for textract.parsers.tesseract
"""
Process an image file using tesseract.
"""
import os
from .utils import ShellParser
class Parser(ShellParser):
    """Extract text from various image file formats using tesseract-ocr."""

    def extract(self, filename, **kwargs):
        """Run tesseract on ``filename`` and return the recognized text.

        Args:
            filename: Path of the image file to process.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The first item of the pair returned by ``self.run`` (presumably
            the command's standard output, i.e. the contents of the text
            file tesseract produced).
        """
        # Function-scope import keeps this block self-contained;
        # ``pipes.quote`` is the fallback for interpreters that predate
        # ``shlex.quote``.
        try:
            from shlex import quote
        except ImportError:
            from pipes import quote

        # Tesseract is pointed at a scratch output base and writes its
        # result to "<base>.txt", which is then echoed back with ``cat``.
        temp_filename = self.temp_filename()
        output_filename = temp_filename + '.txt'

        # Every path is shell-quoted so that spaces, quotes, ``$`` or
        # backticks in a file name cannot break or inject into the command.
        # ``'%s' % ...`` keeps accepting anything the original string
        # formatting accepted (e.g. path-like objects).
        command = 'tesseract {source} {base} > {devnull} && cat {output}'.format(
            source=quote('%s' % (filename,)),
            base=quote(temp_filename),
            devnull=quote(os.devnull),
            output=quote(output_filename),
        )

        try:
            stdout, _ = self.run(command)
        finally:
            # Clean up from Python rather than at the end of the ``&&``
            # chain, so the scratch files are removed even when tesseract
            # or ``cat`` fails.
            for path in (temp_filename, output_filename):
                try:
                    os.remove(path)
                except OSError:
                    # Best effort, like ``rm -f``: the file may never have
                    # been created or may already be gone.
                    pass
        return stdout