tesseract_utils.py (1440B)
# ...
#
# Copyright (c) 2019 Philippe Mathieu-Daudé <f4bug@amsat.org>
#
# This work is licensed under the terms of the GNU GPL, version 2 or
# later. See the COPYING file in the top-level directory.

import re
import logging

from avocado.utils import process
from avocado.utils.path import find_command, CmdNotFoundError

# Matches the version banner, e.g. "tesseract 4.1.1" or "tesseract v5.0.0",
# and captures the major version number.
_TESSERACT_VERSION_RE = re.compile(r'tesseract\s+v?(\d+)')


def _tesseract_major_version(banner):
    """Return the major version found in *banner* as an int, or None."""
    match = _TESSERACT_VERSION_RE.search(banner)
    if match is None:
        return None
    # The capture group only matches digits, so int() cannot fail here.
    return int(match.group(1))


def tesseract_available(expected_version):
    """Tell whether tesseract is installed with the expected major version.

    :param expected_version: major version number (int) that the installed
                             tesseract binary must report
    :returns: True if the ``tesseract`` command is found and the major
              version it reports equals *expected_version*; False if the
              command is missing, no version banner can be parsed, or the
              major version differs
    """
    try:
        find_command('tesseract')
    except CmdNotFoundError:
        return False
    res = process.run('tesseract --version')
    # Look at stdout first and fall back to stderr when stdout carries no
    # version banner.
    for output in (res.stdout_text, res.stderr_text):
        major = _tesseract_major_version(output)
        if major is not None:
            return major == expected_version
    return False


def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
    """Run tesseract on an image and return the recognized text lines.

    :param image_path: path of the image file handed to tesseract
    :param tesseract_args: extra command line arguments inserted before
                           the image path
    :param tesseract_version: major tesseract version in use; when it is 4,
                              ``--oem 1`` is appended to the arguments
    :returns: list of the non-blank output lines, each stripped of
              surrounding whitespace
    """
    console_logger = logging.getLogger('tesseract')
    console_logger.debug(image_path)
    if tesseract_version == 4:
        tesseract_args += ' --oem 1'
    # "stdout" as the output base makes tesseract print the text instead of
    # writing it to a file, so it can be read from the command result.
    proc = process.run("tesseract {} {} stdout".format(tesseract_args,
                                                       image_path))
    lines = []
    for line in proc.stdout_text.split('\n'):
        sline = line.strip()
        if sline:
            console_logger.debug(sline)
            lines.append(sline)
    return lines