Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package python-pytesseract for openSUSE:Factory checked in at 2021-03-02 12:34:31 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/python-pytesseract (Old) and /work/SRC/openSUSE:Factory/.python-pytesseract.new.2378 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-pytesseract" Tue Mar 2 12:34:31 2021 rev:10 rq:875927 version:0.3.7 Changes: -------- --- /work/SRC/openSUSE:Factory/python-pytesseract/python-pytesseract.changes 2020-05-19 14:44:14.127525669 +0200 +++ /work/SRC/openSUSE:Factory/.python-pytesseract.new.2378/python-pytesseract.changes 2021-03-02 12:45:54.184378670 +0100 @@ -1,0 +2,7 @@ +Mon Mar 1 02:53:00 UTC 2021 - John Vandenberg <[email protected]> + +- Update license to Apache-2.0 +- Update to v0.3.7 + * no upstream changelog + +------------------------------------------------------------------- Old: ---- v0.3.4.tar.gz New: ---- v0.3.7.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-pytesseract.spec ++++++ --- /var/tmp/diff_new_pack.dz5U2v/_old 2021-03-02 12:45:56.776380911 +0100 +++ /var/tmp/diff_new_pack.dz5U2v/_new 2021-03-02 12:45:56.780380915 +0100 @@ -1,7 +1,7 @@ # # spec file for package python-pytesseract # -# Copyright (c) 2020 SUSE LLC +# Copyright (c) 2021 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -18,10 +18,10 @@ %{?!python_module:%define python_module() python-%{**} python3-%{**}} Name: python-pytesseract -Version: 0.3.4 +Version: 0.3.7 Release: 0 Summary: Python wrapper for Google's Tesseract-OCR -License: GPL-3.0-only +License: Apache-2.0 Group: Development/Languages/Python URL: https://github.com/madmaze/python-tesseract # https://github.com/madmaze/pytesseract/issues/262 ++++++ v0.3.4.tar.gz -> v0.3.7.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pytesseract-0.3.4/.github/workflows/ci.yaml new/pytesseract-0.3.7/.github/workflows/ci.yaml --- old/pytesseract-0.3.4/.github/workflows/ci.yaml 1970-01-01 01:00:00.000000000 +0100 +++ new/pytesseract-0.3.7/.github/workflows/ci.yaml 2020-11-20 08:37:05.000000000 +0100 @@ 
-0,0 +1,65 @@ +name: CI + +on: + push: + branches: + - master + - '*.x' + pull_request: + branches: + - master + - '*.x' + +jobs: + tests: + if: "!contains(github.event.head_commit.message, '[skip ci]')" + name: ${{ matrix.name }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: true + matrix: + include: + - {name: '3.9', python: '3.9', os: ubuntu-20.04, tox: py39} + - {name: '3.8', python: '3.8', os: ubuntu-18.04, tox: py38} + - {name: '3.7', python: '3.7', os: ubuntu-18.04, tox: py37} + - {name: '3.6', python: '3.6', os: ubuntu-16.04, tox: py36} + steps: + - uses: actions/checkout@v2 + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python }} + + - name: Update pip + run: python -m pip install --upgrade pip wheel setuptools + + - name: Get pip cache dir + id: pip-cache + run: echo "::set-output name=dir::$(pip cache dir)" + + - name: Cache pip + uses: actions/cache@v2 + with: + path: ${{ steps.pip-cache.outputs.dir }} + key: pip-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('setup.py') }} + restore-keys: pip-${{ runner.os }}-${{ matrix.python }}- + + - name: Set Python version + # See https://pre-commit.com/#github-actions-example + run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV + + - name: Install tesseract + run: sudo apt-get install -y tesseract-ocr tesseract-ocr-fra + + - name: Print tesseract version + run: echo $(tesseract --version) + + - name: Install tox + run: python -m pip install --upgrade tox + + - name: Run tox + run: tox -e ${{ matrix.tox }} + env: + PY_COLORS: 1 + TOX_TESTENV_PASSENV: PY_COLORS diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pytesseract-0.3.4/.pre-commit-config.yaml new/pytesseract-0.3.7/.pre-commit-config.yaml --- old/pytesseract-0.3.4/.pre-commit-config.yaml 2020-04-18 16:27:16.000000000 +0200 +++ new/pytesseract-0.3.7/.pre-commit-config.yaml 2020-11-20 08:37:05.000000000 +0100 @@ -1,7 +1,12 @@ exclude: 
^(tests/data/) repos: +- repo: https://github.com/psf/black + rev: 20.8b1 + hooks: + - id: black + args: [-S, --line-length=79, --safe, --quiet] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.4.0 + rev: v3.3.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -11,28 +16,24 @@ - id: name-tests-test - id: requirements-txt-fixer - id: double-quote-string-fixer +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.8.4 + hooks: + - id: flake8 +- repo: https://github.com/asottile/reorder_python_imports + rev: v2.3.6 + hooks: + - id: reorder-python-imports + args: [--py3-plus] - repo: https://github.com/asottile/pyupgrade - rev: v1.25.1 + rev: v2.7.3 hooks: - id: pyupgrade - repo: https://github.com/asottile/add-trailing-comma - rev: v1.5.0 + rev: v2.0.1 hooks: - id: add-trailing-comma -- repo: https://github.com/timothycrosley/isort - rev: 4.3.21 - hooks: - - id: isort - repo: https://github.com/pre-commit/mirrors-autopep8 - rev: v1.4.4 + rev: v1.5.4 hooks: - id: autopep8 -- repo: https://github.com/psf/black - rev: 19.10b0 - hooks: - - id: black - args: [-S, --line-length=79, --safe, --quiet] -- repo: https://gitlab.com/pycqa/flake8 - rev: 3.7.9 - hooks: - - id: flake8 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pytesseract-0.3.4/.travis.yml new/pytesseract-0.3.7/.travis.yml --- old/pytesseract-0.3.4/.travis.yml 2020-04-18 16:27:16.000000000 +0200 +++ new/pytesseract-0.3.7/.travis.yml 1970-01-01 01:00:00.000000000 +0100 @@ -1,53 +0,0 @@ -language: python - -addons: - apt: - config: - retries: true - -cache: - apt: true - pip: true - -matrix: - include: - - os: linux - dist: xenial - python: 2.7 - env: TOXENV=py27 - name: "2.7 Xenial" - - os: linux - dist: xenial - python: 3.5 - env: TOXENV=py35 - name: "3.5 Xenial" - - os: linux - dist: xenial - python: 3.6 - env: TOXENV=py36 - name: "3.6 Xenial" - - os: linux - dist: xenial - python: 3.7 - env: TOXENV=py37 - name: "3.7 Xenial" - - os: linux - 
dist: bionic - python: 3.8 - env: TOXENV=py38-pre-commit - name: "3.8 Bionic" - -before_install: - - sudo apt-get install -y tesseract-ocr - - sudo apt-get install -y tesseract-ocr-fra - - tesseract --version - - tesseract --list-langs - -install: - pip install tox - -script: - tox - -notifications: - email: false diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pytesseract-0.3.4/README.rst new/pytesseract-0.3.7/README.rst --- old/pytesseract-0.3.4/README.rst 2020-04-18 16:27:16.000000000 +0200 +++ new/pytesseract-0.3.7/README.rst 2020-11-20 08:37:05.000000000 +0100 @@ -17,9 +17,13 @@ :target: https://anaconda.org/conda-forge/pytesseract :alt: Conda release -.. image:: https://travis-ci.org/madmaze/pytesseract.svg - :target: https://travis-ci.org/madmaze/pytesseract - :alt: Travis build status +.. image:: https://results.pre-commit.ci/badge/github/madmaze/pytesseract/master.svg + :target: https://results.pre-commit.ci/latest/github/madmaze/pytesseract/master + :alt: Pre-commit CI status + +.. image:: https://github.com/madmaze/pytesseract/workflows/CI/badge.svg?branch=master + :target: https://github.com/madmaze/pytesseract/actions?query=workflow%3ACI + :alt: CI workflow status Python-tesseract is an optical character recognition (OCR) tool for python. That is, it will recognize and "read" the text embedded in images. @@ -54,6 +58,9 @@ # Simple image to string print(pytesseract.image_to_string(Image.open('test.png'))) + # List of available languages + print(pytesseract.get_languages(config='')) + # French text image to string print(pytesseract.image_to_string(Image.open('test-european.jpg'), lang='fra')) @@ -89,6 +96,9 @@ # Get HOCR output hocr = pytesseract.image_to_pdf_or_hocr('test.png', extension='hocr') + # Get ALTO XML output + xml = pytesseract.image_to_alto_xml('test.png') + Support for OpenCV image/NumPy array objects .. 
code-block:: python @@ -125,9 +135,11 @@ **Functions** +* **get_languages** Returns all currently supported languages by Tesseract OCR. + * **get_tesseract_version** Returns the Tesseract version installed in the system. -* **image_to_string** Returns the result of a Tesseract OCR run on the image to string +* **image_to_string** Returns unmodified output as string from Tesseract OCR processing * **image_to_boxes** Returns result containing recognized characters and their box boundaries @@ -135,6 +147,8 @@ * **image_to_osd** Returns result containing information about orientation and script detection. +* **image_to_alto_xml** Returns result in the form of Tesseract's ALTO XML format. + * **run_and_get_output** Returns the raw output from Tesseract OCR. Gives a bit more control over the parameters that are sent to tesseract. **Parameters** @@ -166,7 +180,7 @@ Prerequisites: -- Python-tesseract requires Python 2.7 or Python 3.5+ +- Python-tesseract requires Python 2.7 or Python 3.6+ - You will need the Python Imaging Library (PIL) (or the `Pillow <https://pypi.org/project/Pillow/>`_ fork). Under Debian/Ubuntu, this is the package **python-imaging** or **python3-imaging**. 
- Install `Google Tesseract OCR <https://github.com/tesseract-ocr/tesseract>`_ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pytesseract-0.3.4/setup.cfg new/pytesseract-0.3.7/setup.cfg --- old/pytesseract-0.3.4/setup.cfg 2020-04-18 16:27:16.000000000 +0200 +++ new/pytesseract-0.3.7/setup.cfg 2020-11-20 08:37:05.000000000 +0100 @@ -1,8 +1,2 @@ [bdist_wheel] universal = True - -[isort] -indent = ' ' -line_length = 79 -multi_line_output = 3 -include_trailing_comma = True diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pytesseract-0.3.4/setup.py new/pytesseract-0.3.7/setup.py --- old/pytesseract-0.3.4/setup.py 2020-04-18 16:27:16.000000000 +0200 +++ new/pytesseract-0.3.7/setup.py 2020-11-20 08:37:05.000000000 +0100 @@ -14,7 +14,7 @@ setup( name=PACKAGE_NAME, - version='0.3.4', + version='0.3.7', author='Samuel Hoffstaetter', author_email='[email protected]', maintainer='Matthias Lee', @@ -36,12 +36,10 @@ classifiers=[ 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', ], ) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pytesseract-0.3.4/src/__init__.py new/pytesseract-0.3.7/src/__init__.py --- old/pytesseract-0.3.4/src/__init__.py 2020-04-18 16:27:16.000000000 +0200 +++ new/pytesseract-0.3.7/src/__init__.py 2020-11-20 08:37:05.000000000 +0100 @@ -1,13 +1,15 @@ -from .pytesseract import ( # noqa: F401 - Output, - TesseractError, - TesseractNotFoundError, - TSVNotSupported, - get_tesseract_version, - image_to_boxes, - image_to_data, - image_to_osd, - image_to_pdf_or_hocr, - 
image_to_string, - run_and_get_output, -) +# flake8: noqa: F401 +from .pytesseract import ALTONotSupported +from .pytesseract import get_languages +from .pytesseract import get_tesseract_version +from .pytesseract import image_to_alto_xml +from .pytesseract import image_to_boxes +from .pytesseract import image_to_data +from .pytesseract import image_to_osd +from .pytesseract import image_to_pdf_or_hocr +from .pytesseract import image_to_string +from .pytesseract import Output +from .pytesseract import run_and_get_output +from .pytesseract import TesseractError +from .pytesseract import TesseractNotFoundError +from .pytesseract import TSVNotSupported diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pytesseract-0.3.4/src/pytesseract.py new/pytesseract-0.3.7/src/pytesseract.py --- old/pytesseract-0.3.4/src/pytesseract.py 2020-04-18 16:27:16.000000000 +0200 +++ new/pytesseract-0.3.7/src/pytesseract.py 2020-11-20 08:37:05.000000000 +0100 @@ -1,5 +1,5 @@ #!/usr/bin/env python - +import re import shlex import string import subprocess @@ -11,11 +11,17 @@ from functools import wraps from glob import iglob from io import BytesIO -from os import environ, extsep, remove -from os.path import normcase, normpath, realpath +from os import environ +from os import extsep +from os import linesep +from os import remove +from os.path import normcase +from os.path import normpath +from os.path import realpath from pkgutil import find_loader from tempfile import NamedTemporaryFile from threading import Timer +from time import sleep try: from PIL import Image @@ -33,6 +39,8 @@ if pandas_installed: import pandas as pd +DEFAULT_ENCODING = 'utf-8' +LANG_PATTERN = re.compile('^[a-z_]+$') RGB_MODE = 'RGB' SUPPORTED_FORMATS = { 'JPEG', @@ -78,7 +86,8 @@ class TesseractNotFoundError(EnvironmentError): def __init__(self): super(TesseractNotFoundError, self).__init__( - tesseract_cmd + " is not installed or it's not in your PATH", + f"{tesseract_cmd} is 
not installed or it's not in your PATH." + + ' See README file for more information.', ) @@ -89,9 +98,24 @@ ) +class ALTONotSupported(EnvironmentError): + def __init__(self): + super(ALTONotSupported, self).__init__( + 'ALTO output not supported. Tesseract >= 4.1.0 required', + ) + + def kill(process, code): - process.kill() - process.returncode = code + process.terminate() + try: + process.wait(1) + except TypeError: # python2 Popen.wait(1) fallback + sleep(1) + except Exception: # python3 subprocess.TimeoutExpired + pass + finally: + process.kill() + process.returncode = code @contextmanager @@ -130,7 +154,7 @@ def get_errors(error_string): return u' '.join( - line for line in error_string.decode('utf-8').splitlines() + line for line in error_string.decode(DEFAULT_ENCODING).splitlines() ).strip() @@ -155,19 +179,13 @@ if extension not in SUPPORTED_FORMATS: raise TypeError('Unsupported image format/type') - if not image.mode.startswith(RGB_MODE): - image = image.convert(RGB_MODE) - if 'A' in image.getbands(): # discard and replace the alpha channel with white background background = Image.new(RGB_MODE, image.size, (255, 255, 255)) - background.paste(image, (0, 0), image) + background.paste(image, (0, 0), image.getchannel('A')) image = background image.format = extension - if 'format' not in image.info: - image.info['format'] = extension - return image, extension @@ -178,10 +196,9 @@ if isinstance(image, str): yield f.name, realpath(normpath(normcase(image))) return - image, extension = prepare(image) input_file_name = f.name + extsep + extension - image.save(input_file_name, **image.info) + image.save(input_file_name, format=image.format) yield f.name, input_file_name finally: cleanup(f.name) @@ -231,7 +248,7 @@ if config: cmd_args += shlex.split(config) - if extension and extension not in {'box', 'osd', 'tsv'}: + if extension and extension not in {'box', 'osd', 'tsv', 'xml'}: cmd_args.append(extension) try: @@ -272,12 +289,12 @@ with open(filename, 'rb') as 
output_file: if return_bytes: return output_file.read() - return output_file.read().decode('utf-8').strip() + return output_file.read().decode(DEFAULT_ENCODING) def file_to_dict(tsv, cell_delimiter, str_col_idx): result = {} - rows = [row.split(cell_delimiter) for row in tsv.split('\n')] + rows = [row.split(cell_delimiter) for row in tsv.strip().split('\n')] if not rows: return result @@ -328,6 +345,35 @@ @run_once +def get_languages(config=''): + cmd_args = [tesseract_cmd, '--list-langs'] + if config: + cmd_args += shlex.split(config) + + try: + result = subprocess.run( + cmd_args, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + except OSError: + raise TesseractNotFoundError() + + # tesseract 3.x + if result.returncode not in (0, 1): + raise TesseractNotFoundError() + + languages = [] + if result.stdout: + for line in result.stdout.decode(DEFAULT_ENCODING).split(linesep): + lang = line.strip() + if LANG_PATTERN.match(lang): + languages.append(lang) + + return languages + + +@run_once def get_tesseract_version(): """ Returns LooseVersion object of the Tesseract version @@ -335,9 +381,11 @@ try: return LooseVersion( subprocess.check_output( - [tesseract_cmd, '--version'], stderr=subprocess.STDOUT, + [tesseract_cmd, '--version'], + stderr=subprocess.STDOUT, + env=environ, ) - .decode('utf-8') + .decode(DEFAULT_ENCODING) .split()[1] .lstrip(string.printable[10:]), ) @@ -346,7 +394,12 @@ def image_to_string( - image, lang=None, config='', nice=0, output_type=Output.STRING, timeout=0, + image, + lang=None, + config='', + nice=0, + output_type=Output.STRING, + timeout=0, ): """ Returns the result of a Tesseract OCR run on the provided image to string @@ -361,32 +414,62 @@ def image_to_pdf_or_hocr( - image, lang=None, config='', nice=0, extension='pdf', timeout=0, + image, + lang=None, + config='', + nice=0, + extension='pdf', + timeout=0, ): """ Returns the result of a Tesseract OCR run on the provided image to pdf/hocr """ if extension not in {'pdf', 
'hocr'}: - raise ValueError('Unsupported extension: {}'.format(extension)) + raise ValueError(f'Unsupported extension: {extension}') args = [image, extension, lang, config, nice, timeout, True] return run_and_get_output(*args) +def image_to_alto_xml( + image, + lang=None, + config='', + nice=0, + timeout=0, +): + """ + Returns the result of a Tesseract OCR run on the provided image to ALTO XML + """ + + if get_tesseract_version() < '4.1.0': + raise ALTONotSupported() + + config = f'-c tessedit_create_alto=1 {config.strip()}' + args = [image, 'xml', lang, config, nice, timeout, True] + + return run_and_get_output(*args) + + def image_to_boxes( - image, lang=None, config='', nice=0, output_type=Output.STRING, timeout=0, + image, + lang=None, + config='', + nice=0, + output_type=Output.STRING, + timeout=0, ): """ Returns string containing recognized characters and their box boundaries """ - config += ' batch.nochop makebox' + config = f'{config.strip()} batch.nochop makebox' args = [image, 'box', lang, config, nice, timeout] return { Output.BYTES: lambda: run_and_get_output(*(args + [True])), Output.DICT: lambda: file_to_dict( - 'char left bottom right top page\n' + run_and_get_output(*args), + f'char left bottom right top page\n{run_and_get_output(*args)}', ' ', 0, ), @@ -424,13 +507,14 @@ if get_tesseract_version() < '3.05': raise TSVNotSupported() - config = '{} {}'.format('-c tessedit_create_tsv=1', config.strip()).strip() + config = f'-c tessedit_create_tsv=1 {config.strip()}' args = [image, 'tsv', lang, config, nice, timeout] return { Output.BYTES: lambda: run_and_get_output(*(args + [True])), Output.DATAFRAME: lambda: get_pandas_output( - args + [True], pandas_config, + args + [True], + pandas_config, ), Output.DICT: lambda: file_to_dict(run_and_get_output(*args), '\t', -1), Output.STRING: lambda: run_and_get_output(*args), @@ -438,14 +522,18 @@ def image_to_osd( - image, lang='osd', config='', nice=0, output_type=Output.STRING, timeout=0, + image, + 
lang='osd', + config='', + nice=0, + output_type=Output.STRING, + timeout=0, ): """ Returns string containing the orientation and script detection (OSD) """ - config = '{}-psm 0 {}'.format( - '' if get_tesseract_version() < '3.05' else '-', config.strip(), - ).strip() + psm_dash = '' if get_tesseract_version() < '3.05' else '-' + config = f'{psm_dash}-psm 0 {config.strip()}' args = [image, 'osd', lang, config, nice, timeout] return { @@ -461,19 +549,19 @@ elif len(sys.argv) == 4 and sys.argv[1] == '-l': filename, lang = sys.argv[3], sys.argv[2] else: - sys.stderr.write('Usage: pytesseract [-l lang] input_file\n') - exit(2) + print('Usage: pytesseract [-l lang] input_file\n', file=sys.stderr) + return 2 try: with Image.open(filename) as img: print(image_to_string(img, lang=lang)) except TesseractNotFoundError as e: - sys.stderr.write('{}\n'.format(str(e))) - exit(1) - except IOError: - sys.stderr.write('ERROR: Could not open file "%s"\n' % filename) - exit(1) + print(f'{str(e)}\n', file=sys.stderr) + return 1 + except IOError as e: + print(f'{type(e).__name__}: {e}', file=sys.stderr) + return 1 if __name__ == '__main__': - main() + exit(main()) Binary files old/pytesseract-0.3.4/tests/data/test-small.jpg and new/pytesseract-0.3.7/tests/data/test-small.jpg differ Binary files old/pytesseract-0.3.4/tests/data/test_la.png and new/pytesseract-0.3.7/tests/data/test_la.png differ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pytesseract-0.3.4/tests/pytesseract_test.py new/pytesseract-0.3.7/tests/pytesseract_test.py --- old/pytesseract-0.3.4/tests/pytesseract_test.py 2020-04-18 16:27:16.000000000 +0200 +++ new/pytesseract-0.3.7/tests/pytesseract_test.py 2020-11-20 08:37:05.000000000 +0100 @@ -1,23 +1,29 @@ # encoding: utf-8 from glob import iglob from multiprocessing import Pool -from os import getcwd, path, sep -from sys import platform, version_info +from os import getcwd +from os import path +from os import sep +from sys 
import platform +from sys import version_info from tempfile import gettempdir import pytest -from pytesseract import ( - Output, - TesseractNotFoundError, - TSVNotSupported, - get_tesseract_version, - image_to_boxes, - image_to_data, - image_to_osd, - image_to_pdf_or_hocr, - image_to_string, -) -from pytesseract.pytesseract import numpy_installed, pandas_installed, prepare +from pytesseract import ALTONotSupported +from pytesseract import get_languages +from pytesseract import get_tesseract_version +from pytesseract import image_to_alto_xml +from pytesseract import image_to_boxes +from pytesseract import image_to_data +from pytesseract import image_to_osd +from pytesseract import image_to_pdf_or_hocr +from pytesseract import image_to_string +from pytesseract import Output +from pytesseract import TesseractNotFoundError +from pytesseract import TSVNotSupported +from pytesseract.pytesseract import numpy_installed +from pytesseract.pytesseract import pandas_installed +from pytesseract.pytesseract import prepare if numpy_installed: import numpy as np @@ -36,7 +42,9 @@ TESSERACT_VERSION = tuple(get_tesseract_version().version) # to skip tests -DATA_DIR = path.join(path.dirname(path.abspath(__file__)), 'data') +TESTS_DIR = path.dirname(path.abspath(__file__)) +DATA_DIR = path.join(TESTS_DIR, 'data') +TESSDATA_DIR = path.join(TESTS_DIR, 'tessdata') TEST_JPEG = path.join(DATA_DIR, 'test.jpg') pytestmark = pytest.mark.pytesseract # used marker for the module @@ -58,6 +66,11 @@ return path.join(DATA_DIR, 'test-european.jpg') +@pytest.fixture(scope='session') +def test_file_small(): + return path.join(DATA_DIR, 'test-small.jpg') + + @pytest.mark.parametrize( 'test_file', [ @@ -107,7 +120,8 @@ @pytest.mark.skipif(numpy_installed is False, reason='requires numpy') def test_image_to_string_with_numpy_array(test_file): assert 'The quick brown dog' in image_to_string( - np.array(Image.open(test_file)), 'eng', + np.array(Image.open(test_file)), + 'eng', ) @@ -117,7 +131,8 @@ 
@pytest.mark.skipif( - platform.startswith('win32'), reason='used paths with `/` as separator', + platform.startswith('win32'), + reason='used paths with `/` as separator', ) def test_image_to_string_batch(): batch_file = path.join(DATA_DIR, 'images.txt') @@ -148,6 +163,12 @@ image_to_string(test_file, timeout=0.000000001) +def test_la_image_to_string(): + filepath = path.join(DATA_DIR, 'test_la.png') + img = Image.open(filepath) + assert 'This is test message' == image_to_string(img).strip() + + def test_image_to_boxes(test_file): result = image_to_boxes(test_file) assert isinstance(result, string_type) @@ -203,20 +224,44 @@ @pytest.mark.skipif( - TESSERACT_VERSION[:2] >= (3, 5), reason='requires tesseract < 3.05', + TESSERACT_VERSION[:2] < (4, 1), + reason='requires tesseract >= 4.1', +) +def test_image_to_alto_xml(test_file): + result = image_to_alto_xml(test_file) + assert isinstance(result, bytes) + result = result.decode('utf-8') if IS_PYTHON_2 else str(result, 'utf-8') + result = str(result).strip() + assert result.startswith('<?xml') + assert result.endswith('</alto>') + + +@pytest.mark.skipif( + TESSERACT_VERSION[:2] >= (4, 1), + reason='requires tesseract < 4.1', +) +def test_image_to_alto_xml_support(test_file): + with pytest.raises(ALTONotSupported): + image_to_alto_xml(test_file) + + +@pytest.mark.skipif( + TESSERACT_VERSION[:2] >= (3, 5), + reason='requires tesseract < 3.05', ) -def test_image_to_data__pandas_support(test_file): +def test_image_to_data__pandas_support(test_file_small): with pytest.raises(TSVNotSupported): - image_to_data(test_file, output_type=Output.DATAFRAME) + image_to_data(test_file_small, output_type=Output.DATAFRAME) @pytest.mark.skipif( - TESSERACT_VERSION[:2] < (3, 5), reason='requires tesseract >= 3.05', + TESSERACT_VERSION[:2] < (3, 5), + reason='requires tesseract >= 3.05', ) @pytest.mark.skipif(pandas_installed is False, reason='requires pandas') -def test_image_to_data__pandas_output(test_file): +def
test_image_to_data__pandas_output(test_file_small): """Test and compare the type and meta information of the result.""" - result = image_to_data(test_file, output_type=Output.DATAFRAME) + result = image_to_data(test_file_small, output_type=Output.DATAFRAME) assert isinstance(result, pandas.DataFrame) expected_columns = [ 'level', @@ -236,41 +281,44 @@ @pytest.mark.skipif( - TESSERACT_VERSION[:2] < (3, 5), reason='requires tesseract >= 3.05', + TESSERACT_VERSION[:2] < (3, 5), + reason='requires tesseract >= 3.05', ) @pytest.mark.parametrize( 'output', [Output.BYTES, Output.DICT, Output.STRING], ids=['bytes', 'dict', 'string'], ) -def test_image_to_data_common_output(test_file, output): +def test_image_to_data_common_output(test_file_small, output): """Test and compare the type of the result.""" - result = image_to_data(test_file, output_type=output) - expected_keys = [ - 'level', - 'page_num', - 'block_num', - 'par_num', - 'line_num', - 'word_num', - 'left', - 'top', - 'width', - 'height', - 'conf', - 'text', - ] + result = image_to_data(test_file_small, output_type=output) + expected_dict_result = { + 'level': [1, 2, 3, 4, 5], + 'page_num': [1, 1, 1, 1, 1], + 'block_num': [0, 1, 1, 1, 1], + 'par_num': [0, 0, 1, 1, 1], + 'line_num': [0, 0, 0, 1, 1], + 'word_num': [0, 0, 0, 0, 1], + 'left': [0, 11, 11, 11, 11], + 'top': [0, 11, 11, 11, 11], + 'width': [79, 60, 60, 60, 60], + 'height': [47, 24, 24, 24, 24], + # 'conf': ['-1', '-1', '-1', '-1', 96], + 'text': ['', '', '', '', 'This'], + } if output is Output.BYTES: assert isinstance(result, bytes) elif output is Output.DICT: - assert isinstance(result, dict) - assert bool(set(result.keys()).intersection(expected_keys)) + confidence_values = result.pop('conf', None) + assert confidence_values is not None + assert 0 <= confidence_values[-1] <= 100 + assert result == expected_dict_result elif output is Output.STRING: assert isinstance(result, string_type) - for key in expected_keys: + for key in 
expected_dict_result.keys(): assert key in result @@ -289,34 +337,49 @@ """Test wrong or missing tesseract command.""" import pytesseract - monkeypatch.setattr( - 'pytesseract.pytesseract.tesseract_cmd', test_path, - ) + monkeypatch.setattr('pytesseract.pytesseract.tesseract_cmd', test_path) + with pytest.raises(TesseractNotFoundError): - pytesseract.pytesseract.image_to_string(test_file) + pytesseract.get_languages.__wrapped__() + + with pytest.raises(TesseractNotFoundError): + pytesseract.get_tesseract_version.__wrapped__() + + with pytest.raises(TesseractNotFoundError): + pytesseract.image_to_string(test_file) def test_main_not_found_cases( - capsys, monkeypatch, test_file, test_invalid_file, + capsys, + monkeypatch, + test_file, + test_invalid_file, ): """Test wrong or missing tesseract command in main.""" import pytesseract monkeypatch.setattr('sys.argv', ['', test_invalid_file]) - with pytest.raises(SystemExit): - pytesseract.pytesseract.main() - assert capsys.readouterr().err.startswith('ERROR: Could not open file') + assert pytesseract.pytesseract.main() == 1 + captured_stderr = capsys.readouterr().err + assert ( + 'No such file or directory' in captured_stderr + and test_invalid_file in captured_stderr + ) monkeypatch.setattr( - 'pytesseract.pytesseract.tesseract_cmd', 'wrong_tesseract', + 'pytesseract.pytesseract.tesseract_cmd', + 'wrong_tesseract', ) monkeypatch.setattr('sys.argv', ['', test_file]) - with pytest.raises(SystemExit): - pytesseract.pytesseract.main() - assert capsys.readouterr().err.endswith( - "is not installed or it's not in your PATH\n", + assert pytesseract.pytesseract.main() == 1 + assert ( + "is not installed or it's not in your PATH" in capsys.readouterr().err ) + monkeypatch.setattr('sys.argv', ['']) + assert pytesseract.pytesseract.main() == 2 + assert 'Usage: pytesseract [-l lang] input_file' in capsys.readouterr().err + @pytest.mark.parametrize( 'test_path', @@ -328,9 +391,40 @@ import pytesseract monkeypatch.setattr( - 
'pytesseract.pytesseract.tesseract_cmd', test_path, + 'pytesseract.pytesseract.tesseract_cmd', + test_path, ) + with pytest.raises( TesseractNotFoundError if IS_PYTHON_2 and test_path else OSError, ): - pytesseract.pytesseract.image_to_string(test_file) + pytesseract.image_to_string(test_file) + + +DEFAULT_LANGUAGES = ('fra', 'eng', 'osd') + + +@pytest.mark.parametrize( + 'test_config,expected', + [ + ('', DEFAULT_LANGUAGES), + ('--tessdata-dir {}/'.format(TESSDATA_DIR), ('dzo_test', 'eng')), + ('--tessdata-dir /dev/null', ()), + ('--tessdata-dir invalid_path/', ()), + ('--tessdata-dir=invalid_config/', DEFAULT_LANGUAGES), + ], + ids=[ + 'default_empty_config', + 'custom_tessdata_dir', + 'incorrect_tessdata_dir', + 'invalid_tessdata_dir', + 'invalid_config', + ], +) +def test_get_languages(test_config, expected): + result = get_languages.__wrapped__(test_config) + if not result: + assert result == [] + + for lang in expected: + assert lang in result Binary files old/pytesseract-0.3.4/tests/tessdata/dzo_test.traineddata and new/pytesseract-0.3.7/tests/tessdata/dzo_test.traineddata differ Binary files old/pytesseract-0.3.4/tests/tessdata/eng.traineddata and new/pytesseract-0.3.7/tests/tessdata/eng.traineddata differ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pytesseract-0.3.4/tox.ini new/pytesseract-0.3.7/tox.ini --- old/pytesseract-0.3.4/tox.ini 2020-04-18 16:27:16.000000000 +0200 +++ new/pytesseract-0.3.7/tox.ini 2020-11-20 08:37:05.000000000 +0100 @@ -1,14 +1,13 @@ [tox] envlist = - py27 - py35 py36 py37 - py38-pre-commit + py38 + py39 skip_missing_interpreters = true [pytest] -addopts = --strict-markers --verbose --cache-clear -p no:doctest +addopts = --strict-markers --verbose --cache-clear --color=yes -p no:doctest markers = pytesseract: Requires commandline pytesseract installed. lang_fra: Requires French (fra) pytesseract language. 
@@ -19,11 +18,10 @@ commands = python -bb -m pytest -[testenv:py38-pre-commit] +[testenv:py39] deps = numpy pandas -r{toxinidir}/requirements-dev.txt commands = - pre-commit run --all-files --show-diff-on-failure - python -bb -m pytest + python -bb -m pytest {posargs:tests}
