commit python-pytesseract for openSUSE:Factory

Source-Sync Tue, 02 Mar 2021 03:52:06 -0800

Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package python-pytesseract for 
openSUSE:Factory checked in at 2021-03-02 12:34:31
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-pytesseract (Old)
 and      /work/SRC/openSUSE:Factory/.python-pytesseract.new.2378 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


Package is "python-pytesseract"

Tue Mar  2 12:34:31 2021 rev:10 rq:875927 version:0.3.7

Changes:
--------
--- /work/SRC/openSUSE:Factory/python-pytesseract/python-pytesseract.changes    
2020-05-19 14:44:14.127525669 +0200
+++ 
/work/SRC/openSUSE:Factory/.python-pytesseract.new.2378/python-pytesseract.changes
  2021-03-02 12:45:54.184378670 +0100
@@ -1,0 +2,7 @@
+Mon Mar  1 02:53:00 UTC 2021 - John Vandenberg <[email protected]>
+
+- Update license to Apache-2.0
+- Update to v0.3.7
+  * no upstream changelog
+
+-------------------------------------------------------------------

Old:
----
  v0.3.4.tar.gz

New:
----
  v0.3.7.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-pytesseract.spec ++++++
--- /var/tmp/diff_new_pack.dz5U2v/_old  2021-03-02 12:45:56.776380911 +0100
+++ /var/tmp/diff_new_pack.dz5U2v/_new  2021-03-02 12:45:56.780380915 +0100
@@ -1,7 +1,7 @@
 #
 # spec file for package python-pytesseract
 #
-# Copyright (c) 2020 SUSE LLC
+# Copyright (c) 2021 SUSE LLC
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -18,10 +18,10 @@
 
 %{?!python_module:%define python_module() python-%{**} python3-%{**}}
 Name:           python-pytesseract
-Version:        0.3.4
+Version:        0.3.7
 Release:        0
 Summary:        Python wrapper for Google's Tesseract-OCR
-License:        GPL-3.0-only
+License:        Apache-2.0
 Group:          Development/Languages/Python
 URL:            https://github.com/madmaze/python-tesseract
 # https://github.com/madmaze/pytesseract/issues/262

++++++ v0.3.4.tar.gz -> v0.3.7.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/pytesseract-0.3.4/.github/workflows/ci.yaml 
new/pytesseract-0.3.7/.github/workflows/ci.yaml
--- old/pytesseract-0.3.4/.github/workflows/ci.yaml     1970-01-01 
01:00:00.000000000 +0100
+++ new/pytesseract-0.3.7/.github/workflows/ci.yaml     2020-11-20 
08:37:05.000000000 +0100
@@ -0,0 +1,65 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - master
+      - '*.x'
+  pull_request:
+    branches:
+      - master
+      - '*.x'
+
+jobs:
+  tests:
+    if: "!contains(github.event.head_commit.message, '[skip ci]')"
+    name: ${{ matrix.name }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: true
+      matrix:
+        include:
+          - {name: '3.9', python: '3.9', os: ubuntu-20.04, tox: py39}
+          - {name: '3.8', python: '3.8', os: ubuntu-18.04, tox: py38}
+          - {name: '3.7', python: '3.7', os: ubuntu-18.04, tox: py37}
+          - {name: '3.6', python: '3.6', os: ubuntu-16.04, tox: py36}
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Update pip
+        run: python -m pip install --upgrade pip wheel setuptools
+
+      - name: Get pip cache dir
+        id: pip-cache
+        run: echo "::set-output name=dir::$(pip cache dir)"
+
+      - name: Cache pip
+        uses: actions/cache@v2
+        with:
+          path: ${{ steps.pip-cache.outputs.dir }}
+          key: pip-${{ runner.os }}-${{ matrix.python }}-${{ 
hashFiles('setup.py') }}
+          restore-keys: pip-${{ runner.os }}-${{ matrix.python }}-
+
+      - name: Set Python version
+        # See https://pre-commit.com/#github-actions-example
+        run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV
+
+      - name: Install tesseract
+        run: sudo apt-get install -y tesseract-ocr tesseract-ocr-fra
+
+      - name: Print tesseract version
+        run: echo $(tesseract --version)
+
+      - name: Install tox
+        run: python -m pip install --upgrade tox
+
+      - name: Run tox
+        run: tox -e ${{ matrix.tox }}
+        env:
+          PY_COLORS: 1
+          TOX_TESTENV_PASSENV: PY_COLORS
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/pytesseract-0.3.4/.pre-commit-config.yaml 
new/pytesseract-0.3.7/.pre-commit-config.yaml
--- old/pytesseract-0.3.4/.pre-commit-config.yaml       2020-04-18 
16:27:16.000000000 +0200
+++ new/pytesseract-0.3.7/.pre-commit-config.yaml       2020-11-20 
08:37:05.000000000 +0100
@@ -1,7 +1,12 @@
 exclude: ^(tests/data/)
 repos:
+-   repo: https://github.com/psf/black
+    rev: 20.8b1
+    hooks:
+    -   id: black
+        args: [-S, --line-length=79, --safe, --quiet]
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v2.4.0
+    rev: v3.3.0
     hooks:
     -   id: trailing-whitespace
     -   id: end-of-file-fixer
@@ -11,28 +16,24 @@
     -   id: name-tests-test
     -   id: requirements-txt-fixer
     -   id: double-quote-string-fixer
+-   repo: https://gitlab.com/pycqa/flake8
+    rev: 3.8.4
+    hooks:
+    -   id: flake8
+-   repo: https://github.com/asottile/reorder_python_imports
+    rev: v2.3.6
+    hooks:
+    -   id: reorder-python-imports
+        args: [--py3-plus]
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v1.25.1
+    rev: v2.7.3
     hooks:
     -   id: pyupgrade
 -   repo: https://github.com/asottile/add-trailing-comma
-    rev: v1.5.0
+    rev: v2.0.1
     hooks:
     -   id: add-trailing-comma
--   repo: https://github.com/timothycrosley/isort
-    rev: 4.3.21
-    hooks:
-    -   id: isort
 -   repo: https://github.com/pre-commit/mirrors-autopep8
-    rev: v1.4.4
+    rev: v1.5.4
     hooks:
     -   id: autopep8
--   repo: https://github.com/psf/black
-    rev: 19.10b0
-    hooks:
-    -   id: black
-        args: [-S, --line-length=79, --safe, --quiet]
--   repo: https://gitlab.com/pycqa/flake8
-    rev: 3.7.9
-    hooks:
-    -   id: flake8
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/pytesseract-0.3.4/.travis.yml 
new/pytesseract-0.3.7/.travis.yml
--- old/pytesseract-0.3.4/.travis.yml   2020-04-18 16:27:16.000000000 +0200
+++ new/pytesseract-0.3.7/.travis.yml   1970-01-01 01:00:00.000000000 +0100
@@ -1,53 +0,0 @@
-language: python
-
-addons:
-  apt:
-    config:
-      retries: true
-
-cache:
-  apt: true
-  pip: true
-
-matrix:
-  include:
-    - os: linux
-      dist: xenial
-      python: 2.7
-      env: TOXENV=py27
-      name: "2.7 Xenial"
-    - os: linux
-      dist: xenial
-      python: 3.5
-      env: TOXENV=py35
-      name: "3.5 Xenial"
-    - os: linux
-      dist: xenial
-      python: 3.6
-      env: TOXENV=py36
-      name: "3.6 Xenial"
-    - os: linux
-      dist: xenial
-      python: 3.7
-      env: TOXENV=py37
-      name: "3.7 Xenial"
-    - os: linux
-      dist: bionic
-      python: 3.8
-      env: TOXENV=py38-pre-commit
-      name: "3.8 Bionic"
-
-before_install:
-  - sudo apt-get install -y tesseract-ocr
-  - sudo apt-get install -y tesseract-ocr-fra
-  - tesseract --version
-  - tesseract --list-langs
-
-install:
-  pip install tox
-
-script:
-  tox
-
-notifications:
-  email: false
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/pytesseract-0.3.4/README.rst 
new/pytesseract-0.3.7/README.rst
--- old/pytesseract-0.3.4/README.rst    2020-04-18 16:27:16.000000000 +0200
+++ new/pytesseract-0.3.7/README.rst    2020-11-20 08:37:05.000000000 +0100
@@ -17,9 +17,13 @@
    :target: https://anaconda.org/conda-forge/pytesseract
    :alt: Conda release
 
-.. image:: https://travis-ci.org/madmaze/pytesseract.svg
-    :target: https://travis-ci.org/madmaze/pytesseract
-    :alt: Travis build status
+.. image:: 
https://results.pre-commit.ci/badge/github/madmaze/pytesseract/master.svg
+   :target: 
https://results.pre-commit.ci/latest/github/madmaze/pytesseract/master
+   :alt: Pre-commit CI status
+
+.. image:: 
https://github.com/madmaze/pytesseract/workflows/CI/badge.svg?branch=master
+    :target: https://github.com/madmaze/pytesseract/actions?query=workflow%3ACI
+    :alt: CI workflow status
 
 Python-tesseract is an optical character recognition (OCR) tool for python.
 That is, it will recognize and "read" the text embedded in images.
@@ -54,6 +58,9 @@
     # Simple image to string
     print(pytesseract.image_to_string(Image.open('test.png')))
 
+    # List of available languages
+    print(pytesseract.get_languages(config=''))
+
     # French text image to string
     print(pytesseract.image_to_string(Image.open('test-european.jpg'), 
lang='fra'))
 
@@ -89,6 +96,9 @@
     # Get HOCR output
     hocr = pytesseract.image_to_pdf_or_hocr('test.png', extension='hocr')
 
+    # Get ALTO XML output
+    xml = pytesseract.image_to_alto_xml('test.png')
+
 Support for OpenCV image/NumPy array objects
 
 .. code-block:: python
@@ -125,9 +135,11 @@
 
 **Functions**
 
+* **get_languages** Returns all currently supported languages by Tesseract OCR.
+
 * **get_tesseract_version** Returns the Tesseract version installed in the 
system.
 
-* **image_to_string** Returns the result of a Tesseract OCR run on the image 
to string
+* **image_to_string** Returns unmodified output as string from Tesseract OCR 
processing
 
 * **image_to_boxes** Returns result containing recognized characters and their 
box boundaries
 
@@ -135,6 +147,8 @@
 
 * **image_to_osd** Returns result containing information about orientation and 
script detection.
 
+* **image_to_alto_xml** Returns result in the form of Tesseract's ALTO XML 
format.
+
 * **run_and_get_output** Returns the raw output from Tesseract OCR. Gives a 
bit more control over the parameters that are sent to tesseract.
 
 **Parameters**
@@ -166,7 +180,7 @@
 
 Prerequisites:
 
-- Python-tesseract requires Python 2.7 or Python 3.5+
+- Python-tesseract requires Python 2.7 or Python 3.6+
 - You will need the Python Imaging Library (PIL) (or the `Pillow 
<https://pypi.org/project/Pillow/>`_ fork).
   Under Debian/Ubuntu, this is the package **python-imaging** or 
**python3-imaging**.
 - Install `Google Tesseract OCR <https://github.com/tesseract-ocr/tesseract>`_
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/pytesseract-0.3.4/setup.cfg 
new/pytesseract-0.3.7/setup.cfg
--- old/pytesseract-0.3.4/setup.cfg     2020-04-18 16:27:16.000000000 +0200
+++ new/pytesseract-0.3.7/setup.cfg     2020-11-20 08:37:05.000000000 +0100
@@ -1,8 +1,2 @@
 [bdist_wheel]
 universal = True
-
-[isort]
-indent = '    '
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = True
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/pytesseract-0.3.4/setup.py 
new/pytesseract-0.3.7/setup.py
--- old/pytesseract-0.3.4/setup.py      2020-04-18 16:27:16.000000000 +0200
+++ new/pytesseract-0.3.7/setup.py      2020-11-20 08:37:05.000000000 +0100
@@ -14,7 +14,7 @@
 
 setup(
     name=PACKAGE_NAME,
-    version='0.3.4',
+    version='0.3.7',
     author='Samuel Hoffstaetter',
     author_email='[email protected]',
     maintainer='Matthias Lee',
@@ -36,12 +36,10 @@
     classifiers=[
         'License :: OSI Approved :: Apache Software License',
         'Programming Language :: Python',
-        'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
     ],
 )
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/pytesseract-0.3.4/src/__init__.py 
new/pytesseract-0.3.7/src/__init__.py
--- old/pytesseract-0.3.4/src/__init__.py       2020-04-18 16:27:16.000000000 
+0200
+++ new/pytesseract-0.3.7/src/__init__.py       2020-11-20 08:37:05.000000000 
+0100
@@ -1,13 +1,15 @@
-from .pytesseract import (  # noqa: F401
-    Output,
-    TesseractError,
-    TesseractNotFoundError,
-    TSVNotSupported,
-    get_tesseract_version,
-    image_to_boxes,
-    image_to_data,
-    image_to_osd,
-    image_to_pdf_or_hocr,
-    image_to_string,
-    run_and_get_output,
-)
+# flake8: noqa: F401
+from .pytesseract import ALTONotSupported
+from .pytesseract import get_languages
+from .pytesseract import get_tesseract_version
+from .pytesseract import image_to_alto_xml
+from .pytesseract import image_to_boxes
+from .pytesseract import image_to_data
+from .pytesseract import image_to_osd
+from .pytesseract import image_to_pdf_or_hocr
+from .pytesseract import image_to_string
+from .pytesseract import Output
+from .pytesseract import run_and_get_output
+from .pytesseract import TesseractError
+from .pytesseract import TesseractNotFoundError
+from .pytesseract import TSVNotSupported
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/pytesseract-0.3.4/src/pytesseract.py 
new/pytesseract-0.3.7/src/pytesseract.py
--- old/pytesseract-0.3.4/src/pytesseract.py    2020-04-18 16:27:16.000000000 
+0200
+++ new/pytesseract-0.3.7/src/pytesseract.py    2020-11-20 08:37:05.000000000 
+0100
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-
+import re
 import shlex
 import string
 import subprocess
@@ -11,11 +11,17 @@
 from functools import wraps
 from glob import iglob
 from io import BytesIO
-from os import environ, extsep, remove
-from os.path import normcase, normpath, realpath
+from os import environ
+from os import extsep
+from os import linesep
+from os import remove
+from os.path import normcase
+from os.path import normpath
+from os.path import realpath
 from pkgutil import find_loader
 from tempfile import NamedTemporaryFile
 from threading import Timer
+from time import sleep
 
 try:
     from PIL import Image
@@ -33,6 +39,8 @@
 if pandas_installed:
     import pandas as pd
 
+DEFAULT_ENCODING = 'utf-8'
+LANG_PATTERN = re.compile('^[a-z_]+$')
 RGB_MODE = 'RGB'
 SUPPORTED_FORMATS = {
     'JPEG',
@@ -78,7 +86,8 @@
 class TesseractNotFoundError(EnvironmentError):
     def __init__(self):
         super(TesseractNotFoundError, self).__init__(
-            tesseract_cmd + " is not installed or it's not in your PATH",
+            f"{tesseract_cmd} is not installed or it's not in your PATH."
+            + ' See README file for more information.',
         )
 
 
@@ -89,9 +98,24 @@
         )
 
 
+class ALTONotSupported(EnvironmentError):
+    def __init__(self):
+        super(ALTONotSupported, self).__init__(
+            'ALTO output not supported. Tesseract >= 4.1.0 required',
+        )
+
+
 def kill(process, code):
-    process.kill()
-    process.returncode = code
+    process.terminate()
+    try:
+        process.wait(1)
+    except TypeError:  # python2 Popen.wait(1) fallback
+        sleep(1)
+    except Exception:  # python3 subprocess.TimeoutExpired
+        pass
+    finally:
+        process.kill()
+        process.returncode = code
 
 
 @contextmanager
@@ -130,7 +154,7 @@
 
 def get_errors(error_string):
     return u' '.join(
-        line for line in error_string.decode('utf-8').splitlines()
+        line for line in error_string.decode(DEFAULT_ENCODING).splitlines()
     ).strip()
 
 
@@ -155,19 +179,13 @@
     if extension not in SUPPORTED_FORMATS:
         raise TypeError('Unsupported image format/type')
 
-    if not image.mode.startswith(RGB_MODE):
-        image = image.convert(RGB_MODE)
-
     if 'A' in image.getbands():
         # discard and replace the alpha channel with white background
         background = Image.new(RGB_MODE, image.size, (255, 255, 255))
-        background.paste(image, (0, 0), image)
+        background.paste(image, (0, 0), image.getchannel('A'))
         image = background
 
     image.format = extension
-    if 'format' not in image.info:
-        image.info['format'] = extension
-
     return image, extension
 
 
@@ -178,10 +196,9 @@
             if isinstance(image, str):
                 yield f.name, realpath(normpath(normcase(image)))
                 return
-
             image, extension = prepare(image)
             input_file_name = f.name + extsep + extension
-            image.save(input_file_name, **image.info)
+            image.save(input_file_name, format=image.format)
             yield f.name, input_file_name
     finally:
         cleanup(f.name)
@@ -231,7 +248,7 @@
     if config:
         cmd_args += shlex.split(config)
 
-    if extension and extension not in {'box', 'osd', 'tsv'}:
+    if extension and extension not in {'box', 'osd', 'tsv', 'xml'}:
         cmd_args.append(extension)
 
     try:
@@ -272,12 +289,12 @@
         with open(filename, 'rb') as output_file:
             if return_bytes:
                 return output_file.read()
-            return output_file.read().decode('utf-8').strip()
+            return output_file.read().decode(DEFAULT_ENCODING)
 
 
 def file_to_dict(tsv, cell_delimiter, str_col_idx):
     result = {}
-    rows = [row.split(cell_delimiter) for row in tsv.split('\n')]
+    rows = [row.split(cell_delimiter) for row in tsv.strip().split('\n')]
     if not rows:
         return result
 
@@ -328,6 +345,35 @@
 
 
 @run_once
+def get_languages(config=''):
+    cmd_args = [tesseract_cmd, '--list-langs']
+    if config:
+        cmd_args += shlex.split(config)
+
+    try:
+        result = subprocess.run(
+            cmd_args,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+        )
+    except OSError:
+        raise TesseractNotFoundError()
+
+    # tesseract 3.x
+    if result.returncode not in (0, 1):
+        raise TesseractNotFoundError()
+
+    languages = []
+    if result.stdout:
+        for line in result.stdout.decode(DEFAULT_ENCODING).split(linesep):
+            lang = line.strip()
+            if LANG_PATTERN.match(lang):
+                languages.append(lang)
+
+    return languages
+
+
+@run_once
 def get_tesseract_version():
     """
     Returns LooseVersion object of the Tesseract version
@@ -335,9 +381,11 @@
     try:
         return LooseVersion(
             subprocess.check_output(
-                [tesseract_cmd, '--version'], stderr=subprocess.STDOUT,
+                [tesseract_cmd, '--version'],
+                stderr=subprocess.STDOUT,
+                env=environ,
             )
-            .decode('utf-8')
+            .decode(DEFAULT_ENCODING)
             .split()[1]
             .lstrip(string.printable[10:]),
         )
@@ -346,7 +394,12 @@
 
 
 def image_to_string(
-    image, lang=None, config='', nice=0, output_type=Output.STRING, timeout=0,
+    image,
+    lang=None,
+    config='',
+    nice=0,
+    output_type=Output.STRING,
+    timeout=0,
 ):
     """
     Returns the result of a Tesseract OCR run on the provided image to string
@@ -361,32 +414,62 @@
 
 
 def image_to_pdf_or_hocr(
-    image, lang=None, config='', nice=0, extension='pdf', timeout=0,
+    image,
+    lang=None,
+    config='',
+    nice=0,
+    extension='pdf',
+    timeout=0,
 ):
     """
     Returns the result of a Tesseract OCR run on the provided image to pdf/hocr
     """
 
     if extension not in {'pdf', 'hocr'}:
-        raise ValueError('Unsupported extension: {}'.format(extension))
+        raise ValueError(f'Unsupported extension: {extension}')
     args = [image, extension, lang, config, nice, timeout, True]
 
     return run_and_get_output(*args)
 
 
+def image_to_alto_xml(
+    image,
+    lang=None,
+    config='',
+    nice=0,
+    timeout=0,
+):
+    """
+    Returns the result of a Tesseract OCR run on the provided image to ALTO XML
+    """
+
+    if get_tesseract_version() < '4.1.0':
+        raise ALTONotSupported()
+
+    config = f'-c tessedit_create_alto=1 {config.strip()}'
+    args = [image, 'xml', lang, config, nice, timeout, True]
+
+    return run_and_get_output(*args)
+
+
 def image_to_boxes(
-    image, lang=None, config='', nice=0, output_type=Output.STRING, timeout=0,
+    image,
+    lang=None,
+    config='',
+    nice=0,
+    output_type=Output.STRING,
+    timeout=0,
 ):
     """
     Returns string containing recognized characters and their box boundaries
     """
-    config += ' batch.nochop makebox'
+    config = f'{config.strip()} batch.nochop makebox'
     args = [image, 'box', lang, config, nice, timeout]
 
     return {
         Output.BYTES: lambda: run_and_get_output(*(args + [True])),
         Output.DICT: lambda: file_to_dict(
-            'char left bottom right top page\n' + run_and_get_output(*args),
+            f'char left bottom right top page\n{run_and_get_output(*args)}',
             ' ',
             0,
         ),
@@ -424,13 +507,14 @@
     if get_tesseract_version() < '3.05':
         raise TSVNotSupported()
 
-    config = '{} {}'.format('-c tessedit_create_tsv=1', config.strip()).strip()
+    config = f'-c tessedit_create_tsv=1 {config.strip()}'
     args = [image, 'tsv', lang, config, nice, timeout]
 
     return {
         Output.BYTES: lambda: run_and_get_output(*(args + [True])),
         Output.DATAFRAME: lambda: get_pandas_output(
-            args + [True], pandas_config,
+            args + [True],
+            pandas_config,
         ),
         Output.DICT: lambda: file_to_dict(run_and_get_output(*args), '\t', -1),
         Output.STRING: lambda: run_and_get_output(*args),
@@ -438,14 +522,18 @@
 
 
 def image_to_osd(
-    image, lang='osd', config='', nice=0, output_type=Output.STRING, timeout=0,
+    image,
+    lang='osd',
+    config='',
+    nice=0,
+    output_type=Output.STRING,
+    timeout=0,
 ):
     """
     Returns string containing the orientation and script detection (OSD)
     """
-    config = '{}-psm 0 {}'.format(
-        '' if get_tesseract_version() < '3.05' else '-', config.strip(),
-    ).strip()
+    psm_dash = '' if get_tesseract_version() < '3.05' else '-'
+    config = f'{psm_dash}-psm 0 {config.strip()}'
     args = [image, 'osd', lang, config, nice, timeout]
 
     return {
@@ -461,19 +549,19 @@
     elif len(sys.argv) == 4 and sys.argv[1] == '-l':
         filename, lang = sys.argv[3], sys.argv[2]
     else:
-        sys.stderr.write('Usage: pytesseract [-l lang] input_file\n')
-        exit(2)
+        print('Usage: pytesseract [-l lang] input_file\n', file=sys.stderr)
+        return 2
 
     try:
         with Image.open(filename) as img:
             print(image_to_string(img, lang=lang))
     except TesseractNotFoundError as e:
-        sys.stderr.write('{}\n'.format(str(e)))
-        exit(1)
-    except IOError:
-        sys.stderr.write('ERROR: Could not open file "%s"\n' % filename)
-        exit(1)
+        print(f'{str(e)}\n', file=sys.stderr)
+        return 1
+    except IOError as e:
+        print(f'{type(e).__name__}: {e}', file=sys.stderr)
+        return 1
 
 
 if __name__ == '__main__':
-    main()
+    exit(main())
Binary files old/pytesseract-0.3.4/tests/data/test-small.jpg and 
new/pytesseract-0.3.7/tests/data/test-small.jpg differ
Binary files old/pytesseract-0.3.4/tests/data/test_la.png and 
new/pytesseract-0.3.7/tests/data/test_la.png differ
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/pytesseract-0.3.4/tests/pytesseract_test.py 
new/pytesseract-0.3.7/tests/pytesseract_test.py
--- old/pytesseract-0.3.4/tests/pytesseract_test.py     2020-04-18 
16:27:16.000000000 +0200
+++ new/pytesseract-0.3.7/tests/pytesseract_test.py     2020-11-20 
08:37:05.000000000 +0100
@@ -1,23 +1,29 @@
 # encoding: utf-8
 from glob import iglob
 from multiprocessing import Pool
-from os import getcwd, path, sep
-from sys import platform, version_info
+from os import getcwd
+from os import path
+from os import sep
+from sys import platform
+from sys import version_info
 from tempfile import gettempdir
 
 import pytest
-from pytesseract import (
-    Output,
-    TesseractNotFoundError,
-    TSVNotSupported,
-    get_tesseract_version,
-    image_to_boxes,
-    image_to_data,
-    image_to_osd,
-    image_to_pdf_or_hocr,
-    image_to_string,
-)
-from pytesseract.pytesseract import numpy_installed, pandas_installed, prepare
+from pytesseract import ALTONotSupported
+from pytesseract import get_languages
+from pytesseract import get_tesseract_version
+from pytesseract import image_to_alto_xml
+from pytesseract import image_to_boxes
+from pytesseract import image_to_data
+from pytesseract import image_to_osd
+from pytesseract import image_to_pdf_or_hocr
+from pytesseract import image_to_string
+from pytesseract import Output
+from pytesseract import TesseractNotFoundError
+from pytesseract import TSVNotSupported
+from pytesseract.pytesseract import numpy_installed
+from pytesseract.pytesseract import pandas_installed
+from pytesseract.pytesseract import prepare
 
 if numpy_installed:
     import numpy as np
@@ -36,7 +42,9 @@
 
 TESSERACT_VERSION = tuple(get_tesseract_version().version)  # to skip tests
 
-DATA_DIR = path.join(path.dirname(path.abspath(__file__)), 'data')
+TESTS_DIR = path.dirname(path.abspath(__file__))
+DATA_DIR = path.join(TESTS_DIR, 'data')
+TESSDATA_DIR = path.join(TESTS_DIR, 'tessdata')
 TEST_JPEG = path.join(DATA_DIR, 'test.jpg')
 
 pytestmark = pytest.mark.pytesseract  # used marker for the module
@@ -58,6 +66,11 @@
     return path.join(DATA_DIR, 'test-european.jpg')
 
 
+@pytest.fixture(scope='session')
+def test_file_small():
+    return path.join(DATA_DIR, 'test-small.jpg')
+
+
 @pytest.mark.parametrize(
     'test_file',
     [
@@ -107,7 +120,8 @@
 @pytest.mark.skipif(numpy_installed is False, reason='requires numpy')
 def test_image_to_string_with_numpy_array(test_file):
     assert 'The quick brown dog' in image_to_string(
-        np.array(Image.open(test_file)), 'eng',
+        np.array(Image.open(test_file)),
+        'eng',
     )
 
 
@@ -117,7 +131,8 @@
 
 
 @pytest.mark.skipif(
-    platform.startswith('win32'), reason='used paths with `/` as separator',
+    platform.startswith('win32'),
+    reason='used paths with `/` as separator',
 )
 def test_image_to_string_batch():
     batch_file = path.join(DATA_DIR, 'images.txt')
@@ -148,6 +163,12 @@
         image_to_string(test_file, timeout=0.000000001)
 
 
+def test_la_image_to_string():
+    filepath = path.join(DATA_DIR, 'test_la.png')
+    img = Image.open(filepath)
+    assert 'This is test message' == image_to_string(img).strip()
+
+
 def test_image_to_boxes(test_file):
     result = image_to_boxes(test_file)
     assert isinstance(result, string_type)
@@ -203,20 +224,44 @@
 
 
 @pytest.mark.skipif(
-    TESSERACT_VERSION[:2] >= (3, 5), reason='requires tesseract < 3.05',
+    TESSERACT_VERSION[:2] < (4, 1),
+    reason='requires tesseract >= 4.1',
+)
+def test_image_to_alto_xml(test_file):
+    result = image_to_alto_xml(test_file)
+    assert isinstance(result, bytes)
+    result = result.decode('utf-8') if IS_PYTHON_2 else str(result, 'utf-8')
+    result = str(result).strip()
+    assert result.startswith('<?xml')
+    assert result.endswith('</alto>')
+
+
+@pytest.mark.skipif(
+    TESSERACT_VERSION[:2] >= (4, 1),
+    reason='requires tesseract < 4.1',
+)
+def test_image_to_alto_xml_support(test_file):
+    with pytest.raises(ALTONotSupported):
+        image_to_alto_xml(test_file)
+
+
+@pytest.mark.skipif(
+    TESSERACT_VERSION[:2] >= (3, 5),
+    reason='requires tesseract < 3.05',
 )
-def test_image_to_data__pandas_support(test_file):
+def test_image_to_data__pandas_support(test_file_small):
     with pytest.raises(TSVNotSupported):
-        image_to_data(test_file, output_type=Output.DATAFRAME)
+        image_to_data(test_file_small, output_type=Output.DATAFRAME)
 
 
 @pytest.mark.skipif(
-    TESSERACT_VERSION[:2] < (3, 5), reason='requires tesseract >= 3.05',
+    TESSERACT_VERSION[:2] < (3, 5),
+    reason='requires tesseract >= 3.05',
 )
 @pytest.mark.skipif(pandas_installed is False, reason='requires pandas')
-def test_image_to_data__pandas_output(test_file):
+def test_image_to_data__pandas_output(test_file_small):
     """Test and compare the type and meta information of the result."""
-    result = image_to_data(test_file, output_type=Output.DATAFRAME)
+    result = image_to_data(test_file_small, output_type=Output.DATAFRAME)
     assert isinstance(result, pandas.DataFrame)
     expected_columns = [
         'level',
@@ -236,41 +281,44 @@
 
 
 @pytest.mark.skipif(
-    TESSERACT_VERSION[:2] < (3, 5), reason='requires tesseract >= 3.05',
+    TESSERACT_VERSION[:2] < (3, 5),
+    reason='requires tesseract >= 3.05',
 )
 @pytest.mark.parametrize(
     'output',
     [Output.BYTES, Output.DICT, Output.STRING],
     ids=['bytes', 'dict', 'string'],
 )
-def test_image_to_data_common_output(test_file, output):
+def test_image_to_data_common_output(test_file_small, output):
     """Test and compare the type of the result."""
-    result = image_to_data(test_file, output_type=output)
-    expected_keys = [
-        'level',
-        'page_num',
-        'block_num',
-        'par_num',
-        'line_num',
-        'word_num',
-        'left',
-        'top',
-        'width',
-        'height',
-        'conf',
-        'text',
-    ]
+    result = image_to_data(test_file_small, output_type=output)
+    expected_dict_result = {
+        'level': [1, 2, 3, 4, 5],
+        'page_num': [1, 1, 1, 1, 1],
+        'block_num': [0, 1, 1, 1, 1],
+        'par_num': [0, 0, 1, 1, 1],
+        'line_num': [0, 0, 0, 1, 1],
+        'word_num': [0, 0, 0, 0, 1],
+        'left': [0, 11, 11, 11, 11],
+        'top': [0, 11, 11, 11, 11],
+        'width': [79, 60, 60, 60, 60],
+        'height': [47, 24, 24, 24, 24],
+        # 'conf': ['-1', '-1', '-1', '-1', 96],
+        'text': ['', '', '', '', 'This'],
+    }
 
     if output is Output.BYTES:
         assert isinstance(result, bytes)
 
     elif output is Output.DICT:
-        assert isinstance(result, dict)
-        assert bool(set(result.keys()).intersection(expected_keys))
+        confidence_values = result.pop('conf', None)
+        assert confidence_values is not None
+        assert 0 <= confidence_values[-1] <= 100
+        assert result == expected_dict_result
 
     elif output is Output.STRING:
         assert isinstance(result, string_type)
-        for key in expected_keys:
+        for key in expected_dict_result.keys():
             assert key in result
 
 
@@ -289,34 +337,49 @@
     """Test wrong or missing tesseract command."""
     import pytesseract
 
-    monkeypatch.setattr(
-        'pytesseract.pytesseract.tesseract_cmd', test_path,
-    )
+    monkeypatch.setattr('pytesseract.pytesseract.tesseract_cmd', test_path)
+
     with pytest.raises(TesseractNotFoundError):
-        pytesseract.pytesseract.image_to_string(test_file)
+        pytesseract.get_languages.__wrapped__()
+
+    with pytest.raises(TesseractNotFoundError):
+        pytesseract.get_tesseract_version.__wrapped__()
+
+    with pytest.raises(TesseractNotFoundError):
+        pytesseract.image_to_string(test_file)
 
 
 def test_main_not_found_cases(
-    capsys, monkeypatch, test_file, test_invalid_file,
+    capsys,
+    monkeypatch,
+    test_file,
+    test_invalid_file,
 ):
     """Test wrong or missing tesseract command in main."""
     import pytesseract
 
     monkeypatch.setattr('sys.argv', ['', test_invalid_file])
-    with pytest.raises(SystemExit):
-        pytesseract.pytesseract.main()
-    assert capsys.readouterr().err.startswith('ERROR: Could not open file')
+    assert pytesseract.pytesseract.main() == 1
+    captured_stderr = capsys.readouterr().err
+    assert (
+        'No such file or directory' in captured_stderr
+        and test_invalid_file in captured_stderr
+    )
 
     monkeypatch.setattr(
-        'pytesseract.pytesseract.tesseract_cmd', 'wrong_tesseract',
+        'pytesseract.pytesseract.tesseract_cmd',
+        'wrong_tesseract',
     )
     monkeypatch.setattr('sys.argv', ['', test_file])
-    with pytest.raises(SystemExit):
-        pytesseract.pytesseract.main()
-    assert capsys.readouterr().err.endswith(
-        "is not installed or it's not in your PATH\n",
+    assert pytesseract.pytesseract.main() == 1
+    assert (
+        "is not installed or it's not in your PATH" in capsys.readouterr().err
     )
 
+    monkeypatch.setattr('sys.argv', [''])
+    assert pytesseract.pytesseract.main() == 2
+    assert 'Usage: pytesseract [-l lang] input_file' in capsys.readouterr().err
+
 
 @pytest.mark.parametrize(
     'test_path',
@@ -328,9 +391,40 @@
     import pytesseract
 
     monkeypatch.setattr(
-        'pytesseract.pytesseract.tesseract_cmd', test_path,
+        'pytesseract.pytesseract.tesseract_cmd',
+        test_path,
     )
+
     with pytest.raises(
         TesseractNotFoundError if IS_PYTHON_2 and test_path else OSError,
     ):
-        pytesseract.pytesseract.image_to_string(test_file)
+        pytesseract.image_to_string(test_file)
+
+
+DEFAULT_LANGUAGES = ('fra', 'eng', 'osd')
+
+
+@pytest.mark.parametrize(
+    'test_config,expected',
+    [
+        ('', DEFAULT_LANGUAGES),
+        ('--tessdata-dir {}/'.format(TESSDATA_DIR), ('dzo_test', 'eng')),
+        ('--tessdata-dir /dev/null', ()),
+        ('--tessdata-dir invalid_path/', ()),
+        ('--tessdata-dir=invalid_config/', DEFAULT_LANGUAGES),
+    ],
+    ids=[
+        'default_empty_config',
+        'custom_tessdata_dir',
+        'incorrect_tessdata_dir',
+        'invalid_tessdata_dir',
+        'invalid_config',
+    ],
+)
+def test_get_languages(test_config, expected):
+    result = get_languages.__wrapped__(test_config)
+    if not result:
+        assert result == []
+
+    for lang in expected:
+        assert lang in result
Binary files old/pytesseract-0.3.4/tests/tessdata/dzo_test.traineddata and 
new/pytesseract-0.3.7/tests/tessdata/dzo_test.traineddata differ
Binary files old/pytesseract-0.3.4/tests/tessdata/eng.traineddata and 
new/pytesseract-0.3.7/tests/tessdata/eng.traineddata differ
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/pytesseract-0.3.4/tox.ini 
new/pytesseract-0.3.7/tox.ini
--- old/pytesseract-0.3.4/tox.ini       2020-04-18 16:27:16.000000000 +0200
+++ new/pytesseract-0.3.7/tox.ini       2020-11-20 08:37:05.000000000 +0100
@@ -1,14 +1,13 @@
 [tox]
 envlist =
-    py27
-    py35
     py36
     py37
-    py38-pre-commit
+    py38
+    py39
 skip_missing_interpreters = true
 
 [pytest]
-addopts = --strict-markers --verbose --cache-clear -p no:doctest
+addopts = --strict-markers --verbose --cache-clear --color=yes -p no:doctest
 markers =
     pytesseract: Requires commandline pytesseract installed.
     lang_fra: Requires French (fra) pytesseract language.
@@ -19,11 +18,10 @@
 commands =
     python -bb -m pytest
 
-[testenv:py38-pre-commit]
+[testenv:py39]
 deps =
     numpy
     pandas
     -r{toxinidir}/requirements-dev.txt
 commands =
-    pre-commit run --all-files --show-diff-on-failure
-    python -bb -m pytest
+    python -bb -m pytest {posargs:tests}

commit python-pytesseract for openSUSE:Factory

Reply via email to