Script 'mail_helper' called by obssrc
Hello community,
here is the log from the commit of package python-tesserocr for
openSUSE:Factory checked in at 2021-06-24 18:22:30
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-tesserocr (Old)
and /work/SRC/openSUSE:Factory/.python-tesserocr.new.2625 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-tesserocr"
Thu Jun 24 18:22:30 2021 rev:11 rq:901578 version:2.5.2
Changes:
--------
--- /work/SRC/openSUSE:Factory/python-tesserocr/python-tesserocr.changes 2020-03-26 23:35:05.210814312 +0100
+++ /work/SRC/openSUSE:Factory/.python-tesserocr.new.2625/python-tesserocr.changes 2021-06-24 18:22:47.696930910 +0200
@@ -1,0 +2,9 @@
+Wed Jun 23 17:43:23 UTC 2021 - Mia Herkt <[email protected]>
+
+- Update to 2.5.2
+ * Support new Tesseract 5 API (gh#sirfz/tesserocr#242)
+ * GetBestLSTMSymbolChoices crash fix (gh#sirfz/tesserocr#241)
+ * Fallback to BMP instead of PNG
+ * Create pix from a BMP image bytes (gh#sirfz/tesserocr#156)
+
+-------------------------------------------------------------------
Old:
----
tesserocr-2.5.1.tar.gz
New:
----
tesserocr-2.5.2.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ python-tesserocr.spec ++++++
--- /var/tmp/diff_new_pack.LSisZN/_old 2021-06-24 18:22:48.228931500 +0200
+++ /var/tmp/diff_new_pack.LSisZN/_new 2021-06-24 18:22:48.232931505 +0200
@@ -1,7 +1,7 @@
#
# spec file for package python-tesserocr
#
-# Copyright (c) 2020 SUSE LLC
+# Copyright (c) 2021 SUSE LLC
#
# All modifications and additions to the file contributed by third parties
# remain the property of their copyright owners, unless otherwise agreed
@@ -18,7 +18,7 @@
%{?!python_module:%define python_module() python-%{**} python3-%{**}}
Name: python-tesserocr
-Version: 2.5.1
+Version: 2.5.2
Release: 0
Summary: A Python wrapper around tesseract-ocr
License: MIT
++++++ tesserocr-2.5.1.tar.gz -> tesserocr-2.5.2.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.5.1/MANIFEST.in
new/tesserocr-2.5.2/MANIFEST.in
--- old/tesserocr-2.5.1/MANIFEST.in 2019-11-08 23:49:38.000000000 +0100
+++ new/tesserocr-2.5.2/MANIFEST.in 2021-06-19 22:02:07.000000000 +0200
@@ -1,5 +1,5 @@
include README.rst
include LICENSE
include *.pyx *.pxd
-include tests/*.py tests/*.tif
+include tests/*.py tests/*.png
exclude *.cpp *.so
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.5.1/PKG-INFO new/tesserocr-2.5.2/PKG-INFO
--- old/tesserocr-2.5.1/PKG-INFO 2020-03-17 18:41:39.000000000 +0100
+++ new/tesserocr-2.5.2/PKG-INFO 2021-06-19 23:08:30.000000000 +0200
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: tesserocr
-Version: 2.5.1
+Version: 2.5.2
Summary: A simple, Pillow-friendly, Python wrapper around tesseract-ocr API
using Cython
Home-page: https://github.com/sirfz/tesserocr
Author: Fayez Zouheiry
@@ -108,6 +108,29 @@
> pip install <package_name>.whl
+ Build from source
+ `````````````````
+
+ If you need Windows tessocr package and your Python version is not
supported by above mentioned project,
+ you can try to follow `step by step instructions for Windows 64bit` in
`Windows.build.md`_.
+
+ .. _Windows.build.md: Windows.build.md
+
+ tessdata
+ ========
+
+ You may need to point to the tessdata path if it cannot be detected
automatically. This can be done by setting the ``TESSDATA_PREFIX`` environment
variable or by passing the path to ``PyTessBaseAPI`` (e.g.:
``PyTessBaseAPI(path='/usr/share/tessdata')``). The path should contain
``.traineddata`` files which can be found at
https://github.com/tesseract-ocr/tessdata.
+
+ Make sure you have the correct version of traineddata for your
``tesseract --version``.
+
+ You can list the current supported languages on your system using the
``get_languages`` function:
+
+ .. code:: python
+
+ from tesserocr import get_languages
+
+ print(get_languages('/usr/share/tessdata')) # or any other path that applies to your system
+
Usage
=====
@@ -268,6 +291,8 @@
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Programming Language :: Cython
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.5.1/README.rst
new/tesserocr-2.5.2/README.rst
--- old/tesserocr-2.5.1/README.rst 2019-11-09 00:11:39.000000000 +0100
+++ new/tesserocr-2.5.2/README.rst 2021-06-19 23:05:59.000000000 +0200
@@ -100,6 +100,29 @@
> pip install <package_name>.whl
+Build from source
+`````````````````
+
+If you need Windows tessocr package and your Python version is not supported
by above mentioned project,
+you can try to follow `step by step instructions for Windows 64bit` in
`Windows.build.md`_.
+
+.. _Windows.build.md: Windows.build.md
+
+tessdata
+========
+
+You may need to point to the tessdata path if it cannot be detected
automatically. This can be done by setting the ``TESSDATA_PREFIX`` environment
variable or by passing the path to ``PyTessBaseAPI`` (e.g.:
``PyTessBaseAPI(path='/usr/share/tessdata')``). The path should contain
``.traineddata`` files which can be found at
https://github.com/tesseract-ocr/tessdata.
+
+Make sure you have the correct version of traineddata for your ``tesseract
--version``.
+
+You can list the current supported languages on your system using the
``get_languages`` function:
+
+.. code:: python
+
+ from tesserocr import get_languages
+
+ print(get_languages('/usr/share/tessdata')) # or any other path that applies to your system
+
Usage
=====
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.5.1/setup.py new/tesserocr-2.5.2/setup.py
--- old/tesserocr-2.5.1/setup.py 2020-03-17 18:39:50.000000000 +0100
+++ new/tesserocr-2.5.2/setup.py 2021-06-19 22:02:07.000000000 +0200
@@ -1,17 +1,20 @@
+import codecs
+import errno
+import glob
+import itertools
import logging
import os
-import sys
-import codecs
import re
import subprocess
-import errno
-from os.path import dirname, abspath
-from os.path import split as psplit, join as pjoin
+import sys
+from os.path import abspath, dirname
+from os.path import join as pjoin
+from os.path import split as psplit
+
from setuptools import setup
from setuptools.command.build_ext import build_ext
from setuptools.extension import Extension
-
_LOGGER = logging.getLogger()
if os.environ.get('DEBUG'):
_LOGGER.setLevel(logging.DEBUG)
@@ -25,6 +28,11 @@
# find_version from pip https://github.com/pypa/pip/blob/1.5.6/setup.py#L33
here = abspath(dirname(__file__))
+EXTRA_COMPILE_ARGS = {
+ 'msvc': ['/std:c11', '-DUSE_STD_NAMESPACE'],
+ 'gcc': ['-std=c++11', '-DUSE_STD_NAMESPACE'],
+}
+
def read(*parts):
return codecs.open(pjoin(here, *parts), 'r').read()
@@ -32,11 +40,10 @@
def find_version(*file_paths):
version_file = read(*file_paths)
- version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
- version_file, re.M)
+ version_match = re.search('^__version__ = [\'"]([^\'"]*)[\'"]', version_file, re.M)
if version_match:
return version_match.group(1)
- raise RuntimeError("Unable to find version string.")
+ raise RuntimeError('Unable to find version string.')
if sys.version_info >= (3, 0):
@@ -47,97 +54,186 @@
return s
+def major_version(version):
+ versions = version.split('.')
+ major = int(versions[0])
+ _LOGGER.info('Tesseract major version %s', major)
+ return major
+
+
def version_to_int(version):
subversion = None
subtrahend = 0
- # Subtracts a certain amount from the version number to differentiate
between
- # alpha, beta and release versions.
- if "alpha" in version:
- version_split = version.split("alpha")
+ # Subtracts a certain amount from the version number to differentiate
+ # between alpha, beta and release versions.
+ if 'alpha' in version:
+ version_split = version.split('alpha')
subversion = version_split[1]
subtrahend = 2
- elif "beta" in version:
- version_split = version.split("beta")
+ elif 'beta' in version:
+ version_split = version.split('beta')
subversion = version_split[1]
subtrahend = 1
+
version = re.search(r'((?:\d+\.)+\d+)', version).group()
- # Split the groups on ".", take only the first one, and print each group
with leading 0 if needed
- # To be safe, also handle cases where an extra group is added to the
version string, or if one or two groups
- # are dropped.
+ # Split the groups on ".", take only the first one, and print each
+ # group with leading 0 if needed. To be safe, also handle cases where
+ # an extra group is added to the version string, or if one or two
+ # groups are dropped.
version_groups = (version.split('.') + [0, 0])[:3]
- version_str = "{:02}{:02}{:02}".format(*map(int, version_groups))
- version_str = str((int(version_str, 10)-subtrahend))
+ version_str = '{:02}{:02}{:02}'.format(*map(int, version_groups))
+ version_str = str((int(version_str, 10) - subtrahend))
# Adds a 2 digit subversion number for the subversionrelease.
- subversion_str="00"
- if subversion is not None and subversion is not "":
+ subversion_str = '00'
+ if subversion is not None and subversion != '':
subversion = re.search(r'(?:\d+)', subversion).group()
subversion_groups = (subversion.split('-') + [0, 0])[:1]
- subversion_str = "{:02}".format(*map(int, subversion_groups))
- version_str+=subversion_str
+ subversion_str = '{:02}'.format(*map(int, subversion_groups))
+
+ version_str += subversion_str
return int(version_str, 16)
def package_config():
"""Use pkg-config to get library build parameters and tesseract version."""
- p = subprocess.Popen(['pkg-config', '--exists',
'--atleast-version={}'.format(_TESSERACT_MIN_VERSION),
- '--print-errors', 'tesseract'],
- stderr=subprocess.PIPE)
+ p = subprocess.Popen(
+ [
+ 'pkg-config',
+ '--exists',
+ '--atleast-version={}'.format(_TESSERACT_MIN_VERSION),
+ '--print-errors',
+ 'tesseract',
+ ],
+ stderr=subprocess.PIPE,
+ )
_, error = p.communicate()
if p.returncode != 0:
+ if isinstance(error, bytes):
+ error = error.decode()
+
raise Exception(error)
- p = subprocess.Popen(['pkg-config', '--libs', '--cflags', 'tesseract'],
stdout=subprocess.PIPE)
+
+ p = subprocess.Popen(
+ ['pkg-config', '--libs', '--cflags', 'tesseract'],
stdout=subprocess.PIPE
+ )
output, _ = p.communicate()
flags = _read_string(output).strip().split()
- p = subprocess.Popen(['pkg-config', '--libs', '--cflags', 'lept'],
stdout=subprocess.PIPE)
+ p = subprocess.Popen(
+ ['pkg-config', '--libs', '--cflags', 'lept'], stdout=subprocess.PIPE
+ )
output, _ = p.communicate()
flags2 = _read_string(output).strip().split()
- options = {'-L': 'library_dirs',
- '-I': 'include_dirs',
- '-l': 'libraries'}
- config = {'library_dirs': [],
- 'include_dirs': [],
- 'libraries': []}
- import itertools
+ options = {'-L': 'library_dirs', '-I': 'include_dirs', '-l': 'libraries'}
+ config = {'library_dirs': [], 'include_dirs': [], 'libraries': []}
+
for f in itertools.chain(flags, flags2):
try:
opt = options[f[:2]]
except KeyError:
continue
val = f[2:]
- if opt == 'include_dirs' and psplit(val)[1].strip(os.sep) in
('leptonica', 'tesseract'):
+ if opt == 'include_dirs' and psplit(val)[1].strip(os.sep) in (
+ 'leptonica',
+ 'tesseract',
+ ):
val = dirname(val)
config[opt] += [val]
- p = subprocess.Popen(['pkg-config', '--modversion', 'tesseract'],
stdout=subprocess.PIPE)
+
+ p = subprocess.Popen(
+ ['pkg-config', '--modversion', 'tesseract'], stdout=subprocess.PIPE
+ )
version, _ = p.communicate()
version = _read_string(version).strip()
- _LOGGER.info("Supporting tesseract v{}".format(version))
- config['cython_compile_time_env'] = {'TESSERACT_VERSION':
version_to_int(version)}
- _LOGGER.info("Configs from pkg-config: {}".format(config))
+ _LOGGER.info('Supporting tesseract v%s', version)
+ config['compile_time_env'] = {
+ 'TESSERACT_MAJOR_VERSION': major_version(version),
+ 'TESSERACT_VERSION': version_to_int(version)
+ }
+ _LOGGER.info('Configs from pkg-config: %s', config)
return config
+def find_library(pattern, path_list, version=''):
+ """Help routine to find library."""
+ result = []
+ for path in path_list:
+ filepattern = os.path.join(path, pattern)
+ result += glob.glob(filepattern)
+ # ignore debug library
+ result = [i for i in result if not i.endswith('d.lib')]
+ if version:
+ result = [i for i in result if version in i]
+ return result
+
+
def get_tesseract_version():
"""Try to extract version from tesseract otherwise default min version."""
config = {'libraries': ['tesseract', 'lept']}
try:
- p = subprocess.Popen(['tesseract', '-v'], stderr=subprocess.PIPE,
stdout=subprocess.PIPE)
+ p = subprocess.Popen(
+ ['tesseract', '-v'], stderr=subprocess.PIPE, stdout=subprocess.PIPE
+ )
stdout_version, version = p.communicate()
version = _read_string(version).strip()
if version == '':
version = _read_string(stdout_version).strip()
+
version_match = re.search(r'^tesseract ((?:\d+\.)+\d+).*', version,
re.M)
if version_match:
version = version_match.group(1)
else:
- _LOGGER.warn('Failed to extract tesseract version number from:
{}'.format(version))
+ _LOGGER.warning(
+ 'Failed to extract tesseract version number from: %s', version
+ )
version = _TESSERACT_MIN_VERSION
except OSError as e:
- _LOGGER.warn('Failed to extract tesseract version from executable:
{}'.format(e))
+ _LOGGER.warning('Failed to extract tesseract version from executable: %s', e)
version = _TESSERACT_MIN_VERSION
- _LOGGER.info("Supporting tesseract v{}".format(version))
- version = version_to_int(version)
- config['cython_compile_time_env'] = {'TESSERACT_VERSION': version}
- _LOGGER.info("Building with configs: {}".format(config))
+
+ _LOGGER.info('Supporting tesseract v%s', version)
+ config['compile_time_env'] = {
+ 'TESSERACT_MAJOR_VERSION': major_version(version),
+ 'TESSERACT_VERSION': version_to_int(version)
+ }
+ if sys.platform == 'win32':
+ libpaths = os.getenv('LIBPATH', None)
+ if libpaths:
+ libpaths = list(filter(None, libpaths.split(';')))
+ else:
+ libpaths = []
+
+ if version:
+ lib_version = ''.join(version.split('.')[:2])
+ else:
+ lib_version = None
+
+ tess_lib = find_library('tesseract*.lib', libpaths, lib_version)
+ if len(tess_lib) >= 1:
+ base = os.path.basename(sorted(tess_lib, reverse=True)[0])
+ tess_lib = os.path.splitext(base)[0]
+ else:
+ error = 'Tesseract library not found in LIBPATH: {}'.format(libpaths)
+ raise RuntimeError(error)
+
+ lept_lib = find_library('lept*.lib', libpaths)
+ if len(lept_lib) >= 1:
+ base = os.path.basename(sorted(lept_lib, reverse=True)[0])
+ lept_lib = os.path.splitext(base)[0]
+ else:
+ error = 'Leptonica library not found in LIBPATH: {}'.format(libpaths)
+ raise RuntimeError(error)
+
+ includepaths = os.getenv('INCLUDE', None)
+ if includepaths:
+ includepaths = list(filter(None, includepaths.split(';')))
+ else:
+ includepaths = []
+
+ config['libraries'] = [tess_lib, lept_lib]
+ config['library_dirs'] = libpaths
+ config['include_dirs'] = includepaths
+
+ _LOGGER.info('Building with configs: %s', config)
return config
@@ -148,64 +244,87 @@
except Exception as e:
if isinstance(e, OSError):
if e.errno != errno.ENOENT:
- _LOGGER.warn('Failed to run pkg-config: {}'.format(e))
+ _LOGGER.warning('Failed to run pkg-config: %s', e)
else:
- _LOGGER.warn('pkg-config failed to find tesseract/lept libraries:
{}'.format(e))
+ _LOGGER.warning(
+ 'pkg-config failed to find tesseract/leptonica libraries: %s',
e
+ )
build_args = get_tesseract_version()
- if build_args['cython_compile_time_env']['TESSERACT_VERSION'] >= 0x3050200:
- _LOGGER.debug('tesseract >= 03.05.02 requires c++11 compiler support')
- build_args['extra_compile_args'] = ['-std=c++11',
'-DUSE_STD_NAMESPACE']
-
- _LOGGER.debug('build parameters: {}'.format(build_args))
+ _LOGGER.debug('build parameters: %s', build_args)
return build_args
def make_extension():
global _CYTHON_COMPILE_TIME_ENV
build_args = get_build_args()
- _CYTHON_COMPILE_TIME_ENV = build_args.pop('cython_compile_time_env')
- return Extension("tesserocr", sources=["tesserocr.pyx"], language="c++",
**build_args)
+ _CYTHON_COMPILE_TIME_ENV = build_args.pop('compile_time_env')
+ return Extension(
+ 'tesserocr', sources=['tesserocr.pyx'], language='c++', **build_args
+ )
class my_build_ext(build_ext, object):
+ def build_extensions(self):
+ compiler = self.compiler.compiler_type
+ _LOGGER.info('Detected compiler: %s', compiler)
+ extra_args = EXTRA_COMPILE_ARGS.get(compiler,
EXTRA_COMPILE_ARGS['gcc'])
+ if isinstance(_CYTHON_COMPILE_TIME_ENV, dict):
+ version = _CYTHON_COMPILE_TIME_ENV.get('TESSERACT_VERSION', 0)
+ else:
+ version = 0
+
+ for extension in self.extensions:
+ if version >= 0x3050200:
+ _LOGGER.debug('tesseract >= 03.05.02 requires c++11 compiler support')
+ extension.extra_compile_args = extra_args
+
+ build_ext.build_extensions(self)
+
def finalize_options(self):
from Cython.Build.Dependencies import cythonize
+
self.distribution.ext_modules[:] = cythonize(
- self.distribution.ext_modules,
compile_time_env=_CYTHON_COMPILE_TIME_ENV)
+ self.distribution.ext_modules,
compile_time_env=_CYTHON_COMPILE_TIME_ENV
+ )
super(my_build_ext, self).finalize_options()
-setup(name='tesserocr',
- version=find_version('tesserocr.pyx'),
- description='A simple, Pillow-friendly, Python wrapper around
tesseract-ocr API using Cython',
- long_description=read('README.rst'),
- long_description_content_type='text/x-rst',
- url='https://github.com/sirfz/tesserocr',
- author='Fayez Zouheiry',
- author_email='[email protected]',
- license='MIT',
- classifiers=[
- 'Development Status :: 5 - Production/Stable',
- 'Intended Audience :: Developers',
- 'Topic :: Multimedia :: Graphics :: Capture :: Scanners',
- 'Topic :: Multimedia :: Graphics :: Graphics Conversion',
- 'Topic :: Scientific/Engineering :: Image Recognition',
- 'License :: OSI Approved :: MIT License',
- 'Operating System :: POSIX',
- 'Programming Language :: Python :: 2.7',
- 'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.4',
- 'Programming Language :: Python :: 3.5',
- 'Programming Language :: Python :: 3.6',
- 'Programming Language :: Python :: 3.7',
- 'Programming Language :: Python :: Implementation :: CPython',
- 'Programming Language :: Python :: Implementation :: PyPy',
- 'Programming Language :: Cython'
- ],
- keywords='Tesseract,tesseract-ocr,OCR,optical character
recognition,PIL,Pillow,Cython',
- cmdclass={'build_ext': my_build_ext},
- ext_modules=[make_extension()],
- test_suite='tests',
- setup_requires=['Cython>=0.23'],
- )
+setup(
+ name='tesserocr',
+ version=find_version('tesserocr.pyx'),
+ description='A simple, Pillow-friendly, Python wrapper around '
+ 'tesseract-ocr API using Cython',
+ long_description=read('README.rst'),
+ long_description_content_type='text/x-rst',
+ url='https://github.com/sirfz/tesserocr',
+ author='Fayez Zouheiry',
+ author_email='[email protected]',
+ license='MIT',
+ classifiers=[
+ 'Development Status :: 5 - Production/Stable',
+ 'Intended Audience :: Developers',
+ 'Topic :: Multimedia :: Graphics :: Capture :: Scanners',
+ 'Topic :: Multimedia :: Graphics :: Graphics Conversion',
+ 'Topic :: Scientific/Engineering :: Image Recognition',
+ 'License :: OSI Approved :: MIT License',
+ 'Operating System :: POSIX',
+ 'Programming Language :: Python :: 2.7',
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.4',
+ 'Programming Language :: Python :: 3.5',
+ 'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
+ 'Programming Language :: Python :: Implementation :: CPython',
+ 'Programming Language :: Python :: Implementation :: PyPy',
+ 'Programming Language :: Cython',
+ ],
+ keywords='Tesseract,tesseract-ocr,OCR,optical character recognition,'
+ 'PIL,Pillow,Cython',
+ cmdclass={'build_ext': my_build_ext},
+ ext_modules=[make_extension()],
+ test_suite='tests',
+ setup_requires=['Cython>=0.23'],
+)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.5.1/tesseract.pxd
new/tesserocr-2.5.2/tesseract.pxd
--- old/tesserocr-2.5.1/tesseract.pxd 2019-11-08 23:49:38.000000000 +0100
+++ new/tesserocr-2.5.2/tesseract.pxd 2021-06-19 22:09:33.000000000 +0200
@@ -33,6 +33,7 @@
char *getLeptonicaVersion()
Pix *pixRead(cchar_t *)
Pix *pixReadMem(cuchar_t *, size_t)
+ Pix *pixReadMemBmp(cuchar_t *, size_t)
int pixWriteMemJpeg(unsigned char **, size_t *, Pix *, int, int)
int pixWriteMem(unsigned char **, size_t *, Pix *, int)
void pixDestroy(Pix **)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.5.1/tesseract5.pxd
new/tesserocr-2.5.2/tesseract5.pxd
--- old/tesserocr-2.5.1/tesseract5.pxd 1970-01-01 01:00:00.000000000 +0100
+++ new/tesserocr-2.5.2/tesseract5.pxd 2021-06-19 22:53:48.000000000 +0200
@@ -0,0 +1,314 @@
+from libcpp cimport bool
+from libcpp.pair cimport pair
+from libcpp.string cimport string
+from libcpp.vector cimport vector
+ctypedef const char cchar_t
+ctypedef const char * cchar_tp
+ctypedef const unsigned char cuchar_t
+
+cdef extern from "leptonica/allheaders.h" nogil:
+ struct Pix:
+ int informat
+
+ struct Box:
+ int x
+ int y
+ int w
+ int h
+
+ struct Boxa:
+ int n # number of box in ptr array
+ Box **box # box ptr array
+
+ struct Pixa:
+ int n # number of Pix in ptr array
+ Pix **pix # the array of ptrs to pix
+ Boxa *boxa # array of boxes
+
+ struct Pta:
+ int n # actual number of pts
+ float *x
+ float *y # arrays of floats
+
+ char *getImagelibVersions()
+ char *getLeptonicaVersion()
+ Pix *pixRead(cchar_t *)
+ Pix *pixReadMem(cuchar_t *, size_t)
+ Pix *pixReadMemBmp(cuchar_t *, size_t)
+ int pixWriteMemJpeg(unsigned char **, size_t *, Pix *, int, int)
+ int pixWriteMem(unsigned char **, size_t *, Pix *, int)
+ void pixDestroy(Pix **)
+ void ptaDestroy(Pta **)
+ int setMsgSeverity(int)
+ void pixaDestroy(Pixa **)
+ void boxaDestroy(Boxa **)
+
+ cdef enum:
+ L_SEVERITY_EXTERNAL = 0 # Get the severity from the environment
+ L_SEVERITY_ALL = 1 # Lowest severity: print all messages
+ L_SEVERITY_DEBUG = 2 # Print debugging and higher messages
+ L_SEVERITY_INFO = 3 # Print informational and higher messages
+ L_SEVERITY_WARNING = 4 # Print warning and higher messages
+ L_SEVERITY_ERROR = 5 # Print error and higher messages
+ L_SEVERITY_NONE = 6 # Highest severity: print no messages
+
+cdef extern from "tesseract/publictypes.h" namespace "tesseract" nogil:
+ cdef enum PolyBlockType:
+ PT_UNKNOWN # Type is not yet known. Keep as the first element.
+ PT_FLOWING_TEXT # Text that lives inside a column.
+ PT_HEADING_TEXT # Text that spans more than one column.
+ PT_PULLOUT_TEXT # Text that is in a cross-column pull-out region.
+ PT_EQUATION # Partition belonging to an equation region.
+ PT_INLINE_EQUATION # Partition has inline equation.
+ PT_TABLE # Partition belonging to a table region.
+ PT_VERTICAL_TEXT # Text-line runs vertically.
+ PT_CAPTION_TEXT # Text that belongs to an image.
+ PT_FLOWING_IMAGE # Image that lives inside a column.
+ PT_HEADING_IMAGE # Image that spans more than one column.
+ PT_PULLOUT_IMAGE # Image that is in a cross-column pull-out region.
+ PT_HORZ_LINE # Horizontal Line.
+ PT_VERT_LINE # Vertical Line.
+ PT_NOISE # Lies outside of any column.
+ PT_COUNT
+
+cdef extern from "tesseract/publictypes.h" namespace "tesseract" nogil:
+
+ cdef enum TessOrientation "tesseract::Orientation":
+ ORIENTATION_PAGE_UP
+ ORIENTATION_PAGE_RIGHT
+ ORIENTATION_PAGE_DOWN
+ ORIENTATION_PAGE_LEFT
+
+ cdef enum TessWritingDirection "tesseract::WritingDirection":
+ WRITING_DIRECTION_LEFT_TO_RIGHT
+ WRITING_DIRECTION_RIGHT_TO_LEFT
+ WRITING_DIRECTION_TOP_TO_BOTTOM
+
+ cdef enum TessTextlineOrder "tesseract::TextlineOrder":
+ TEXTLINE_ORDER_LEFT_TO_RIGHT
+ TEXTLINE_ORDER_RIGHT_TO_LEFT
+ TEXTLINE_ORDER_TOP_TO_BOTTOM
+
+ cdef enum TessParagraphJustification "tesseract::ParagraphJustification":
+ JUSTIFICATION_UNKNOWN
+ JUSTIFICATION_LEFT
+ JUSTIFICATION_CENTER
+ JUSTIFICATION_RIGHT
+
+cdef extern from "tesseract/unichar.h" namespace "tesseract" nogil:
+ cdef enum StrongScriptDirection:
+ DIR_NEUTRAL # Text contains only neutral characters.
+ DIR_LEFT_TO_RIGHT # Text contains no Right-to-Left characters.
+ DIR_RIGHT_TO_LEFT # Text contains no Left-to-Right characters.
+ DIR_MIX # Text contains a mixture of left-to-right
+ # and right-to-left characters.
+
+cdef extern from "tesseract/ocrclass.h" namespace "tesseract" nogil:
+ ctypedef bool (*CANCEL_FUNC)(void *, int)
+ cdef cppclass ETEXT_DESC:
+ ETEXT_DESC() except +
+ CANCEL_FUNC cancel # returns true to cancel
+ void *cancel_this # this or other data for cancel
+ void set_deadline_msecs(int)
+
+cdef extern from "tesseract/pageiterator.h" namespace "tesseract" nogil:
+ cdef cppclass PageIterator:
+ void Begin()
+ void RestartParagraph()
+ bool IsWithinFirstTextlineOfParagraph() const
+ void RestartRow()
+ bool Next(PageIteratorLevel)
+ bool IsAtBeginningOf(PageIteratorLevel) const
+ bool IsAtFinalElement(PageIteratorLevel, PageIteratorLevel) const
+ void SetBoundingBoxComponents(bool, bool)
+ bool BoundingBox(PageIteratorLevel, const int, int *, int *, int *,
int *) const
+ bool BoundingBoxInternal(PageIteratorLevel, int *, int *, int *, int
*) const
+ bool Empty(PageIteratorLevel) const
+ PolyBlockType BlockType() const
+ Pta *BlockPolygon() const
+ Pix *GetBinaryImage(PageIteratorLevel) const
+ Pix *GetImage(PageIteratorLevel, int, Pix *, int *, int *) const
+ bool Baseline(PageIteratorLevel, int *, int *, int *, int *) const
+ void Orientation(TessOrientation *, TessWritingDirection *,
TessTextlineOrder *, float *) const
+ void ParagraphInfo(TessParagraphJustification *, bool *, bool *, int
*) const
+
+cdef extern from "tesseract/ltrresultiterator.h" namespace "tesseract" nogil:
+ cdef cppclass LTRResultIterator(PageIterator):
+ char *GetUTF8Text(PageIteratorLevel) const
+ void SetLineSeparator(cchar_t *)
+ void SetParagraphSeparator(cchar_t *)
+ float Confidence(PageIteratorLevel) const
+ void RowAttributes(float *, float *, float *) const
+ cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool
*, bool *, int *, int *) const
+ cchar_t *WordRecognitionLanguage() const
+ StrongScriptDirection WordDirection() const
+ bool WordIsFromDictionary() const
+ int BlanksBeforeWord() const
+ bool WordIsNumeric() const
+ bool HasBlamerInfo() const
+ cchar_t *GetBlamerDebug() const
+ cchar_t *GetBlamerMisadaptionDebug() const
+ bool HasTruthString() const
+ bool EquivalentToTruth(cchar_t *) const
+ char *WordTruthUTF8Text() const
+ char *WordNormedUTF8Text() const
+ cchar_t *WordLattice(int *) const
+ bool SymbolIsSuperscript() const
+ bool SymbolIsSubscript() const
+ bool SymbolIsDropcap() const
+
+ cdef cppclass ChoiceIterator:
+ ChoiceIterator(const LTRResultIterator &) except +
+ bool Next()
+ cchar_t *GetUTF8Text() const
+ float Confidence() const
+
+cdef extern from "tesseract/resultiterator.h" namespace "tesseract" nogil:
+ cdef cppclass ResultIterator(LTRResultIterator):
+ bool ParagraphIsLtr() const
+ vector[vector[pair[cchar_tp, float]]] *GetBestLSTMSymbolChoices() const
+
+cdef extern from "tesseract/renderer.h" namespace "tesseract" nogil:
+ cdef cppclass TessResultRenderer:
+ void insert(TessResultRenderer *)
+
+ cdef cppclass TessTextRenderer(TessResultRenderer):
+ TessTextRenderer(cchar_t *) except +
+
+ cdef cppclass TessHOcrRenderer(TessResultRenderer):
+ TessHOcrRenderer(cchar_t *, bool) except +
+
+ cdef cppclass TessPDFRenderer(TessResultRenderer):
+ TessPDFRenderer(cchar_t *, cchar_t *, bool) except +
+
+ cdef cppclass TessUnlvRenderer(TessResultRenderer):
+ TessUnlvRenderer(cchar_t *) except +
+
+ cdef cppclass TessBoxTextRenderer(TessResultRenderer):
+ TessBoxTextRenderer(cchar_t *) except +
+
+ cdef cppclass TessOsdRenderer(TessResultRenderer):
+ TessOsdRenderer(cchar_t *) except +
+
+cdef extern from "tesseract/osdetect.h" namespace "tesseract" nogil:
+ struct OSBestResult:
+ int orientation_id
+ int script_id
+ float sconfidence
+ float oconfidence
+
+ ctypedef int (*get_best_script)(int)
+
+ struct OSResults:
+ get_best_script get_best_script
+ OSBestResult best_result
+
+cdef extern from "tesseract/baseapi.h" namespace "tesseract" nogil:
+
+ cdef enum OcrEngineMode:
+ OEM_TESSERACT_ONLY
+ OEM_LSTM_ONLY
+ OEM_TESSERACT_LSTM_COMBINED
+ OEM_DEFAULT
+
+ cdef enum PageSegMode:
+ PSM_OSD_ONLY, # Orientation and script detection only.
+ PSM_AUTO_OSD, # Automatic page segmentation with
orientation and
+ # script detection. (OSD)
+ PSM_AUTO_ONLY, # Automatic page segmentation, but no
OSD, or OCR.
+ PSM_AUTO, # Fully automatic page segmentation, but
no OSD.
+ PSM_SINGLE_COLUMN, # Assume a single column of text of
variable sizes.
+ PSM_SINGLE_BLOCK_VERT_TEXT, # Assume a single uniform block of
vertically
+ # aligned text.
+ PSM_SINGLE_BLOCK, # Assume a single uniform block of text.
(Default.)
+ PSM_SINGLE_LINE, # Treat the image as a single text line.
+ PSM_SINGLE_WORD, # Treat the image as a single word.
+ PSM_CIRCLE_WORD, # Treat the image as a single word in a
circle.
+ PSM_SINGLE_CHAR, # Treat the image as a single character.
+ PSM_SPARSE_TEXT, # Find as much text as possible in no
particular order.
+ PSM_SPARSE_TEXT_OSD, # Sparse text with orientation and script
det.
+ PSM_RAW_LINE, # Treat the image as a single text line,
bypassing
+ # hacks that are Tesseract-specific.
+ PSM_COUNT # Number of enum entries.
+
+ cdef enum PageIteratorLevel:
+ RIL_BLOCK, # of text/image/separator line.
+ RIL_PARA, # within a block.
+ RIL_TEXTLINE, # within a paragraph.
+ RIL_WORD, # within a textline.
+ RIL_SYMBOL # character within a word.
+
+ cdef cppclass TessBaseAPI:
+ TessBaseAPI() except +
+ @staticmethod
+ cchar_t *Version()
+ @staticmethod
+ void ClearPersistentCache()
+ void SetInputName(cchar_t *)
+ cchar_t *GetInputName()
+ void SetInputImage(Pix *)
+ Pix *GetInputImage()
+ int GetSourceYResolution()
+ cchar_t *GetDatapath()
+ void SetOutputName(cchar_t *)
+ bool SetVariable(cchar_t *, cchar_t *)
+ bool SetDebugVariable(cchar_t *, cchar_t *)
+ bool GetIntVariable(cchar_t *, int *) const
+ bool GetBoolVariable(cchar_t *, bool *) const
+ bool GetDoubleVariable(cchar_t *, double *) const
+ cchar_t *GetStringVariable(cchar_t *) const
+ bool GetVariableAsString(cchar_t *, string *)
+ int Init(cchar_t *, cchar_t *, OcrEngineMode mode,
+ char **, int,
+ const vector[string] *,
+ const vector[string] *,
+ bool)
+ int Init(cchar_t *, cchar_t *, OcrEngineMode)
+ int Init(cchar_t *, cchar_t *)
+ cchar_t *GetInitLanguagesAsString() const
+ void GetLoadedLanguagesAsVector(vector[string] *) const
+ void GetAvailableLanguagesAsVector(vector[string] *) const
+ void InitForAnalysePage()
+ void ReadConfigFile(cchar_t *)
+ void SetPageSegMode(PageSegMode)
+ PageSegMode GetPageSegMode() const
+ char *TesseractRect(cuchar_t *, int, int, int, int, int, int)
+ void ClearAdaptiveClassifier()
+ void SetImage(cuchar_t *, int, int, int, int)
+ void SetImage(Pix *)
+ void SetSourceResolution(int)
+ void SetRectangle(int, int, int, int)
+ Pix *GetThresholdedImage()
+ Boxa *GetRegions(Pixa **)
+ Boxa *GetTextlines(const bool, const int, Pixa **, int **, int **)
+ Boxa *GetStrips(Pixa **, int **)
+ Boxa *GetWords(Pixa **)
+ Boxa *GetConnectedComponents(Pixa **)
+ Boxa *GetComponentImages(const PageIteratorLevel,
+ const bool, const bool,
+ const int,
+ Pixa **, int **, int **)
+ int GetThresholdedImageScaleFactor() const
+ PageIterator *AnalyseLayout(bool)
+ int Recognize(ETEXT_DESC *)
+ bool ProcessPages(cchar_t *, cchar_t *, int, TessResultRenderer *)
+ bool ProcessPage(Pix *, int, cchar_t *, cchar_t *, int,
TessResultRenderer *)
+ ResultIterator *GetIterator()
+ char *GetUTF8Text()
+ char *GetHOCRText(int)
+ char *GetTSVText(int)
+ char *GetBoxText(int)
+ char *GetUNLVText()
+ bool DetectOrientationScript(int *, float *, cchar_t **, float *)
+ int MeanTextConf()
+ int *AllWordConfidences()
+ bool AdaptToWordStr(PageSegMode, cchar_t *)
+ void Clear()
+ void End()
+ int IsValidWord(cchar_t *)
+ bool IsValidCharacter(cchar_t *)
+ bool GetTextDirection(int *, float *)
+ bool DetectOS(OSResults *);
+ cchar_t *GetUnichar(int)
+ const OcrEngineMode oem() const
+ void set_min_orientation_margin(double)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.5.1/tesserocr.egg-info/PKG-INFO
new/tesserocr-2.5.2/tesserocr.egg-info/PKG-INFO
--- old/tesserocr-2.5.1/tesserocr.egg-info/PKG-INFO 2020-03-17 18:41:35.000000000 +0100
+++ new/tesserocr-2.5.2/tesserocr.egg-info/PKG-INFO 2021-06-19 23:08:29.000000000 +0200
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: tesserocr
-Version: 2.5.1
+Version: 2.5.2
Summary: A simple, Pillow-friendly, Python wrapper around tesseract-ocr API
using Cython
Home-page: https://github.com/sirfz/tesserocr
Author: Fayez Zouheiry
@@ -108,6 +108,29 @@
> pip install <package_name>.whl
+ Build from source
+ `````````````````
+
+ If you need Windows tessocr package and your Python version is not
supported by above mentioned project,
+ you can try to follow `step by step instructions for Windows 64bit` in
`Windows.build.md`_.
+
+ .. _Windows.build.md: Windows.build.md
+
+ tessdata
+ ========
+
+ You may need to point to the tessdata path if it cannot be detected
automatically. This can be done by setting the ``TESSDATA_PREFIX`` environment
variable or by passing the path to ``PyTessBaseAPI`` (e.g.:
``PyTessBaseAPI(path='/usr/share/tessdata')``). The path should contain
``.traineddata`` files which can be found at
https://github.com/tesseract-ocr/tessdata.
+
+ Make sure you have the correct version of traineddata for your
``tesseract --version``.
+
+ You can list the current supported languages on your system using the
``get_languages`` function:
+
+ .. code:: python
+
+ from tesserocr import get_languages
+
+ print(get_languages('/usr/share/tessdata')) # or any other path
that applies to your system
+
Usage
=====
@@ -268,6 +291,8 @@
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Programming Language :: Cython
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.5.1/tesserocr.egg-info/SOURCES.txt
new/tesserocr-2.5.2/tesserocr.egg-info/SOURCES.txt
--- old/tesserocr-2.5.1/tesserocr.egg-info/SOURCES.txt 2020-03-17
18:41:38.000000000 +0100
+++ new/tesserocr-2.5.2/tesserocr.egg-info/SOURCES.txt 2021-06-19
23:08:29.000000000 +0200
@@ -3,6 +3,7 @@
README.rst
setup.py
tesseract.pxd
+tesseract5.pxd
tesserocr.pyx
tesserocr_experiment.pyx
tesserocr.egg-info/PKG-INFO
@@ -10,5 +11,5 @@
tesserocr.egg-info/dependency_links.txt
tesserocr.egg-info/top_level.txt
tests/__init__.py
-tests/eurotext.tif
+tests/eurotext.png
tests/test_api.py
\ No newline at end of file
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.5.1/tesserocr.pyx
new/tesserocr-2.5.2/tesserocr.pyx
--- old/tesserocr-2.5.1/tesserocr.pyx 2020-03-17 18:40:03.000000000 +0100
+++ new/tesserocr-2.5.2/tesserocr.pyx 2021-06-19 22:09:33.000000000 +0200
@@ -1,5 +1,5 @@
#!python
-#cython: c_string_type=unicode, c_string_encoding=utf-8
+#cython: c_string_type=unicode, c_string_encoding=utf-8, language_level=3
"""Python wrapper around the Tesseract-OCR C++ API
This module provides a wrapper class :class:`PyTessBaseAPI` to call
@@ -18,7 +18,7 @@
['eng', 'osd', 'equ'])
"""
-__version__ = '2.5.1'
+__version__ = '2.5.2'
import os
from io import BytesIO
@@ -29,8 +29,14 @@
# PIL.Image won't be supported
pass
-from tesseract cimport *
+IF TESSERACT_MAJOR_VERSION < 5:
+ from tesseract cimport *
+ELSE:
+ from tesseract5 cimport *
from libc.stdlib cimport malloc, free
+from libcpp.pair cimport pair
+from libcpp.vector cimport vector
+from cython.operator cimport preincrement as inc, dereference as deref
from cpython.version cimport PY_MAJOR_VERSION
@@ -188,7 +194,7 @@
cdef class PT(_Enum):
- """An enum the defines avaialbe Poly Block types.
+ """An enum that defines available Poly Block types.
Attributes:
UNKNOWN: Type is not yet known. Keep as the first element.
@@ -323,7 +329,7 @@
cdef bytes _image_buffer(image):
"""Return raw bytes of a PIL Image"""
with BytesIO() as f:
- image.save(f, image.format or 'PNG')
+ image.save(f, image.format or 'BMP')
return f.getvalue()
@@ -337,8 +343,8 @@
if fmt > 0:
result = pixWriteMem(&buff, &size, pix, fmt)
else:
- # write as JPEG if format is unknown
- result = pixWriteMemJpeg(&buff, &size, pix, 0, 0)
+ # write as IFF_BMP if format is unknown
+ result = pixWriteMem(&buff, &size, pix, 1)
try:
if result == 1:
@@ -544,7 +550,7 @@
See comment on coordinate system above.
Args:
- level (int): Page Iteration Level. See :class:`RIL` for avaialbe
levels.
+ level (int): Page Iteration Level. See :class:`RIL` for available
levels.
Kwargs:
padding (int): The padding argument to :meth:`GetImage` can be
used to expand
@@ -568,7 +574,7 @@
respect to the original image and is scaled by a factor scale_.
Args:
- level (int): Page Iteration Level. See :class:`RIL` for avaialbe
levels.
+ level (int): Page Iteration Level. See :class:`RIL` for available
levels.
Returns:
tuple or None if there is no such object at the current position.
@@ -1046,13 +1052,31 @@
IF TESSERACT_VERSION >= 0x4000000:
def GetBestLSTMSymbolChoices(self):
+ """Returns the LSTM choices for every LSTM timestep for the
current word."""
+ cdef:
+ vector[vector[pair[cchar_tp, float]]] *output =
self._riter.GetBestLSTMSymbolChoices()
+ vector[vector[pair[cchar_tp, float]]].iterator it
+ vector[pair[cchar_tp, float]].iterator cit
+ vector[pair[cchar_tp, float]] configpairs
+ pair[cchar_tp, float] configpair
+
LSTMSymbolChoices = []
- output = self._riter.GetBestLSTMSymbolChoices()[0]
- for tstep in output:
+ if output == NULL:
+ return LSTMSymbolChoices
+
+ it = output.begin()
+ while it != output.end():
timestep = []
- for confpair in tstep:
- timestep.append((confpair.first, confpair.second))
+ configpairs = deref(it)
+ cit = configpairs.begin()
+ while cit != configpairs.end():
+ configpair = deref(cit)
+ timestep.append((configpair.first, configpair.second))
+ inc(cit)
+
LSTMSymbolChoices.append(timestep)
+ inc(it)
+
return LSTMSymbolChoices
@@ -1191,7 +1215,20 @@
def __dealloc__(self):
self._end_api()
- cdef int _init_api(self, cchar_t *path, cchar_t *lang,
+ IF TESSERACT_MAJOR_VERSION >= 5:
+ cdef int _init_api(self, cchar_t *path, cchar_t *lang,
+ OcrEngineMode oem, char **configs, int configs_size,
+ const vector[string] *vars_vec, const vector[string]
*vars_vals,
+ bool set_only_non_debug_params, PageSegMode psm) nogil
except -1:
+ cdef int ret = self._baseapi.Init(path, lang, oem, configs,
configs_size, vars_vec, vars_vals,
+ set_only_non_debug_params)
+ if ret == -1:
+ with gil:
+ raise RuntimeError('Failed to init API, possibly an invalid
tessdata path: {}'.format(path))
+ self._baseapi.SetPageSegMode(psm)
+ return ret
+ ELSE:
+ cdef int _init_api(self, cchar_t *path, cchar_t *lang,
OcrEngineMode oem, char **configs, int configs_size,
const GenericVector[STRING] *vars_vec, const
GenericVector[STRING] *vars_vals,
bool set_only_non_debug_params, PageSegMode psm) nogil
except -1:
@@ -1323,9 +1360,14 @@
Returns ``None`` if parameter was not found.
"""
- cdef:
- bytes py_name = _b(name)
- STRING val
+ IF TESSERACT_MAJOR_VERSION >= 5:
+ cdef:
+ bytes py_name = _b(name)
+ string val
+ ELSE:
+ cdef:
+ bytes py_name = _b(name)
+ STRING val
if self._baseapi.GetVariableAsString(py_name, &val):
return val.c_str()
return None
@@ -1356,7 +1398,7 @@
applicable language, and there is more chance of hallucinating
incorrect
words.
oem (int): OCR engine mode. Defaults to :attr:`OEM.DEFAULT`.
- See :class:`OEM` for all avaialbe options.
+ See :class:`OEM` for all available options.
configs (list): List of config files to load variables from.
variables (dict): Extra variables to be set.
set_only_non_debug_params (bool): If ``True``, only params that do
not contain
@@ -1365,7 +1407,20 @@
Raises:
:exc:`RuntimeError`: If API initialization fails.
"""
- cdef:
+ IF TESSERACT_MAJOR_VERSION >= 5:
+ cdef:
+ bytes py_path = _b(path)
+ bytes py_lang = _b(lang)
+ cchar_t *cpath = py_path
+ cchar_t *clang = py_lang
+ int configs_size = len(configs)
+ char **configs_ = <char **>malloc(configs_size * sizeof(char *))
+ vector[string] vars_vec
+ vector[string] vars_vals
+ cchar_t *val
+ string sval
+ ELSE:
+ cdef:
bytes py_path = _b(path)
bytes py_lang = _b(lang)
cchar_t *cpath = py_path
@@ -1410,7 +1465,7 @@
lang (str): An ISO 639-3 language string. Defaults to 'eng'.
See :meth:`InitFull` for full description of this parameter.
oem (int): OCR engine mode. Defaults to :attr:`OEM.DEFAULT`.
- See :class:`OEM` for all avaialbe options.
+ See :class:`OEM` for all available options.
Raises:
:exc:`RuntimeError`: If API initialization fails.
@@ -1439,15 +1494,23 @@
Includes all languages loaded by the last Init, including those loaded
as dependencies of other loaded languages.
"""
- cdef GenericVector[STRING] langs
+ IF TESSERACT_MAJOR_VERSION >= 5:
+ cdef vector[string] langs
+ ELSE:
+ cdef GenericVector[STRING] langs
self._baseapi.GetLoadedLanguagesAsVector(&langs)
return [langs[i].c_str() for i in xrange(langs.size())]
def GetAvailableLanguages(self):
"""Return list of available languages in the init data path"""
- cdef:
- GenericVector[STRING] v
- int i
+ IF TESSERACT_MAJOR_VERSION >= 5:
+ cdef:
+ vector[string] v
+ int i
+ ELSE:
+ cdef:
+ GenericVector[STRING] v
+ int i
langs = []
self._baseapi.GetAvailableLanguagesAsVector(&v)
langs = [v[i].c_str() for i in xrange(v.size())]
@@ -1568,6 +1631,28 @@
self._destroy_pix()
self._baseapi.SetImage(cimagedata, width, height, bytes_per_pixel,
bytes_per_line)
+ def SetImageBytesBmp(self, imagedata):
+ """Provide an image for Tesseract to recognize.
+
+ Args:
+ imagedata (:bytes): Raw bytes of a BMP image.
+
+ Raises:
+ :exc:`RuntimeError`: If for any reason the api failed
+ to load the given image.
+ """
+ cdef:
+ bytes py_imagedata = _b(imagedata)
+ size_t size = len(py_imagedata)
+ cuchar_t *cimagedata = py_imagedata
+ with nogil:
+ self._destroy_pix()
+ self._pix = pixReadMemBmp(cimagedata, size)
+ if self._pix == NULL:
+ with gil:
+ raise RuntimeError('Error reading image')
+ self._baseapi.SetImage(self._pix)
+
def SetImage(self, image):
"""Provide an image for Tesseract to recognize.
@@ -1598,7 +1683,7 @@
self._baseapi.SetImage(self._pix)
def SetImageFile(self, filename):
- """Set image from file for Tesserac to recognize.
+ """Set image from file for Tesseract to recognize.
Args:
filename (str): Image file relative or absolute path.
@@ -1615,7 +1700,10 @@
self._pix = pixRead(fname)
if self._pix == NULL:
with gil:
- raise RuntimeError('Error reading image')
+ # missing leptonica support? Try PIL
+ image = Image.open(fname)
+ self.SetImage(image)
+
self._baseapi.SetImage(self._pix)
def SetSourceResolution(self, int ppi):
@@ -1633,7 +1721,7 @@
can be recognized with the same image.
Args:
- left (int): poisition from left
+ left (int): position from left
top (int): position from top
width (int): width
height (int): height
@@ -1951,20 +2039,21 @@
"""Methods to retrieve information after :meth:`SetImage`,
:meth:`Recognize` or :meth:`TesseractRect`. (:meth:`Recognize` is called
implicitly if needed.)"""
- cpdef bool RecognizeForChopTest(self, int timeout=0):
- """Variant on :meth:`Recognize` used for testing chopper."""
- cdef:
- ETEXT_DESC monitor
- int res
- with nogil:
- if timeout > 0:
- monitor.cancel = NULL
- monitor.cancel_this = NULL
- monitor.set_deadline_msecs(timeout)
- res = self._baseapi.RecognizeForChopTest(&monitor)
- else:
- res = self._baseapi.RecognizeForChopTest(NULL)
- return res == 0
+ IF TESSERACT_MAJOR_VERSION < 5:
+ cpdef bool RecognizeForChopTest(self, int timeout=0):
+ """Variant on :meth:`Recognize` used for testing chopper."""
+ cdef:
+ ETEXT_DESC monitor
+ int res
+ with nogil:
+ if timeout > 0:
+ monitor.cancel = NULL
+ monitor.cancel_this = NULL
+ monitor.set_deadline_msecs(timeout)
+ res = self._baseapi.RecognizeForChopTest(&monitor)
+ else:
+ res = self._baseapi.RecognizeForChopTest(NULL)
+ return res == 0
cdef TessResultRenderer *_get_renderer(self, cchar_t *outputbase):
cdef:
@@ -2084,7 +2173,7 @@
retry_config=None, int timeout=0):
"""Turn a single image into symbolic text.
- See :meth:`ProcessPages` for desciptions of the keyword arguments
+ See :meth:`ProcessPages` for descriptions of the keyword arguments
and all other details.
Args:
@@ -2258,7 +2347,6 @@
'script_conf': script_conf}
return None
-
def MeanTextConf(self):
"""Return the (average) confidence value between 0 and 100."""
return self._baseapi.MeanTextConf()
@@ -2536,11 +2624,18 @@
- path (str): tessdata parent directory path
- languages (list): list of available languages as ISO 639-3
strings.
"""
- cdef:
- bytes py_path = _b(path)
- TessBaseAPI baseapi
- GenericVector[STRING] v
- int i
+ IF TESSERACT_MAJOR_VERSION >= 5:
+ cdef:
+ bytes py_path = _b(path)
+ TessBaseAPI baseapi
+ vector[string] v
+ int i
+ ELSE:
+ cdef:
+ bytes py_path = _b(path)
+ TessBaseAPI baseapi
+ GenericVector[STRING] v
+ int i
baseapi.Init(py_path, NULL)
path = baseapi.GetDatapath()
baseapi.GetAvailableLanguagesAsVector(&v)
Binary files old/tesserocr-2.5.1/tests/eurotext.png and
new/tesserocr-2.5.2/tests/eurotext.png differ
Binary files old/tesserocr-2.5.1/tests/eurotext.tif and
new/tesserocr-2.5.2/tests/eurotext.tif differ
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.5.1/tests/test_api.py
new/tesserocr-2.5.2/tests/test_api.py
--- old/tesserocr-2.5.1/tests/test_api.py 2019-11-08 23:49:38.000000000
+0100
+++ new/tesserocr-2.5.2/tests/test_api.py 2021-06-19 22:02:07.000000000
+0200
@@ -2,8 +2,10 @@
import re
import os.path
import tesserocr
+
try:
from PIL import Image
+
pil_installed = True
except ImportError:
pil_installed = False
@@ -12,8 +14,8 @@
def version_to_int(version):
subversion = None
subtrahend = 0
- # Subtracts a certain amount from the version number to differentiate
between
- # alpha, beta and release versions.
+ # Subtracts a certain amount from the version number to differentiate
+ # between alpha, beta and release versions.
if "alpha" in version:
version_split = version.split("alpha")
subversion = version_split[1]
@@ -22,18 +24,19 @@
version_split = version.split("beta")
subversion = version_split[1]
subtrahend = 1
- version = re.search(r'((?:\d+\.)+\d+)', version).group()
- # Split the groups on ".", take only the first one, and print each group
with leading 0 if needed
- # To be safe, also handle cases where an extra group is added to the
version string, or if one or two groups
- # are dropped.
- version_groups = (version.split('.') + [0, 0])[:3]
+ version = re.search(r"((?:\d+\.)+\d+)", version).group()
+ # Split the groups on ".", take only the first one, and print each
+ # group with leading 0 if needed. To be safe, also handle cases where
+ # an extra group is added to the version string, or if one or two
+ # groups are dropped.
+ version_groups = (version.split(".") + [0, 0])[:3]
version_str = "{:02}{:02}{:02}".format(*map(int, version_groups))
version_str = str((int(version_str, 10) - subtrahend))
# Adds a 2 digit subversion number for the subversionrelease.
subversion_str = "00"
- if subversion is not None and subversion is not "":
- subversion = re.search(r'(?:\d+)', subversion).group()
- subversion_groups = (subversion.split('-') + [0, 0])[:1]
+ if subversion is not None and subversion != "":
+ subversion = re.search(r"(?:\d+)", subversion).group()
+ subversion_groups = (subversion.split("-") + [0, 0])[:1]
subversion_str = "{:02}".format(*map(int, subversion_groups))
version_str += subversion_str
return int(version_str, 16)
@@ -45,11 +48,11 @@
class TestTessBaseApi(unittest.TestCase):
_test_dir = os.path.abspath(os.path.dirname(__file__))
- _image_file = os.path.join(_test_dir, 'eurotext.tif')
+ _image_file = os.path.join(_test_dir, "eurotext.png")
def setUp(self):
if pil_installed:
- with open(self._image_file, 'rb') as f:
+ with open(self._image_file, "rb") as f:
self._image = Image.open(f)
self._image.load()
self._api = tesserocr.PyTessBaseAPI(init=True)
@@ -71,27 +74,27 @@
def test_init_full(self):
"""Test InitFull."""
# check default settings
- self.assertEqual(self._api.GetVariableAsString('file_type'), '.tif')
- self.assertEqual(self._api.GetVariableAsString('edges_childarea'),
'0.5')
+ self.assertEqual(self._api.GetVariableAsString("file_type"), ".tif")
+ self.assertEqual(self._api.GetVariableAsString("edges_childarea"),
"0.5")
# use box.train config variables
- configs = ['box.train']
+ configs = ["box.train"]
# change edges_childarea
- vars_ = {'edges_childarea': '0.7'}
+ vars_ = {"edges_childarea": "0.7"}
self._api.End()
self._api.InitFull(configs=configs, variables=vars_)
# assert file_type from box.train and custom edges_childarea
- self.assertEqual(self._api.GetVariableAsString('file_type'), '.bl')
- self.assertEqual(self._api.GetVariableAsString('edges_childarea'),
'0.7')
+ self.assertEqual(self._api.GetVariableAsString("file_type"), ".bl")
+ self.assertEqual(self._api.GetVariableAsString("edges_childarea"),
"0.7")
# reset back to default
self._api.End()
self._api.Init()
def test_init(self):
"""Test Init calls with different lang and oem."""
- self._api.Init(lang='eng+osd')
- self.assertEqual(self._api.GetInitLanguagesAsString(), 'eng+osd')
- self._api.Init(lang='eng')
- self.assertEqual(self._api.GetInitLanguagesAsString(), 'eng')
+ self._api.Init(lang="eng+osd")
+ self.assertEqual(self._api.GetInitLanguagesAsString(), "eng+osd")
+ self._api.Init(lang="eng")
+ self.assertEqual(self._api.GetInitLanguagesAsString(), "eng")
self._api.Init(oem=tesserocr.OEM.TESSERACT_ONLY)
self.assertEqual(self._api.oem(), tesserocr.OEM.TESSERACT_ONLY)
@@ -100,7 +103,7 @@
"""Test SetImage and GetUTF8Text."""
self._api.SetImage(self._image)
text = self._api.GetUTF8Text()
- self.assertIn('quick', text)
+ self.assertIn("quick", text)
text2 = tesserocr.image_to_text(self._image)
self.assertEqual(text, text2)
@@ -108,7 +111,7 @@
"""Test SetImageFile and GetUTF8Text."""
self._api.SetImageFile(self._image_file)
text = self._api.GetUTF8Text()
- self.assertIn('quick', text)
+ self.assertIn("quick", text)
text2 = tesserocr.file_to_text(self._image_file)
self.assertEqual(text, text2)
@@ -134,7 +137,9 @@
"""Test GetDatapath and Init with an invalid data path."""
path = self._api.GetDatapath()
self._api.End()
- self.assertRaises(RuntimeError, self._api.Init, path=(self._test_dir +
os.path.sep)) # no tessdata
+ self.assertRaises(
+ RuntimeError, self._api.Init, path=(self._test_dir + os.path.sep)
+ ) # no tessdata
if _TESSERACT_VERSION >= 0x3999800:
new_path = path
else:
@@ -145,17 +150,17 @@
def test_langs(self):
"""Test get langs methods."""
- self._api.Init(lang='eng')
+ self._api.Init(lang="eng")
lang = self._api.GetInitLanguagesAsString()
- self.assertEqual(lang, 'eng')
+ self.assertEqual(lang, "eng")
langs = self._api.GetLoadedLanguages()
- self.assertEqual(langs, ['eng'])
- self.assertIn('eng', self._api.GetAvailableLanguages())
+ self.assertEqual(langs, ["eng"])
+ self.assertIn("eng", self._api.GetAvailableLanguages())
def test_variables(self):
"""Test SetVariable and GetVariableAsString."""
- self._api.SetVariable('debug_file', '/dev/null')
- self.assertEqual(self._api.GetVariableAsString('debug_file'),
'/dev/null')
+ self._api.SetVariable("debug_file", "/dev/null")
+ self.assertEqual(self._api.GetVariableAsString("debug_file"),
"/dev/null")
@unittest.skipIf(not pil_installed, "Pillow not installed")
def test_rectangle(self):
@@ -223,17 +228,25 @@
self._api.SetPageSegMode(tesserocr.PSM.OSD_ONLY)
self._api.SetImageFile(self._image_file)
orientation = self._api.DetectOS()
- all(self.assertIn(k, orientation) for k in ['sconfidence',
'oconfidence', 'script', 'orientation'])
- self.assertEqual(orientation['orientation'], 0)
- languages = tesserocr.get_languages()[1] # this is sorted
alphabetically!
- self.assertLess(orientation['script'], len(languages))
- script_name = languages[orientation['script']] # therefore does not
work
- #self.assertEqual(script_name, 'Latin') # cannot test: not reliable
+ all(
+ self.assertIn(k, orientation)
+ for k in ["sconfidence", "oconfidence", "script", "orientation"]
+ )
+ self.assertEqual(orientation["orientation"], 0)
+ # this is sorted alphabetically!
+ languages = tesserocr.get_languages()[1]
+ self.assertLess(orientation["script"], len(languages))
+ # therefore does not work
+ # script_name = languages[orientation["script"]]
+ # self.assertEqual(script_name, 'Latin') # cannot test: not reliable
if _TESSERACT_VERSION >= 0x3999800:
orientation = self._api.DetectOrientationScript()
- all(self.assertIn(k, orientation) for k in ['orient_deg',
'orient_conf', 'script_name', 'script_conf'])
- self.assertEqual(orientation['orient_deg'], 0)
- self.assertEqual(orientation['script_name'], 'Latin')
+ all(
+ self.assertIn(k, orientation)
+ for k in ["orient_deg", "orient_conf", "script_name",
"script_conf"]
+ )
+ self.assertEqual(orientation["orient_deg"], 0)
+ self.assertEqual(orientation["script_name"], "Latin")
def test_clear(self):
"""Test Clear."""
@@ -272,10 +285,10 @@
result = self._api.GetComponentImages(tesserocr.RIL.BLOCK, True)
# Test if not empty
self.assertTrue(result)
- _, xywh, _, _ = result[0] # bbox of largest
- self.assertIn('w', xywh)
- self.assertIn('h', xywh)
- area = xywh['w'] * xywh['h']
+ _, xywh, _, _ = result[0] # bbox of largest
+ self.assertIn("w", xywh)
+ self.assertIn("h", xywh)
+ area = xywh["w"] * xywh["h"]
# Test if the largest block is quite large
self.assertGreater(area, 400000)
@@ -286,7 +299,7 @@
# Test if not empty
self.assertTrue(layout)
self.assertFalse(layout.Empty(tesserocr.RIL.BLOCK))
- result = layout.BoundingBox(tesserocr.RIL.BLOCK) # bbox of largest
+ result = layout.BoundingBox(tesserocr.RIL.BLOCK) # bbox of largest
self.assertIsNot(result, None)
x0, y0, x1, y1 = result
area = (x1 - x0) * (y1 - y0)
@@ -300,7 +313,7 @@
# Test if not empty
self.assertTrue(layout)
self.assertFalse(layout.Empty(tesserocr.RIL.BLOCK))
- result = layout.BlockPolygon() # polygon of largest
+ result = layout.BlockPolygon() # polygon of largest
# Test if not empty
self.assertIsNot(result, None)
# Test there are at least 4 contour points
@@ -318,7 +331,7 @@
res = self._api.Recognize(1)
self.assertFalse(res)
self._api.SetImageFile(self._image_file)
- # timeout after 10 seocnds (unlikely)
+ # timeout after 10 seconds (unlikely)
res = self._api.Recognize(10000)
self.assertTrue(res)
self._api.SetImageFile(self._image_file)
@@ -332,10 +345,10 @@
self._api.Recognize()
it = self._api.GetIterator()
attrs = it.RowAttributes()
- self.assertIsInstance(attrs['row_height'], float)
- self.assertIsInstance(attrs['ascenders'], float)
- self.assertIsInstance(attrs['descenders'], float)
+ self.assertIsInstance(attrs["row_height"], float)
+ self.assertIsInstance(attrs["ascenders"], float)
+ self.assertIsInstance(attrs["descenders"], float)
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()