Hello community, here is the log from the commit of package python-tesserocr for openSUSE:Leap:15.2 checked in at 2020-03-02 13:23:52 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Leap:15.2/python-tesserocr (Old) and /work/SRC/openSUSE:Leap:15.2/.python-tesserocr.new.26092 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-tesserocr" Mon Mar 2 13:23:52 2020 rev:11 rq:777210 version:2.5.0 Changes: -------- --- /work/SRC/openSUSE:Leap:15.2/python-tesserocr/python-tesserocr.changes 2020-01-15 15:53:46.455609283 +0100 +++ /work/SRC/openSUSE:Leap:15.2/.python-tesserocr.new.26092/python-tesserocr.changes 2020-03-02 13:23:53.774462648 +0100 @@ -1,0 +2,67 @@ +Mon Jan 13 17:25:21 UTC 2020 - [email protected] + +- disable test_LSTM_choices temporarily + https://github.com/sirfz/tesserocr/issues/214 + +------------------------------------------------------------------- +Tue Nov 26 00:49:44 UTC 2019 - Martin Herkt <[email protected]> + +- Update to version 2.5.0 + * Support for RowAttributes method in LTRResultIterator + * SetImage: use PNG instead of JPEG fallback + * Replace STRING::string() by c_str() + * Don't use assignment operator for TessBaseAPI + +------------------------------------------------------------------- +Fri Aug 23 18:14:51 UTC 2019 - Martin Herkt <[email protected]> + +- Update to version 2.4.1 + * fix pixa_to_list python3 segfault + * fix BlockPolygon python3 segfault + +------------------------------------------------------------------- +Thu Dec 6 12:42:12 UTC 2018 - Jan Engelhardt <[email protected]> + +- Trim bias and filler wording. 
+ +------------------------------------------------------------------- +Wed Dec 5 23:35:00 UTC 2018 - Martin Herkt <[email protected]> + +- Update to version 2.4.0 + Tesseract v4 new API methods supported: + + * GetBestLSTMSymbolChoices + * BlanksBeforeWord + +------------------------------------------------------------------- +Mon Aug 13 18:20:26 UTC 2018 - [email protected] + +- Update to version 2.3.1 + * Python 3.7 support release + +------------------------------------------------------------------- +Thu Aug 2 09:40:45 UTC 2018 - [email protected] + +- Ensure we require some of the tesseract data so we can do at + least some basic ocr operations + +------------------------------------------------------------------- +Thu Aug 2 08:59:20 UTC 2018 - [email protected] + +- Drop unused bcond + +------------------------------------------------------------------- +Tue Jun 26 16:07:43 UTC 2018 - [email protected] + +- Run tests +- Use %license macro +- Update to version 2.3.0 + * Support for Tesseract 4 + + New OCR engines LSTM_ONLY and TESSERACT_LSTM_COMBINED + + New default tessdata path handling + * Fixed compilation against Tesseract v3.05.02 which required + c++11 + * Fallback to 'eng' as default language when default language + returned by the API is empty + +------------------------------------------------------------------- Old: ---- _service tesserocr-2.2.2.tar.gz New: ---- tesserocr-2.5.0.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-tesserocr.spec ++++++ --- /var/tmp/diff_new_pack.kuBNp8/_old 2020-03-02 13:23:54.130463357 +0100 +++ /var/tmp/diff_new_pack.kuBNp8/_new 2020-03-02 13:23:54.134463365 +0100 @@ -1,7 +1,7 @@ # # spec file for package python-tesserocr # -# Copyright (c) 2017 SUSE LINUX GmbH, Nuernberg, Germany. 
+# Copyright (c) 2020 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -12,39 +12,44 @@ # license that conforms to the Open Source Definition (Version 1.9) # published by the Open Source Initiative. -# Please submit bugfixes or comments via http://bugs.opensuse.org/ +# Please submit bugfixes or comments via https://bugs.opensuse.org/ # %{?!python_module:%define python_module() python-%{**} python3-%{**}} -%bcond_without test Name: python-tesserocr -Version: 2.2.2 +Version: 2.5.0 Release: 0 -Summary: A simple, Pillow-friendly, Python wrapper around tesseract-ocr +Summary: A Python wrapper around tesseract-ocr License: MIT Group: Development/Languages/Python -Url: https://github.com/sirfz/tesserocr +URL: https://github.com/sirfz/tesserocr Source: https://files.pythonhosted.org/packages/source/t/tesserocr/tesserocr-%{version}.tar.gz BuildRequires: %{python_module Cython} -BuildRequires: %{python_module six} +BuildRequires: %{python_module Pillow} BuildRequires: %{python_module devel} +BuildRequires: %{python_module pytest} BuildRequires: %{python_module setuptools} +BuildRequires: %{python_module six} BuildRequires: gcc-c++ BuildRequires: pkgconfig BuildRequires: python-rpm-macros +BuildRequires: tesseract-ocr-traineddata-english +BuildRequires: tesseract-ocr-traineddata-orientation_and_script_detection BuildRequires: pkgconfig(tesseract) +Requires: tesseract-ocr-traineddata-english +Requires: tesseract-ocr-traineddata-orientation_and_script_detection Recommends: python-Pillow %python_subpackages %description -A simple, Pillow-friendly wrapper around the tesseract-ocr API for Optical -Character Recognition (OCR). +A wrapper around the tesseract-ocr API for Optical Character +Recognition (OCR). tesserocr integrates directly with Tesseract's C++ API using Cython -which allows for simple Pythonic and easy-to-read source code. 
It -enables real concurrent execution when used with Python's threading -module by releasing the GIL while processing an image in tesseract. +which allows for Pythonic source code. It enables real concurrent +execution when used with Python's threading module by releasing the +GIL while processing an image in tesseract. %prep %setup -q -n tesserocr-%{version} @@ -55,9 +60,14 @@ %install %python_install +%check +%python_exec setup.py develop --user +# test_LSTM_choices failure: https://github.com/sirfz/tesserocr/issues/214 +%python_exec -m pytest -v -k 'not test_LSTM_choices' tests + %files %{python_files} -%defattr(-,root,root,-) -%doc LICENSE README.rst +%license LICENSE +%doc README.rst %{python_sitearch}/* %changelog ++++++ tesserocr-2.2.2.tar.gz -> tesserocr-2.5.0.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.2.2/PKG-INFO new/tesserocr-2.5.0/PKG-INFO --- old/tesserocr-2.2.2/PKG-INFO 2017-07-26 20:53:03.000000000 +0200 +++ new/tesserocr-2.5.0/PKG-INFO 2019-11-09 00:11:54.000000000 +0100 @@ -1,6 +1,6 @@ -Metadata-Version: 1.1 +Metadata-Version: 2.1 Name: tesserocr -Version: 2.2.2 +Version: 2.5.0 Summary: A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython Home-page: https://github.com/sirfz/tesserocr Author: Fayez Zouheiry @@ -45,7 +45,7 @@ :: - $ apt-get install tesseract-ocr libtesseract-dev libleptonica-dev + $ apt-get install tesseract-ocr libtesseract-dev libleptonica-dev pkg-config You may need to `manually compile tesseract`_ for a more recent version. Note that you may need to update your ``LD_LIBRARY_PATH`` environment variable to point to the right library versions in @@ -59,7 +59,8 @@ Installation ============ - + Linux and BSD/MacOS + ------------------- :: $ pip install tesserocr @@ -82,6 +83,31 @@ .. |pkg-config| replace:: **pkg-config** .. 
_pkg-config: https://pkgconfig.freedesktop.org/ + Windows + ------- + + The proposed downloads consist of stand-alone packages containing all the Windows libraries needed for execution. This means that no additional installation of tesseract is required on your system. + + The recommended method of installation is via Conda as described below. + + Conda + ````` + + You can use the `conda-forge <https://anaconda.org/conda-forge/tesserocr>`_ channel to install from Conda: + + :: + + > conda install -c conda-forge tesserocr + + pip + ``` + + Download the wheel file corresponding to your Windows platform and Python installation from `simonflueckiger/tesserocr-windows_build/releases <https://github.com/simonflueckiger/tesserocr-windows_build/releases>`_ and install them via: + + :: + + > pip install <package_name>.whl + Usage ===== @@ -97,8 +123,8 @@ with PyTessBaseAPI() as api: for img in images: api.SetImageFile(img) - print api.GetUTF8Text() - print api.AllWordConfidences() + print(api.GetUTF8Text()) + print(api.AllWordConfidences()) # api is automatically finalized when used in a with-statement (context manager). # otherwise api.End() should be explicitly called when it's no longer needed. @@ -112,13 +138,13 @@ import tesserocr from PIL import Image - print tesserocr.tesseract_version() # print tesseract-ocr version - print tesserocr.get_languages() # prints tessdata path and list of available languages + print(tesserocr.tesseract_version()) # print tesseract-ocr version + print(tesserocr.get_languages()) # prints tessdata path and list of available languages image = Image.open('sample.jpg') - print tesserocr.image_to_text(image) # print ocr text from image + print(tesserocr.image_to_text(image)) # print ocr text from image # or - print tesserocr.file_to_text('sample.jpg') + print(tesserocr.file_to_text('sample.jpg')) ``image_to_text`` and ``file_to_text`` can be used with ``threading`` to concurrently process multiple images which is highly efficient. 
@@ -138,15 +164,15 @@ with PyTessBaseAPI() as api: api.SetImage(image) boxes = api.GetComponentImages(RIL.TEXTLINE, True) - print 'Found {} textline image components.'.format(len(boxes)) + print('Found {} textline image components.'.format(len(boxes))) for i, (im, box, _, _) in enumerate(boxes): # im is a PIL image object # box is a dict with x, y, w and h keys api.SetRectangle(box['x'], box['y'], box['w'], box['h']) ocrResult = api.GetUTF8Text() conf = api.MeanTextConf() - print (u"Box[{0}]: x={x}, y={y}, w={w}, h={h}, " - "confidence: {1}, text: {2}").format(i, conf, ocrResult, **box) + print(u"Box[{0}]: x={x}, y={y}, w={w}, h={h}, " + "confidence: {1}, text: {2}".format(i, conf, ocrResult, **box)) Orientation and script detection (OSD): ``````````````````````````````````````` @@ -163,10 +189,10 @@ it = api.AnalyseLayout() orientation, direction, order, deskew_angle = it.Orientation() - print "Orientation: {:d}".format(orientation) - print "WritingDirection: {:d}".format(direction) - print "TextlineOrder: {:d}".format(order) - print "Deskew angle: {:.4f}".format(deskew_angle) + print("Orientation: {:d}".format(orientation)) + print("WritingDirection: {:d}".format(direction)) + print("TextlineOrder: {:d}".format(order)) + print("Deskew angle: {:.4f}".format(deskew_angle)) or more simply with ``OSD_ONLY`` page segmentation mode: @@ -178,8 +204,8 @@ api.SetImageFile("/usr/src/tesseract/testing/eurotext.tif") os = api.DetectOS() - print ("Orientation: {orientation}\nOrientation confidence: {oconfidence}\n" - "Script: {script}\nScript confidence: {sconfidence}").format(**os) + print("Orientation: {orientation}\nOrientation confidence: {oconfidence}\n" + "Script: {script}\nScript confidence: {sconfidence}".format(**os)) more human-readable info with tesseract 4+ (demonstrates LSTM engine usage): @@ -191,14 +217,16 @@ api.SetImageFile("/usr/src/tesseract/testing/eurotext.tif") os = api.DetectOrientationScript() - print ("Orientation: {orient_deg}\nOrientation 
confidence: {orient_conf}\n" - "Script: {script_name}\nScript confidence: {script_conf}").format(**os) + print("Orientation: {orient_deg}\nOrientation confidence: {orient_conf}\n" + "Script: {script_name}\nScript confidence: {script_conf}".format(**os)) Iterator over the classifier choices for a single symbol: ````````````````````````````````````````````````````````` .. code:: python + from __future__ import print_function + from tesserocr import PyTessBaseAPI, RIL, iterate_level with PyTessBaseAPI() as api: @@ -213,17 +241,17 @@ symbol = r.GetUTF8Text(level) # r == ri conf = r.Confidence(level) if symbol: - print u'symbol {}, conf: {}'.format(symbol, conf), + print(u'symbol {}, conf: {}'.format(symbol, conf), end='') indent = False ci = r.GetChoiceIterator() for c in ci: if indent: - print '\t\t ', - print '\t- ', + print('\t\t ', end='') + print('\t- ', end='') choice = c.GetUTF8Text() # c == ci - print u'{} conf: {}'.format(choice, c.Confidence()) + print(u'{} conf: {}'.format(choice, c.Confidence())) indent = True - print '---------------------------------------------' + print('---------------------------------------------') Keywords: Tesseract,tesseract-ocr,OCR,optical character recognition,PIL,Pillow,Cython Platform: UNKNOWN @@ -236,11 +264,11 @@ Classifier: Operating System :: POSIX Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 -Classifier: Programming Language :: Python :: 3.2 -Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 Classifier: Programming Language :: Python :: Implementation :: CPython Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Programming Language :: Cython +Description-Content-Type: text/x-rst diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tesserocr-2.2.2/README.rst new/tesserocr-2.5.0/README.rst --- old/tesserocr-2.2.2/README.rst 2017-05-28 16:31:28.000000000 +0200 +++ new/tesserocr-2.5.0/README.rst 2019-11-09 00:11:39.000000000 +0100 @@ -37,7 +37,7 @@ :: - $ apt-get install tesseract-ocr libtesseract-dev libleptonica-dev + $ apt-get install tesseract-ocr libtesseract-dev libleptonica-dev pkg-config You may need to `manually compile tesseract`_ for a more recent version. Note that you may need to update your ``LD_LIBRARY_PATH`` environment variable to point to the right library versions in @@ -51,7 +51,8 @@ Installation ============ - +Linux and BSD/MacOS +------------------- :: $ pip install tesserocr @@ -74,6 +75,31 @@ .. |pkg-config| replace:: **pkg-config** .. _pkg-config: https://pkgconfig.freedesktop.org/ +Windows +------- + +The proposed downloads consist of stand-alone packages containing all the Windows libraries needed for execution. This means that no additional installation of tesseract is required on your system. + +The recommended method of installation is via Conda as described below. + +Conda +````` + +You can use the `conda-forge <https://anaconda.org/conda-forge/tesserocr>`_ channel to install from Conda: + +:: + + > conda install -c conda-forge tesserocr + +pip +``` + +Download the wheel file corresponding to your Windows platform and Python installation from `simonflueckiger/tesserocr-windows_build/releases <https://github.com/simonflueckiger/tesserocr-windows_build/releases>`_ and install them via: + +:: + + > pip install <package_name>.whl + Usage ===== @@ -89,8 +115,8 @@ with PyTessBaseAPI() as api: for img in images: api.SetImageFile(img) - print api.GetUTF8Text() - print api.AllWordConfidences() + print(api.GetUTF8Text()) + print(api.AllWordConfidences()) # api is automatically finalized when used in a with-statement (context manager). # otherwise api.End() should be explicitly called when it's no longer needed. 
@@ -104,13 +130,13 @@ import tesserocr from PIL import Image - print tesserocr.tesseract_version() # print tesseract-ocr version - print tesserocr.get_languages() # prints tessdata path and list of available languages + print(tesserocr.tesseract_version()) # print tesseract-ocr version + print(tesserocr.get_languages()) # prints tessdata path and list of available languages image = Image.open('sample.jpg') - print tesserocr.image_to_text(image) # print ocr text from image + print(tesserocr.image_to_text(image)) # print ocr text from image # or - print tesserocr.file_to_text('sample.jpg') + print(tesserocr.file_to_text('sample.jpg')) ``image_to_text`` and ``file_to_text`` can be used with ``threading`` to concurrently process multiple images which is highly efficient. @@ -130,15 +156,15 @@ with PyTessBaseAPI() as api: api.SetImage(image) boxes = api.GetComponentImages(RIL.TEXTLINE, True) - print 'Found {} textline image components.'.format(len(boxes)) + print('Found {} textline image components.'.format(len(boxes))) for i, (im, box, _, _) in enumerate(boxes): # im is a PIL image object # box is a dict with x, y, w and h keys api.SetRectangle(box['x'], box['y'], box['w'], box['h']) ocrResult = api.GetUTF8Text() conf = api.MeanTextConf() - print (u"Box[{0}]: x={x}, y={y}, w={w}, h={h}, " - "confidence: {1}, text: {2}").format(i, conf, ocrResult, **box) + print(u"Box[{0}]: x={x}, y={y}, w={w}, h={h}, " + "confidence: {1}, text: {2}".format(i, conf, ocrResult, **box)) Orientation and script detection (OSD): ``````````````````````````````````````` @@ -155,10 +181,10 @@ it = api.AnalyseLayout() orientation, direction, order, deskew_angle = it.Orientation() - print "Orientation: {:d}".format(orientation) - print "WritingDirection: {:d}".format(direction) - print "TextlineOrder: {:d}".format(order) - print "Deskew angle: {:.4f}".format(deskew_angle) + print("Orientation: {:d}".format(orientation)) + print("WritingDirection: {:d}".format(direction)) + print("TextlineOrder: 
{:d}".format(order)) + print("Deskew angle: {:.4f}".format(deskew_angle)) or more simply with ``OSD_ONLY`` page segmentation mode: @@ -170,8 +196,8 @@ api.SetImageFile("/usr/src/tesseract/testing/eurotext.tif") os = api.DetectOS() - print ("Orientation: {orientation}\nOrientation confidence: {oconfidence}\n" - "Script: {script}\nScript confidence: {sconfidence}").format(**os) + print("Orientation: {orientation}\nOrientation confidence: {oconfidence}\n" + "Script: {script}\nScript confidence: {sconfidence}".format(**os)) more human-readable info with tesseract 4+ (demonstrates LSTM engine usage): @@ -183,14 +209,16 @@ api.SetImageFile("/usr/src/tesseract/testing/eurotext.tif") os = api.DetectOrientationScript() - print ("Orientation: {orient_deg}\nOrientation confidence: {orient_conf}\n" - "Script: {script_name}\nScript confidence: {script_conf}").format(**os) + print("Orientation: {orient_deg}\nOrientation confidence: {orient_conf}\n" + "Script: {script_name}\nScript confidence: {script_conf}".format(**os)) Iterator over the classifier choices for a single symbol: ````````````````````````````````````````````````````````` .. 
code:: python + from __future__ import print_function + from tesserocr import PyTessBaseAPI, RIL, iterate_level with PyTessBaseAPI() as api: @@ -205,14 +233,14 @@ symbol = r.GetUTF8Text(level) # r == ri conf = r.Confidence(level) if symbol: - print u'symbol {}, conf: {}'.format(symbol, conf), + print(u'symbol {}, conf: {}'.format(symbol, conf), end='') indent = False ci = r.GetChoiceIterator() for c in ci: if indent: - print '\t\t ', - print '\t- ', + print('\t\t ', end='') + print('\t- ', end='') choice = c.GetUTF8Text() # c == ci - print u'{} conf: {}'.format(choice, c.Confidence()) + print(u'{} conf: {}'.format(choice, c.Confidence())) indent = True - print '---------------------------------------------' + print('---------------------------------------------') diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.2.2/setup.py new/tesserocr-2.5.0/setup.py --- old/tesserocr-2.2.2/setup.py 2017-07-26 20:25:09.000000000 +0200 +++ new/tesserocr-2.5.0/setup.py 2019-11-08 23:49:38.000000000 +0100 @@ -48,8 +48,33 @@ def version_to_int(version): + subversion = None + subtrahend = 0 + # Subtracts a certain amount from the version number to differentiate between + # alpha, beta and release versions. + if "alpha" in version: + version_split = version.split("alpha") + subversion = version_split[1] + subtrahend = 2 + elif "beta" in version: + version_split = version.split("beta") + subversion = version_split[1] + subtrahend = 1 version = re.search(r'((?:\d+\.)+\d+)', version).group() - return int(''.join(version.split('.')), 16) + # Split the groups on ".", take only the first one, and print each group with leading 0 if needed + # To be safe, also handle cases where an extra group is added to the version string, or if one or two groups + # are dropped. 
+ version_groups = (version.split('.') + [0, 0])[:3] + version_str = "{:02}{:02}{:02}".format(*map(int, version_groups)) + version_str = str((int(version_str, 10)-subtrahend)) + # Adds a 2 digit subversion number for the subversionrelease. + subversion_str="00" + if subversion is not None and subversion is not "": + subversion = re.search(r'(?:\d+)', subversion).group() + subversion_groups = (subversion.split('-') + [0, 0])[:1] + subversion_str = "{:02}".format(*map(int, subversion_groups)) + version_str+=subversion_str + return int(version_str, 16) def package_config(): @@ -127,9 +152,9 @@ _LOGGER.warn('pkg-config failed to find tesseract/lept libraries: {}'.format(e)) build_args = get_tesseract_version() - if build_args['cython_compile_time_env']['TESSERACT_VERSION'] >= 0x040000: - _LOGGER.debug('tesseract >= 4.00 requires c++11 compiler support') - build_args['extra_compile_args'] = ['-std=c++11'] + if build_args['cython_compile_time_env']['TESSERACT_VERSION'] >= 0x3050200: + _LOGGER.debug('tesseract >= 03.05.02 requires c++11 compiler support') + build_args['extra_compile_args'] = ['-std=c++11', '-DUSE_STD_NAMESPACE'] _LOGGER.debug('build parameters: {}'.format(build_args)) return build_args @@ -154,6 +179,7 @@ version=find_version('tesserocr.pyx'), description='A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython', long_description=read('README.rst'), + long_description_content_type='text/x-rst', url='https://github.com/sirfz/tesserocr', author='Fayez Zouheiry', author_email='[email protected]', @@ -168,11 +194,10 @@ 'Operating System :: POSIX', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: Implementation :: CPython', 
'Programming Language :: Python :: Implementation :: PyPy', 'Programming Language :: Cython' diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.2.2/tesseract.pxd new/tesserocr-2.5.0/tesseract.pxd --- old/tesserocr-2.2.2/tesseract.pxd 2017-06-18 23:26:40.000000000 +0200 +++ new/tesserocr-2.5.0/tesseract.pxd 2019-11-08 23:49:38.000000000 +0100 @@ -1,5 +1,8 @@ from libcpp cimport bool +from libcpp.pair cimport pair +from libcpp.vector cimport vector ctypedef const char cchar_t +ctypedef const char * cchar_tp ctypedef const unsigned char cuchar_t cdef extern from "leptonica/allheaders.h" nogil: @@ -33,6 +36,7 @@ int pixWriteMemJpeg(unsigned char **, size_t *, Pix *, int, int) int pixWriteMem(unsigned char **, size_t *, Pix *, int) void pixDestroy(Pix **) + void ptaDestroy(Pta **) int setMsgSeverity(int) void pixaDestroy(Pixa **) void boxaDestroy(Boxa **) @@ -106,7 +110,7 @@ cdef extern from "tesseract/strngs.h" nogil: cdef cppclass STRING: - cchar_t *string() const + cchar_t *c_str() const STRING &operator=(cchar_t *) cdef extern from "tesseract/ocrclass.h" nogil: @@ -139,27 +143,75 @@ void ParagraphInfo(TessParagraphJustification *, bool *, bool *, int *) const cdef extern from "tesseract/ltrresultiterator.h" namespace "tesseract" nogil: - cdef cppclass LTRResultIterator(PageIterator): - char *GetUTF8Text(PageIteratorLevel) const - void SetLineSeparator(cchar_t *) - void SetParagraphSeparator(cchar_t *) - float Confidence(PageIteratorLevel) const - cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const - cchar_t *WordRecognitionLanguage() const - StrongScriptDirection WordDirection() const - bool WordIsFromDictionary() const - bool WordIsNumeric() const - bool HasBlamerInfo() const - cchar_t *GetBlamerDebug() const - cchar_t *GetBlamerMisadaptionDebug() const - bool HasTruthString() const - bool EquivalentToTruth(cchar_t *) const - char *WordTruthUTF8Text() const - char 
*WordNormedUTF8Text() const - cchar_t *WordLattice(int *) const - bool SymbolIsSuperscript() const - bool SymbolIsSubscript() const - bool SymbolIsDropcap() const + IF TESSERACT_VERSION >= 0x4000000: + cdef cppclass LTRResultIterator(PageIterator): + char *GetUTF8Text(PageIteratorLevel) const + void SetLineSeparator(cchar_t *) + void SetParagraphSeparator(cchar_t *) + float Confidence(PageIteratorLevel) const + void RowAttributes(float *, float *, float *) const + cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const + cchar_t *WordRecognitionLanguage() const + StrongScriptDirection WordDirection() const + bool WordIsFromDictionary() const + int BlanksBeforeWord() const + bool WordIsNumeric() const + bool HasBlamerInfo() const + cchar_t *GetBlamerDebug() const + cchar_t *GetBlamerMisadaptionDebug() const + bool HasTruthString() const + bool EquivalentToTruth(cchar_t *) const + char *WordTruthUTF8Text() const + char *WordNormedUTF8Text() const + cchar_t *WordLattice(int *) const + bool SymbolIsSuperscript() const + bool SymbolIsSubscript() const + bool SymbolIsDropcap() const + ELIF TESSERACT_VERSION >= 0x3040100: + cdef cppclass LTRResultIterator(PageIterator): + char *GetUTF8Text(PageIteratorLevel) const + void SetLineSeparator(cchar_t *) + void SetParagraphSeparator(cchar_t *) + float Confidence(PageIteratorLevel) const + void RowAttributes(float *, float *, float *) const + cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const + cchar_t *WordRecognitionLanguage() const + StrongScriptDirection WordDirection() const + bool WordIsFromDictionary() const + bool WordIsNumeric() const + bool HasBlamerInfo() const + cchar_t *GetBlamerDebug() const + cchar_t *GetBlamerMisadaptionDebug() const + bool HasTruthString() const + bool EquivalentToTruth(cchar_t *) const + char *WordTruthUTF8Text() const + char *WordNormedUTF8Text() const + cchar_t *WordLattice(int *) const + bool 
SymbolIsSuperscript() const + bool SymbolIsSubscript() const + bool SymbolIsDropcap() const + ELSE: + cdef cppclass LTRResultIterator(PageIterator): + char *GetUTF8Text(PageIteratorLevel) const + void SetLineSeparator(cchar_t *) + void SetParagraphSeparator(cchar_t *) + float Confidence(PageIteratorLevel) const + cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const + cchar_t *WordRecognitionLanguage() const + StrongScriptDirection WordDirection() const + bool WordIsFromDictionary() const + bool WordIsNumeric() const + bool HasBlamerInfo() const + cchar_t *GetBlamerDebug() const + cchar_t *GetBlamerMisadaptionDebug() const + bool HasTruthString() const + bool EquivalentToTruth(cchar_t *) const + char *WordTruthUTF8Text() const + char *WordNormedUTF8Text() const + cchar_t *WordLattice(int *) const + bool SymbolIsSuperscript() const + bool SymbolIsSubscript() const + bool SymbolIsDropcap() const cdef cppclass ChoiceIterator: ChoiceIterator(const LTRResultIterator &) except + @@ -168,8 +220,13 @@ float Confidence() const cdef extern from "tesseract/resultiterator.h" namespace "tesseract" nogil: - cdef cppclass ResultIterator(LTRResultIterator): - bool ParagraphIsLtr() const + IF TESSERACT_VERSION >= 0x4000000: + cdef cppclass ResultIterator(LTRResultIterator): + bool ParagraphIsLtr() const + vector[vector[pair[cchar_tp, float]]] *GetBestLSTMSymbolChoices() const + ELSE: + cdef cppclass ResultIterator(LTRResultIterator): + bool ParagraphIsLtr() const cdef extern from "tesseract/renderer.h" namespace "tesseract" nogil: cdef cppclass TessResultRenderer: @@ -181,7 +238,7 @@ cdef cppclass TessHOcrRenderer(TessResultRenderer): TessHOcrRenderer(cchar_t *, bool) except + - IF TESSERACT_VERSION >= 0x040000: + IF TESSERACT_VERSION >= 0x3999800: cdef cppclass TessPDFRenderer(TessResultRenderer): TessPDFRenderer(cchar_t *, cchar_t *, bool) except + ELSE: @@ -194,7 +251,7 @@ cdef cppclass TessBoxTextRenderer(TessResultRenderer): 
TessBoxTextRenderer(cchar_t *) except + - IF TESSERACT_VERSION >= 0x030401: + IF TESSERACT_VERSION >= 0x3040100: cdef cppclass TessOsdRenderer(TessResultRenderer): TessOsdRenderer(cchar_t *) except + @@ -213,14 +270,12 @@ cdef extern from "tesseract/baseapi.h" namespace "tesseract" nogil: - IF TESSERACT_VERSION >= 0x040000: + IF TESSERACT_VERSION >= 0x3999800: cdef enum OcrEngineMode: OEM_TESSERACT_ONLY OEM_LSTM_ONLY OEM_TESSERACT_LSTM_COMBINED OEM_DEFAULT - OEM_CUBE_ONLY - OEM_TESSERACT_CUBE_COMBINED ELSE: cdef enum OcrEngineMode: OEM_TESSERACT_ONLY @@ -255,7 +310,7 @@ RIL_WORD, # within a textline. RIL_SYMBOL # character within a word. - IF TESSERACT_VERSION >= 0x040000: + IF TESSERACT_VERSION >= 0x3999800: cdef cppclass TessBaseAPI: TessBaseAPI() except + @staticmethod diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.2.2/tesserocr.egg-info/PKG-INFO new/tesserocr-2.5.0/tesserocr.egg-info/PKG-INFO --- old/tesserocr-2.2.2/tesserocr.egg-info/PKG-INFO 2017-07-26 20:53:03.000000000 +0200 +++ new/tesserocr-2.5.0/tesserocr.egg-info/PKG-INFO 2019-11-09 00:11:54.000000000 +0100 @@ -1,6 +1,6 @@ -Metadata-Version: 1.1 +Metadata-Version: 2.1 Name: tesserocr -Version: 2.2.2 +Version: 2.5.0 Summary: A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython Home-page: https://github.com/sirfz/tesserocr Author: Fayez Zouheiry @@ -45,7 +45,7 @@ :: - $ apt-get install tesseract-ocr libtesseract-dev libleptonica-dev + $ apt-get install tesseract-ocr libtesseract-dev libleptonica-dev pkg-config You may need to `manually compile tesseract`_ for a more recent version. Note that you may need to update your ``LD_LIBRARY_PATH`` environment variable to point to the right library versions in @@ -59,7 +59,8 @@ Installation ============ - + Linux and BSD/MacOS + ------------------- :: $ pip install tesserocr @@ -82,6 +83,31 @@ .. |pkg-config| replace:: **pkg-config** .. 
_pkg-config: https://pkgconfig.freedesktop.org/ + Windows + ------- + + The proposed downloads consist of stand-alone packages containing all the Windows libraries needed for execution. This means that no additional installation of tesseract is required on your system. + + The recommended method of installation is via Conda as described below. + + Conda + ````` + + You can use the `conda-forge <https://anaconda.org/conda-forge/tesserocr>`_ channel to install from Conda: + + :: + + > conda install -c conda-forge tesserocr + + pip + ``` + + Download the wheel file corresponding to your Windows platform and Python installation from `simonflueckiger/tesserocr-windows_build/releases <https://github.com/simonflueckiger/tesserocr-windows_build/releases>`_ and install them via: + + :: + + > pip install <package_name>.whl + Usage ===== @@ -97,8 +123,8 @@ with PyTessBaseAPI() as api: for img in images: api.SetImageFile(img) - print api.GetUTF8Text() - print api.AllWordConfidences() + print(api.GetUTF8Text()) + print(api.AllWordConfidences()) # api is automatically finalized when used in a with-statement (context manager). # otherwise api.End() should be explicitly called when it's no longer needed. @@ -112,13 +138,13 @@ import tesserocr from PIL import Image - print tesserocr.tesseract_version() # print tesseract-ocr version - print tesserocr.get_languages() # prints tessdata path and list of available languages + print(tesserocr.tesseract_version()) # print tesseract-ocr version + print(tesserocr.get_languages()) # prints tessdata path and list of available languages image = Image.open('sample.jpg') - print tesserocr.image_to_text(image) # print ocr text from image + print(tesserocr.image_to_text(image)) # print ocr text from image # or - print tesserocr.file_to_text('sample.jpg') + print(tesserocr.file_to_text('sample.jpg')) ``image_to_text`` and ``file_to_text`` can be used with ``threading`` to concurrently process multiple images which is highly efficient. 
@@ -138,15 +164,15 @@ with PyTessBaseAPI() as api: api.SetImage(image) boxes = api.GetComponentImages(RIL.TEXTLINE, True) - print 'Found {} textline image components.'.format(len(boxes)) + print('Found {} textline image components.'.format(len(boxes))) for i, (im, box, _, _) in enumerate(boxes): # im is a PIL image object # box is a dict with x, y, w and h keys api.SetRectangle(box['x'], box['y'], box['w'], box['h']) ocrResult = api.GetUTF8Text() conf = api.MeanTextConf() - print (u"Box[{0}]: x={x}, y={y}, w={w}, h={h}, " - "confidence: {1}, text: {2}").format(i, conf, ocrResult, **box) + print(u"Box[{0}]: x={x}, y={y}, w={w}, h={h}, " + "confidence: {1}, text: {2}".format(i, conf, ocrResult, **box)) Orientation and script detection (OSD): ``````````````````````````````````````` @@ -163,10 +189,10 @@ it = api.AnalyseLayout() orientation, direction, order, deskew_angle = it.Orientation() - print "Orientation: {:d}".format(orientation) - print "WritingDirection: {:d}".format(direction) - print "TextlineOrder: {:d}".format(order) - print "Deskew angle: {:.4f}".format(deskew_angle) + print("Orientation: {:d}".format(orientation)) + print("WritingDirection: {:d}".format(direction)) + print("TextlineOrder: {:d}".format(order)) + print("Deskew angle: {:.4f}".format(deskew_angle)) or more simply with ``OSD_ONLY`` page segmentation mode: @@ -178,8 +204,8 @@ api.SetImageFile("/usr/src/tesseract/testing/eurotext.tif") os = api.DetectOS() - print ("Orientation: {orientation}\nOrientation confidence: {oconfidence}\n" - "Script: {script}\nScript confidence: {sconfidence}").format(**os) + print("Orientation: {orientation}\nOrientation confidence: {oconfidence}\n" + "Script: {script}\nScript confidence: {sconfidence}".format(**os)) more human-readable info with tesseract 4+ (demonstrates LSTM engine usage): @@ -191,14 +217,16 @@ api.SetImageFile("/usr/src/tesseract/testing/eurotext.tif") os = api.DetectOrientationScript() - print ("Orientation: {orient_deg}\nOrientation 
confidence: {orient_conf}\n" - "Script: {script_name}\nScript confidence: {script_conf}").format(**os) + print("Orientation: {orient_deg}\nOrientation confidence: {orient_conf}\n" + "Script: {script_name}\nScript confidence: {script_conf}".format(**os)) Iterator over the classifier choices for a single symbol: ````````````````````````````````````````````````````````` .. code:: python + from __future__ import print_function + from tesserocr import PyTessBaseAPI, RIL, iterate_level with PyTessBaseAPI() as api: @@ -213,17 +241,17 @@ symbol = r.GetUTF8Text(level) # r == ri conf = r.Confidence(level) if symbol: - print u'symbol {}, conf: {}'.format(symbol, conf), + print(u'symbol {}, conf: {}'.format(symbol, conf), end='') indent = False ci = r.GetChoiceIterator() for c in ci: if indent: - print '\t\t ', - print '\t- ', + print('\t\t ', end='') + print('\t- ', end='') choice = c.GetUTF8Text() # c == ci - print u'{} conf: {}'.format(choice, c.Confidence()) + print(u'{} conf: {}'.format(choice, c.Confidence())) indent = True - print '---------------------------------------------' + print('---------------------------------------------') Keywords: Tesseract,tesseract-ocr,OCR,optical character recognition,PIL,Pillow,Cython Platform: UNKNOWN @@ -236,11 +264,11 @@ Classifier: Operating System :: POSIX Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 -Classifier: Programming Language :: Python :: 3.2 -Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 Classifier: Programming Language :: Python :: Implementation :: CPython Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Programming Language :: Cython +Description-Content-Type: text/x-rst diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tesserocr-2.2.2/tesserocr.pyx new/tesserocr-2.5.0/tesserocr.pyx --- old/tesserocr-2.2.2/tesserocr.pyx 2017-07-26 20:34:00.000000000 +0200 +++ new/tesserocr-2.5.0/tesserocr.pyx 2019-11-08 23:49:38.000000000 +0100 @@ -18,7 +18,7 @@ ['eng', 'osd', 'equ']) """ -__version__ = '2.2.2' +__version__ = '2.5.0' import os from io import BytesIO @@ -45,11 +45,17 @@ # default parameters setMsgSeverity(L_SEVERITY_NONE) # suppress leptonica error messages -cdef TessBaseAPI _api = TessBaseAPI() +cdef TessBaseAPI _api _api.SetVariable('debug_file', '/dev/null') # suppress tesseract debug messages _api.Init(NULL, NULL) -cdef _DEFAULT_PATH = abspath(join(_api.GetDatapath(), os.pardir)) + os.sep -cdef _DEFAULT_LANG = _api.GetInitLanguagesAsString() +IF TESSERACT_VERSION >= 0x3999800: + cdef _DEFAULT_PATH = _api.GetDatapath() # "tessdata/" is not appended by tesseract since commit dba13db +ELSE: + cdef _DEFAULT_PATH = abspath(join(_api.GetDatapath(), os.pardir)) + os.sep +_init_lang = _api.GetInitLanguagesAsString() +if _init_lang == '': + _init_lang = 'eng' +cdef _DEFAULT_LANG = _init_lang _api.End() TessBaseAPI.ClearPersistentCache() @@ -61,7 +67,7 @@ cdef class OEM(_Enum): - """An enum that defines avaialble OCR engine modes. + """An enum that defines available OCR engine modes. 
Attributes: TESSERACT_ONLY: Run Tesseract only - fastest @@ -78,11 +84,12 @@ """ TESSERACT_ONLY = OEM_TESSERACT_ONLY - IF TESSERACT_VERSION >= 0x040000: + IF TESSERACT_VERSION >= 0x3999800: LSTM_ONLY = OEM_LSTM_ONLY TESSERACT_LSTM_COMBINED = OEM_TESSERACT_LSTM_COMBINED - CUBE_ONLY = OEM_CUBE_ONLY - TESSERACT_CUBE_COMBINED = OEM_TESSERACT_CUBE_COMBINED + ELSE: + CUBE_ONLY = OEM_CUBE_ONLY + TESSERACT_CUBE_COMBINED = OEM_TESSERACT_CUBE_COMBINED DEFAULT = OEM_DEFAULT @@ -316,7 +323,7 @@ cdef bytes _image_buffer(image): """Return raw bytes of a PIL Image""" with BytesIO() as f: - image.save(f, image.format or 'JPEG') + image.save(f, image.format or 'PNG') return f.getvalue() @@ -355,7 +362,7 @@ cdef pixa_to_list(Pixa *pixa): """Convert Pixa (Array of pixes and boxes) to list of pix, box tuples.""" - return zip((_pix_to_image(pix) for pix in pixa.pix[:pixa.n]), boxa_to_list(pixa.boxa)) + return list(zip((_pix_to_image(pix) for pix in pixa.pix[:pixa.n]), boxa_to_list(pixa.boxa))) cdef class PyPageIterator: @@ -605,13 +612,16 @@ if pta == NULL: return None try: - return zip((x for x in pta.x[:pta.n]), (y for y in pta.y[:pta.n])) + return list(zip((x for x in pta.x[:pta.n]), (y for y in pta.y[:pta.n]))) finally: - free(pta) + ptaDestroy(&pta) def GetBinaryImage(self, PageIteratorLevel level): """Return a binary image of the current object at the given level. + The image is masked along the polygon outline of the current block, as given + by :meth:`BlockPolygon`. (Pixels outside the mask will be white.) + The position and size match the return from :meth:`BoundingBoxInternal`, and so this could be upscaled with respect to the original input image. @@ -633,6 +643,9 @@ """Return an image of the current object at the given level in greyscale if available in the input. + The image is masked along the polygon outline of the current block, as given + by :meth:`BlockPolygon`. (Pixels outside the mask will be white.) + To guarantee a binary image use :meth:`BinaryImage`. 
Args: @@ -818,6 +831,21 @@ """ return self._ltrriter.Confidence(level) + IF TESSERACT_VERSION >= 0x3040100: + def RowAttributes(self): + """Return row_height, descenders and ascenders in a dict""" + cdef: + float row_height + float descenders + float ascenders + + self._ltrriter.RowAttributes(&row_height, &descenders, &ascenders) + return { + 'row_height': row_height, + 'descenders': descenders, + 'ascenders': ascenders + } + def WordFontAttributes(self): """Return the font attributes of the current word. @@ -889,6 +917,12 @@ """Return True if the current word was found in a dictionary.""" return self._ltrriter.WordIsFromDictionary() + IF TESSERACT_VERSION >= 0x4000000: + def BlanksBeforeWord(self): + """Return the number of blanks before the current word.""" + return self._ltrriter.BlanksBeforeWord() + + def WordIsNumeric(self): """Return True if the current word is numeric.""" return self._ltrriter.WordIsNumeric() @@ -1010,6 +1044,17 @@ """ return self._riter.ParagraphIsLtr() + IF TESSERACT_VERSION >= 0x4000000: + def GetBestLSTMSymbolChoices(self): + LSTMSymbolChoices = [] + output = self._riter.GetBestLSTMSymbolChoices()[0] + for tstep in output: + timestep = [] + for confpair in tstep: + timestep.append((confpair.first, confpair.second)) + LSTMSymbolChoices.append(timestep) + return LSTMSymbolChoices + cdef class PyChoiceIterator: @@ -1108,7 +1153,7 @@ applicable language, and there is more chance of hallucinating incorrect words. psm (int): Page segmentation mode. Defaults to :attr:`PSM.AUTO`. - See :class:`PSM` for avaialble psm values. + See :class:`PSM` for available psm values. init (bool): If ``False``, :meth:`Init` will not be called and has to be called after initialization. oem (int): OCR engine mode. Defaults to :attr:`OEM.DEFAULT`. @@ -1227,7 +1272,7 @@ def GetIntVariable(self, name): """Return the value of the given int parameter if it exists among Tesseract parameters. - Returns ``None`` if the paramter was not found. 
+ Returns ``None`` if the parameter was not found. """ cdef: bytes py_name = _b(name) @@ -1239,7 +1284,7 @@ def GetBoolVariable(self, name): """Return the value of the given bool parameter if it exists among Tesseract parameters. - Returns ``None`` if the paramter was not found. + Returns ``None`` if the parameter was not found. """ cdef: bytes py_name = _b(name) @@ -1251,7 +1296,7 @@ def GetDoubleVariable(self, name): """Return the value of the given double parameter if it exists among Tesseract parameters. - Returns ``None`` if the paramter was not found. + Returns ``None`` if the parameter was not found. """ cdef: bytes py_name = _b(name) @@ -1263,7 +1308,7 @@ def GetStringVariable(self, name): """Return the value of the given string parameter if it exists among Tesseract parameters. - Returns ``None`` if the paramter was not found. + Returns ``None`` if the parameter was not found. """ cdef: bytes py_name = _b(name) @@ -1276,13 +1321,13 @@ """Return the value of named variable as a string (regardless of type), if it exists. - Returns ``None`` if paramter was not found. + Returns ``None`` if parameter was not found. """ cdef: bytes py_name = _b(name) STRING val if self._baseapi.GetVariableAsString(py_name, &val): - return val.string() + return val.c_str() return None def InitFull(self, path=_DEFAULT_PATH, lang=_DEFAULT_LANG, @@ -1357,7 +1402,7 @@ OcrEngineMode oem=OEM_DEFAULT): """Initialize the API with the given data path, language and OCR engine mode. - See :meth:`InitFull` for more intialization info and options. + See :meth:`InitFull` for more initialization info and options. Args: path (str): The name of the parent directory of tessdata. 
@@ -1396,7 +1441,7 @@ """ cdef GenericVector[STRING] langs self._baseapi.GetLoadedLanguagesAsVector(&langs) - return [langs[i].string() for i in xrange(langs.size())] + return [langs[i].c_str() for i in xrange(langs.size())] def GetAvailableLanguages(self): """Return list of available languages in the init data path""" @@ -1405,7 +1450,7 @@ int i langs = [] self._baseapi.GetAvailableLanguagesAsVector(&v) - langs = [v[i].string() for i in xrange(v.size())] + langs = [v[i].c_str() for i in xrange(v.size())] return langs def InitForAnalysePage(self): @@ -1923,12 +1968,12 @@ cdef: bool b bool font_info - IF TESSERACT_VERSION >= 0x040000: + IF TESSERACT_VERSION >= 0x3999800: bool textonly TessResultRenderer *temp TessResultRenderer *renderer = NULL - IF TESSERACT_VERSION >= 0x030401: + IF TESSERACT_VERSION >= 0x3040100: if self._baseapi.GetPageSegMode() == PSM.OSD_ONLY: renderer = new TessOsdRenderer(outputbase) return renderer @@ -1940,7 +1985,7 @@ self._baseapi.GetBoolVariable("tessedit_create_pdf", &b) if b: - IF TESSERACT_VERSION >= 0x040000: + IF TESSERACT_VERSION >= 0x3999800: self._baseapi.GetBoolVariable("textonly_pdf", &textonly) temp = new TessPDFRenderer(outputbase, self._baseapi.GetDatapath(), textonly) ELSE: @@ -2104,6 +2149,25 @@ raise RuntimeError('Failed to recognize. No image set?') return _free_str(text) + IF TESSERACT_VERSION >= 0x4000000: + def GetBestLSTMSymbolChoices(self): + """Return Symbol choices as multi-dimensional array of tuples. The + first dimension contains words. The second dimension contains the LSTM + timesteps of the respective word. They are either accumulated over + characters or pure which depends on the value set in lstm_choice_mode: + 1 = pure; 2 = accumulated. The third dimension contains the symbols + and their probability as tuples for the respective timestep. + Returns an empty list if :meth:`Recognize` was not called first. 
+ """ + if self.GetVariableAsString("lstm_choice_mode") == "0": + raise RuntimeError('lstm_choice_mode Parameter is 0. Set it to 1 or 2') + words = [] + wi = self.GetIterator() + if wi: + for w in iterate_level(wi, RIL.WORD): + words.append(w.GetBestLSTMSymbolChoices()) + return words + def GetHOCRText(self, int page_number): """Return a HTML-formatted string with hOCR markup from the internal data structures. @@ -2120,7 +2184,7 @@ raise RuntimeError('Failed to recognize. No image set?') return _free_str(text) - IF TESSERACT_VERSION >= 0x040000: + IF TESSERACT_VERSION >= 0x3999800: def GetTSVText(self, int page_number): """Make a TSV-formatted string from the internal data structures. @@ -2168,7 +2232,7 @@ raise RuntimeError('Failed to recognize. No image set?') return _free_str(text) - IF TESSERACT_VERSION >= 0x040000: + IF TESSERACT_VERSION >= 0x3999800: def DetectOrientationScript(self): """Detect the orientation of the input image and apparent script (alphabet). @@ -2295,6 +2359,7 @@ rotation to be applied to the page for the text to be upright and readable. - oconfidence: Orientation confidence. - script: Index of the script with the highest score for this orientation. + (This is _not_ the index of :meth:`get_languages`, which is in alphabetical order.) - sconfidence: script confidence. """ cdef OSResults results @@ -2366,7 +2431,7 @@ path (str): The name of the parent directory of tessdata. Must end in /. oem (int): OCR engine mode. Defaults to :attr:`OEM.DEFAULT`. - see :class:`OEM` for all avaialble oem options. + see :class:`OEM` for all available oem options. Returns: unicode: The text extracted from the image. @@ -2416,7 +2481,7 @@ path (str): The name of the parent directory of tessdata. Must end in /. oem (int): OCR engine mode. Defaults to :attr:`OEM.DEFAULT`. - see :class:`OEM` for all avaialble oem options. + see :class:`OEM` for all available oem options. Returns: unicode: The text extracted from the image. @@ -2464,7 +2529,7 @@ Must end in /. 
Default tesseract-ocr datapath is used if no path is provided. - Retruns + Returns tuple: Tuple with two elements: - path (str): tessdata parent directory path - languages (list): list of available languages as ISO 639-3 strings. @@ -2477,6 +2542,6 @@ baseapi.Init(py_path, NULL) path = baseapi.GetDatapath() baseapi.GetAvailableLanguagesAsVector(&v) - langs = [v[i].string() for i in xrange(v.size())] + langs = [v[i].c_str() for i in xrange(v.size())] baseapi.End() return path, langs diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.2.2/tesserocr_experiment.pyx new/tesserocr-2.5.0/tesserocr_experiment.pyx --- old/tesserocr-2.2.2/tesserocr_experiment.pyx 2016-08-22 15:27:56.000000000 +0200 +++ new/tesserocr-2.5.0/tesserocr_experiment.pyx 2019-11-08 23:49:38.000000000 +0100 @@ -1,4 +1,4 @@ -# An attemp to address the PIL.Image buffer directly without copying it. +# An attempt to address the PIL.Image buffer directly without copying it. # # This is achieved by extracting the buffer ptr from Image.im.unsafe_ptrs # the xsize, ysize, pixelsize and linesize are extracted as well to be used diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.2.2/tests/test_api.py new/tesserocr-2.5.0/tests/test_api.py --- old/tesserocr-2.2.2/tests/test_api.py 2017-06-20 18:08:20.000000000 +0200 +++ new/tesserocr-2.5.0/tests/test_api.py 2019-11-08 23:49:38.000000000 +0100 @@ -10,15 +10,42 @@ def version_to_int(version): + subversion = None + subtrahend = 0 + # Subtracts a certain amount from the version number to differentiate between + # alpha, beta and release versions. 
+ if "alpha" in version: + version_split = version.split("alpha") + subversion = version_split[1] + subtrahend = 2 + elif "beta" in version: + version_split = version.split("beta") + subversion = version_split[1] + subtrahend = 1 version = re.search(r'((?:\d+\.)+\d+)', version).group() - return int(''.join(version.split('.')), 16) + # Split the groups on ".", take only the first one, and print each group with leading 0 if needed + # To be safe, also handle cases where an extra group is added to the version string, or if one or two groups + # are dropped. + version_groups = (version.split('.') + [0, 0])[:3] + version_str = "{:02}{:02}{:02}".format(*map(int, version_groups)) + version_str = str((int(version_str, 10) - subtrahend)) + # Adds a 2 digit subversion number for the subversionrelease. + subversion_str = "00" + if subversion is not None and subversion is not "": + subversion = re.search(r'(?:\d+)', subversion).group() + subversion_groups = (subversion.split('-') + [0, 0])[:1] + subversion_str = "{:02}".format(*map(int, subversion_groups)) + version_str += subversion_str + return int(version_str, 16) + + +_TESSERACT_VERSION = version_to_int(tesserocr.PyTessBaseAPI.Version()) class TestTessBaseApi(unittest.TestCase): _test_dir = os.path.abspath(os.path.dirname(__file__)) _image_file = os.path.join(_test_dir, 'eurotext.tif') - _tesseract_version = version_to_int(tesserocr.PyTessBaseAPI.Version()) def setUp(self): if pil_installed: @@ -108,7 +135,10 @@ path = self._api.GetDatapath() self._api.End() self.assertRaises(RuntimeError, self._api.Init, path=(self._test_dir + os.path.sep)) # no tessdata - new_path = os.path.abspath(os.path.join(path, os.path.pardir)) + os.path.sep + if _TESSERACT_VERSION >= 0x3999800: + new_path = path + else: + new_path = os.path.abspath(os.path.join(path, os.path.pardir)) + os.path.sep self._api.End() self._api.Init(new_path) self.assertEqual(self._api.GetDatapath(), path) @@ -148,6 +178,46 @@ self.assertEqual([v[0] for v in 
mapped_confidences], words) self.assertEqual([v[1] for v in mapped_confidences], confidences) + @unittest.skipIf(_TESSERACT_VERSION < 0x4000000, "tesseract < 4") + def test_LSTM_choices(self): + """Test GetBestLSTMSymbolChoices.""" + self._api.SetVariable("lstm_choice_mode", "2") + self._api.SetImageFile(self._image_file) + self._api.Recognize() + LSTM_choices = self._api.GetBestLSTMSymbolChoices() + words = self._api.AllWords() + self.assertEqual(len(words), len(LSTM_choices)) + + for choice, word in zip(LSTM_choices, words): + chosen_word = "" + for timestep in choice: + for alternative in timestep: + self.assertGreaterEqual(alternative[1], 0.0) + self.assertLessEqual(alternative[1], 2.0) + chosen_symbol = timestep[0][0] + if chosen_symbol != " ": + chosen_word += chosen_symbol + self.assertEqual(chosen_word, word) + + @unittest.skipIf(_TESSERACT_VERSION < 0x4000000, "tesseract < 4") + def test_result_iterator(self): + """Test result iterator.""" + self._api.SetImageFile(self._image_file) + self._api.Recognize() + it = self._api.GetIterator() + level = tesserocr.RIL.WORD + for i, w in enumerate(tesserocr.iterate_level(it, level)): + text = w.GetUTF8Text(level) + blanks = w.BlanksBeforeWord() + if i == 0: + self.assertEqual(text, "The") + self.assertEqual(blanks, 0) + elif i == 1: + self.assertEqual(text, "(quick)") + self.assertEqual(blanks, 1) + else: + break + def test_detect_os(self): """Test DetectOS and DetectOrientationScript (tesseract v4+).""" self._api.SetPageSegMode(tesserocr.PSM.OSD_ONLY) @@ -155,8 +225,11 @@ orientation = self._api.DetectOS() all(self.assertIn(k, orientation) for k in ['sconfidence', 'oconfidence', 'script', 'orientation']) self.assertEqual(orientation['orientation'], 0) - self.assertEqual(orientation['script'], 1) - if self._tesseract_version >= 0x040000: + languages = tesserocr.get_languages()[1] # this is sorted alphabetically! 
+ self.assertLess(orientation['script'], len(languages)) + script_name = languages[orientation['script']] # therefore does not work + #self.assertEqual(script_name, 'Latin') # cannot test: not reliable + if _TESSERACT_VERSION >= 0x3999800: orientation = self._api.DetectOrientationScript() all(self.assertIn(k, orientation) for k in ['orient_deg', 'orient_conf', 'script_name', 'script_conf']) self.assertEqual(orientation['orient_deg'], 0) @@ -193,6 +266,51 @@ # Test if empty self.assertFalse(result) + def test_layout_getcomponents(self): + self._api.Init() + self._api.SetImageFile(self._image_file) + result = self._api.GetComponentImages(tesserocr.RIL.BLOCK, True) + # Test if not empty + self.assertTrue(result) + _, xywh, _, _ = result[0] # bbox of largest + self.assertIn('w', xywh) + self.assertIn('h', xywh) + area = xywh['w'] * xywh['h'] + # Test if the largest block is quite large + self.assertGreater(area, 400000) + + def test_layout_boundingbox(self): + self._api.Init() + self._api.SetImageFile(self._image_file) + layout = self._api.AnalyseLayout() + # Test if not empty + self.assertTrue(layout) + self.assertFalse(layout.Empty(tesserocr.RIL.BLOCK)) + result = layout.BoundingBox(tesserocr.RIL.BLOCK) # bbox of largest + self.assertIsNot(result, None) + x0, y0, x1, y1 = result + area = (x1 - x0) * (y1 - y0) + # Test if the largest block is quite large + self.assertGreater(area, 400000) + + def test_layout_blockpolygon(self): + self._api.Init() + self._api.SetImageFile(self._image_file) + layout = self._api.AnalyseLayout() + # Test if not empty + self.assertTrue(layout) + self.assertFalse(layout.Empty(tesserocr.RIL.BLOCK)) + result = layout.BlockPolygon() # polygon of largest + # Test if not empty + self.assertIsNot(result, None) + # Test there are at least 4 contour points + self.assertGreaterEqual(len(result), 4) + xs, ys = zip(*result) + x0, y0, x1, y1 = min(xs), min(ys), max(xs), max(ys) + area = (x1 - x0) * (y1 - y0) + # Test if the largest block is quite large 
+ self.assertGreater(area, 400000) + def test_recognize(self): """Test Recognize with and without timeout.""" self._api.SetImageFile(self._image_file) @@ -208,6 +326,16 @@ res = self._api.Recognize() self.assertTrue(res) + @unittest.skipIf(_TESSERACT_VERSION < 0x3040100, "tesseract < 4") + def test_row_attributes(self): + self._api.SetImageFile(self._image_file) + self._api.Recognize() + it = self._api.GetIterator() + attrs = it.RowAttributes() + self.assertIsInstance(attrs['row_height'], float) + self.assertIsInstance(attrs['ascenders'], float) + self.assertIsInstance(attrs['descenders'], float) + if __name__ == '__main__': unittest.main()
