commit python-tesserocr for openSUSE:Factory

root Thu, 06 Dec 2018 03:19:14 -0800

Hello community,

here is the log from the commit of package python-tesserocr for 
openSUSE:Factory checked in at 2018-12-06 12:18:51
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-tesserocr (Old)
 and      /work/SRC/openSUSE:Factory/.python-tesserocr.new.19453 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


Package is "python-tesserocr"

Thu Dec  6 12:18:51 2018 rev:5 rq:655446 version:2.4.0

Changes:
--------
--- /work/SRC/openSUSE:Factory/python-tesserocr/python-tesserocr.changes        
2018-08-15 10:37:40.500212953 +0200
+++ 
/work/SRC/openSUSE:Factory/.python-tesserocr.new.19453/python-tesserocr.changes 
    2018-12-06 12:18:55.481416268 +0100
@@ -1,0 +2,9 @@
+Wed Dec  5 23:35:00 UTC 2018 - Martin Herkt <[email protected]>
+
+- Update to version 2.4.0
+  Tesseract v4 new API methods supported:
+
+  * GetBestLSTMSymbolChoices
+  * BlanksBeforeWord
+
+-------------------------------------------------------------------

Old:
----
  tesserocr-2.3.1.tar.gz

New:
----
  tesserocr-2.4.0.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-tesserocr.spec ++++++
--- /var/tmp/diff_new_pack.BfgL89/_old  2018-12-06 12:18:56.265415426 +0100
+++ /var/tmp/diff_new_pack.BfgL89/_new  2018-12-06 12:18:56.265415426 +0100
@@ -12,13 +12,13 @@
 # license that conforms to the Open Source Definition (Version 1.9)
 # published by the Open Source Initiative.
 
-# Please submit bugfixes or comments via http://bugs.opensuse.org/
+# Please submit bugfixes or comments via https://bugs.opensuse.org/
 #
 
 
 %{?!python_module:%define python_module() python-%{**} python3-%{**}}
 Name:           python-tesserocr
-Version:        2.3.1
+Version:        2.4.0
 Release:        0
 Summary:        A simple, Pillow-friendly, Python wrapper around tesseract-ocr
 License:        MIT

++++++ tesserocr-2.3.1.tar.gz -> tesserocr-2.4.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tesserocr-2.3.1/PKG-INFO new/tesserocr-2.4.0/PKG-INFO
--- old/tesserocr-2.3.1/PKG-INFO        2018-08-13 19:35:30.000000000 +0200
+++ new/tesserocr-2.4.0/PKG-INFO        2018-12-05 15:37:32.000000000 +0100
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tesserocr
-Version: 2.3.1
+Version: 2.4.0
 Summary: A simple, Pillow-friendly, Python wrapper around tesseract-ocr API 
using Cython
 Home-page: https://github.com/sirfz/tesserocr
 Author: Fayez Zouheiry
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tesserocr-2.3.1/setup.py new/tesserocr-2.4.0/setup.py
--- old/tesserocr-2.3.1/setup.py        2018-08-13 17:58:16.000000000 +0200
+++ new/tesserocr-2.4.0/setup.py        2018-11-30 15:43:23.000000000 +0100
@@ -48,12 +48,32 @@
 
 
 def version_to_int(version):
+    subversion = None
+    subtrahend = 0
+    # Subtracts a certain amount from the version number to differentiate 
between
+    # alpha, beta and release versions.
+    if "alpha" in version:
+        version_split = version.split("alpha")
+        subversion = version_split[1]
+        subtrahend = 2
+    elif "beta" in version:
+        version_split = version.split("beta")
+        subversion = version_split[1]
+        subtrahend = 1
     version = re.search(r'((?:\d+\.)+\d+)', version).group()
     # Split the groups on ".", take only the first one, and print each group 
with leading 0 if needed
     # To be safe, also handle cases where an extra group is added to the 
version string, or if one or two groups
     # are dropped.
     version_groups = (version.split('.') + [0, 0])[:3]
     version_str = "{:02}{:02}{:02}".format(*map(int, version_groups))
+    version_str = str((int(version_str, 10)-subtrahend))
+    # Adds a 2 digit subversion number for the subversionrelease.
+    subversion_str="00"
+    if subversion is not None and subversion is not "":
+        subversion = re.search(r'(?:\d+)', subversion).group()
+        subversion_groups = (subversion.split('-') + [0, 0])[:1]
+        subversion_str = "{:02}".format(*map(int, subversion_groups))
+    version_str+=subversion_str
     return int(version_str, 16)
 
 
@@ -132,7 +152,7 @@
             _LOGGER.warn('pkg-config failed to find tesseract/lept libraries: 
{}'.format(e))
         build_args = get_tesseract_version()
 
-    if build_args['cython_compile_time_env']['TESSERACT_VERSION'] >= 0x030502:
+    if build_args['cython_compile_time_env']['TESSERACT_VERSION'] >= 0x3050200:
         _LOGGER.debug('tesseract >= 03.05.02 requires c++11 compiler support')
         build_args['extra_compile_args'] = ['-std=c++11', 
'-DUSE_STD_NAMESPACE']
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tesserocr-2.3.1/tesseract.pxd 
new/tesserocr-2.4.0/tesseract.pxd
--- old/tesserocr-2.3.1/tesseract.pxd   2018-02-04 20:28:35.000000000 +0100
+++ new/tesserocr-2.4.0/tesseract.pxd   2018-11-30 16:01:37.000000000 +0100
@@ -1,5 +1,8 @@
 from libcpp cimport bool
+from libcpp.pair cimport pair
+from libcpp.vector cimport vector
 ctypedef const char cchar_t
+ctypedef const char * cchar_tp
 ctypedef const unsigned char cuchar_t
 
 cdef extern from "leptonica/allheaders.h" nogil:
@@ -139,27 +142,51 @@
         void ParagraphInfo(TessParagraphJustification *, bool *, bool *, int 
*) const
 
 cdef extern from "tesseract/ltrresultiterator.h" namespace "tesseract" nogil:
-    cdef cppclass LTRResultIterator(PageIterator):
-        char *GetUTF8Text(PageIteratorLevel) const
-        void SetLineSeparator(cchar_t *)
-        void SetParagraphSeparator(cchar_t *)
-        float Confidence(PageIteratorLevel) const
-        cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, 
bool *, int *, int *) const
-        cchar_t *WordRecognitionLanguage() const
-        StrongScriptDirection WordDirection() const
-        bool WordIsFromDictionary() const
-        bool WordIsNumeric() const
-        bool HasBlamerInfo() const
-        cchar_t *GetBlamerDebug() const
-        cchar_t *GetBlamerMisadaptionDebug() const
-        bool HasTruthString() const
-        bool EquivalentToTruth(cchar_t *) const
-        char *WordTruthUTF8Text() const
-        char *WordNormedUTF8Text() const
-        cchar_t *WordLattice(int *) const
-        bool SymbolIsSuperscript() const
-        bool SymbolIsSubscript() const
-        bool SymbolIsDropcap() const
+    IF TESSERACT_VERSION >= 0x4000000:
+        cdef cppclass LTRResultIterator(PageIterator):
+            char *GetUTF8Text(PageIteratorLevel) const
+            void SetLineSeparator(cchar_t *)
+            void SetParagraphSeparator(cchar_t *)
+            float Confidence(PageIteratorLevel) const
+            cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool 
*, bool *, int *, int *) const
+            cchar_t *WordRecognitionLanguage() const
+            StrongScriptDirection WordDirection() const
+            bool WordIsFromDictionary() const
+            int BlanksBeforeWord() const
+            bool WordIsNumeric() const
+            bool HasBlamerInfo() const
+            cchar_t *GetBlamerDebug() const
+            cchar_t *GetBlamerMisadaptionDebug() const
+            bool HasTruthString() const
+            bool EquivalentToTruth(cchar_t *) const
+            char *WordTruthUTF8Text() const
+            char *WordNormedUTF8Text() const
+            cchar_t *WordLattice(int *) const
+            bool SymbolIsSuperscript() const
+            bool SymbolIsSubscript() const
+            bool SymbolIsDropcap() const
+    ELSE:
+        cdef cppclass LTRResultIterator(PageIterator):
+            char *GetUTF8Text(PageIteratorLevel) const
+            void SetLineSeparator(cchar_t *)
+            void SetParagraphSeparator(cchar_t *)
+            float Confidence(PageIteratorLevel) const
+            cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool 
*, bool *, int *, int *) const
+            cchar_t *WordRecognitionLanguage() const
+            StrongScriptDirection WordDirection() const
+            bool WordIsFromDictionary() const
+            bool WordIsNumeric() const
+            bool HasBlamerInfo() const
+            cchar_t *GetBlamerDebug() const
+            cchar_t *GetBlamerMisadaptionDebug() const
+            bool HasTruthString() const
+            bool EquivalentToTruth(cchar_t *) const
+            char *WordTruthUTF8Text() const
+            char *WordNormedUTF8Text() const
+            cchar_t *WordLattice(int *) const
+            bool SymbolIsSuperscript() const
+            bool SymbolIsSubscript() const
+            bool SymbolIsDropcap() const
 
     cdef cppclass ChoiceIterator:
         ChoiceIterator(const LTRResultIterator &) except +
@@ -168,8 +195,13 @@
         float Confidence() const
 
 cdef extern from "tesseract/resultiterator.h" namespace "tesseract" nogil:
-    cdef cppclass ResultIterator(LTRResultIterator):
-        bool ParagraphIsLtr() const
+    IF TESSERACT_VERSION >= 0x4000000:
+        cdef cppclass ResultIterator(LTRResultIterator):
+            bool ParagraphIsLtr() const
+            vector[vector[pair[cchar_tp, float]]] *GetBestLSTMSymbolChoices() 
const
+    ELSE:
+        cdef cppclass ResultIterator(LTRResultIterator):
+            bool ParagraphIsLtr() const
 
 cdef extern from "tesseract/renderer.h" namespace "tesseract" nogil:
     cdef cppclass TessResultRenderer:
@@ -181,7 +213,7 @@
     cdef cppclass TessHOcrRenderer(TessResultRenderer):
         TessHOcrRenderer(cchar_t *, bool) except +
 
-    IF TESSERACT_VERSION >= 0x040000:
+    IF TESSERACT_VERSION >= 0x3999800:
         cdef cppclass TessPDFRenderer(TessResultRenderer):
             TessPDFRenderer(cchar_t *, cchar_t *, bool) except +
     ELSE:
@@ -194,7 +226,7 @@
     cdef cppclass TessBoxTextRenderer(TessResultRenderer):
         TessBoxTextRenderer(cchar_t *) except +
 
-    IF TESSERACT_VERSION >= 0x030401:
+    IF TESSERACT_VERSION >= 0x3040100:
         cdef cppclass TessOsdRenderer(TessResultRenderer):
             TessOsdRenderer(cchar_t *) except +
 
@@ -213,7 +245,7 @@
 
 cdef extern from "tesseract/baseapi.h" namespace "tesseract" nogil:
 
-    IF TESSERACT_VERSION >= 0x040000:
+    IF TESSERACT_VERSION >= 0x3999800:
         cdef enum OcrEngineMode:
             OEM_TESSERACT_ONLY
             OEM_LSTM_ONLY
@@ -253,7 +285,7 @@
         RIL_WORD,      # within a textline.
         RIL_SYMBOL     # character within a word.
 
-    IF TESSERACT_VERSION >= 0x040000:
+    IF TESSERACT_VERSION >= 0x3999800:
         cdef cppclass TessBaseAPI:
             TessBaseAPI() except +
             @staticmethod
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tesserocr-2.3.1/tesserocr.egg-info/PKG-INFO 
new/tesserocr-2.4.0/tesserocr.egg-info/PKG-INFO
--- old/tesserocr-2.3.1/tesserocr.egg-info/PKG-INFO     2018-08-13 
19:35:29.000000000 +0200
+++ new/tesserocr-2.4.0/tesserocr.egg-info/PKG-INFO     2018-12-05 
15:37:31.000000000 +0100
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tesserocr
-Version: 2.3.1
+Version: 2.4.0
 Summary: A simple, Pillow-friendly, Python wrapper around tesseract-ocr API 
using Cython
 Home-page: https://github.com/sirfz/tesserocr
 Author: Fayez Zouheiry
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tesserocr-2.3.1/tesserocr.pyx 
new/tesserocr-2.4.0/tesserocr.pyx
--- old/tesserocr-2.3.1/tesserocr.pyx   2018-08-13 19:35:16.000000000 +0200
+++ new/tesserocr-2.4.0/tesserocr.pyx   2018-11-30 16:04:36.000000000 +0100
@@ -18,7 +18,7 @@
  ['eng', 'osd', 'equ'])
 """
 
-__version__ = '2.3.1'
+__version__ = '2.4.0'
 
 import os
 from io import BytesIO
@@ -48,7 +48,7 @@
 cdef TessBaseAPI _api = TessBaseAPI()
 _api.SetVariable('debug_file', '/dev/null')  # suppress tesseract debug 
messages
 _api.Init(NULL, NULL)
-IF TESSERACT_VERSION >= 0x040000:
+IF TESSERACT_VERSION >= 0x3999800:
     cdef _DEFAULT_PATH = _api.GetDatapath()  # "tessdata/" is not appended by 
tesseract since commit dba13db
 ELSE:
     cdef _DEFAULT_PATH = abspath(join(_api.GetDatapath(), os.pardir)) + os.sep
@@ -84,7 +84,7 @@
     """
 
     TESSERACT_ONLY = OEM_TESSERACT_ONLY
-    IF TESSERACT_VERSION >= 0x040000:
+    IF TESSERACT_VERSION >= 0x3999800:
         LSTM_ONLY = OEM_LSTM_ONLY
         TESSERACT_LSTM_COMBINED = OEM_TESSERACT_LSTM_COMBINED
     ELSE:
@@ -896,6 +896,12 @@
         """Return True if the current word was found in a dictionary."""
         return self._ltrriter.WordIsFromDictionary()
 
+    IF TESSERACT_VERSION >= 0x4000000:
+        def BlanksBeforeWord(self):
+            """Return True if the current word is numeric."""
+            return self._ltrriter.BlanksBeforeWord()
+
+
     def WordIsNumeric(self):
         """Return True if the current word is numeric."""
         return self._ltrriter.WordIsNumeric()
@@ -1017,6 +1023,17 @@
         """
         return self._riter.ParagraphIsLtr()
 
+    IF TESSERACT_VERSION >= 0x4000000:
+        def GetBestLSTMSymbolChoices(self):
+            LSTMSymbolChoices = []
+            output = self._riter.GetBestLSTMSymbolChoices()[0]
+            for tstep in output:
+                timestep = []
+                for confpair in tstep:
+                    timestep.append((confpair.first, confpair.second))
+                LSTMSymbolChoices.append(timestep)
+            return LSTMSymbolChoices
+
 
 cdef class PyChoiceIterator:
 
@@ -1930,12 +1947,12 @@
         cdef:
             bool b
             bool font_info
-            IF TESSERACT_VERSION >= 0x040000:
+            IF TESSERACT_VERSION >= 0x3999800:
                 bool textonly
             TessResultRenderer *temp
             TessResultRenderer *renderer = NULL
 
-        IF TESSERACT_VERSION >= 0x030401:
+        IF TESSERACT_VERSION >= 0x3040100:
             if self._baseapi.GetPageSegMode() == PSM.OSD_ONLY:
                 renderer = new TessOsdRenderer(outputbase)
                 return renderer
@@ -1947,7 +1964,7 @@
 
         self._baseapi.GetBoolVariable("tessedit_create_pdf", &b)
         if b:
-            IF TESSERACT_VERSION >= 0x040000:
+            IF TESSERACT_VERSION >= 0x3999800:
                 self._baseapi.GetBoolVariable("textonly_pdf", &textonly)
                 temp = new TessPDFRenderer(outputbase, 
self._baseapi.GetDatapath(), textonly)
             ELSE:
@@ -2111,6 +2128,25 @@
                     raise RuntimeError('Failed to recognize. No image set?')
         return _free_str(text)
 
+    IF TESSERACT_VERSION >= 0x4000000:
+        def GetBestLSTMSymbolChoices(self):
+            """Return Symbol choices as multi-dimensional array of tupels. The
+            first dimension contains words. The second dimension contains the 
LSTM
+            timesteps of the respective word. They are either accumulated over
+            characters or pure which depends on the value set in 
lstm_choice_mode:
+            1 = pure; 2 = accumulated. The third dimension contains the symbols
+            and their probability as tupels for the respective timestep.
+            Returns an empty list if :meth:`Recognize` was not called first.
+            """
+            if self.GetVariableAsString("lstm_choice_mode") == "0":
+                raise RuntimeError('lstm_choice_mode Parameter is 0. Set it to 
1 or 2')
+            words = []
+            wi = self.GetIterator()
+            if wi:
+                for w in iterate_level(wi, RIL.WORD):
+                    words.append(w.GetBestLSTMSymbolChoices())
+            return words
+
     def GetHOCRText(self, int page_number):
         """Return a HTML-formatted string with hOCR markup from the internal
         data structures.
@@ -2127,7 +2163,7 @@
                     raise RuntimeError('Failed to recognize. No image set?')
         return _free_str(text)
 
-    IF TESSERACT_VERSION >= 0x040000:
+    IF TESSERACT_VERSION >= 0x3999800:
         def GetTSVText(self, int page_number):
             """Make a TSV-formatted string from the internal data structures.
 
@@ -2175,7 +2211,7 @@
                     raise RuntimeError('Failed to recognize. No image set?')
         return _free_str(text)
 
-    IF TESSERACT_VERSION >= 0x040000:
+    IF TESSERACT_VERSION >= 0x3999800:
         def DetectOrientationScript(self):
             """Detect the orientation of the input image and apparent script 
(alphabet).
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tesserocr-2.3.1/tests/test_api.py 
new/tesserocr-2.4.0/tests/test_api.py
--- old/tesserocr-2.3.1/tests/test_api.py       2018-08-13 18:09:30.000000000 
+0200
+++ new/tesserocr-2.4.0/tests/test_api.py       2018-11-30 16:23:48.000000000 
+0100
@@ -10,12 +10,32 @@
 
 
 def version_to_int(version):
+    subversion = None
+    subtrahend = 0
+    # Subtracts a certain amount from the version number to differentiate 
between
+    # alpha, beta and release versions.
+    if "alpha" in version:
+        version_split = version.split("alpha")
+        subversion = version_split[1]
+        subtrahend = 2
+    elif "beta" in version:
+        version_split = version.split("beta")
+        subversion = version_split[1]
+        subtrahend = 1
     version = re.search(r'((?:\d+\.)+\d+)', version).group()
     # Split the groups on ".", take only the first one, and print each group 
with leading 0 if needed
     # To be safe, also handle cases where an extra group is added to the 
version string, or if one or two groups
     # are dropped.
     version_groups = (version.split('.') + [0, 0])[:3]
     version_str = "{:02}{:02}{:02}".format(*map(int, version_groups))
+    version_str = str((int(version_str, 10) - subtrahend))
+    # Adds a 2 digit subversion number for the subversionrelease.
+    subversion_str = "00"
+    if subversion is not None and subversion is not "":
+        subversion = re.search(r'(?:\d+)', subversion).group()
+        subversion_groups = (subversion.split('-') + [0, 0])[:1]
+        subversion_str = "{:02}".format(*map(int, subversion_groups))
+    version_str += subversion_str
     return int(version_str, 16)
 
 
@@ -115,7 +135,7 @@
         path = self._api.GetDatapath()
         self._api.End()
         self.assertRaises(RuntimeError, self._api.Init, path=(self._test_dir + 
os.path.sep))  # no tessdata
-        if _TESSERACT_VERSION >= 0x040000:
+        if _TESSERACT_VERSION >= 0x3999800:
             new_path = path
         else:
             new_path = os.path.abspath(os.path.join(path, os.path.pardir)) + 
os.path.sep
@@ -158,6 +178,46 @@
         self.assertEqual([v[0] for v in mapped_confidences], words)
         self.assertEqual([v[1] for v in mapped_confidences], confidences)
 
+    @unittest.skipIf(_TESSERACT_VERSION < 0x4000000, "tesseract < 4")
+    def test_LSTM_choices(self):
+        """Test GetBestLSTMSymbolChoices."""
+        self._api.SetVariable("lstm_choice_mode", "2")
+        self._api.SetImageFile(self._image_file)
+        self._api.Recognize()
+        LSTM_choices = self._api.GetBestLSTMSymbolChoices()
+        words = self._api.AllWords()
+        self.assertEqual(len(words), len(LSTM_choices))
+
+        for choice, word in zip(LSTM_choices, words):
+            chosen_word = ""
+            for timestep in choice:
+                for alternative in timestep:
+                    self.assertGreaterEqual(alternative[1], 0.0)
+                    self.assertLessEqual(alternative[1], 2.0)
+                chosen_symbol = timestep[0][0]
+                if chosen_symbol != " ":
+                    chosen_word += chosen_symbol
+            self.assertEqual(chosen_word, word)
+
+    @unittest.skipIf(_TESSERACT_VERSION < 0x4000000, "tesseract < 4")
+    def test_result_iterator(self):
+        """Test result iterator."""
+        self._api.SetImageFile(self._image_file)
+        self._api.Recognize()
+        it = self._api.GetIterator()
+        level = tesserocr.RIL.WORD
+        for i, w in enumerate(tesserocr.iterate_level(it, level)):
+            text = w.GetUTF8Text(level)
+            blanks = w.BlanksBeforeWord()
+            if i == 0:
+                self.assertEqual(text, "The")
+                self.assertEqual(blanks, 0)
+            elif i == 1:
+                self.assertEqual(text, "(quick)")
+                self.assertEqual(blanks, 1)
+            else:
+                break
+
     def test_detect_os(self):
         """Test DetectOS and DetectOrientationScript (tesseract v4+)."""
         self._api.SetPageSegMode(tesserocr.PSM.OSD_ONLY)
@@ -166,7 +226,7 @@
         all(self.assertIn(k, orientation) for k in ['sconfidence', 
'oconfidence', 'script', 'orientation'])
         self.assertEqual(orientation['orientation'], 0)
         self.assertEqual(orientation['script'], 1)
-        if _TESSERACT_VERSION >= 0x040000:
+        if _TESSERACT_VERSION >= 0x3999800:
             orientation = self._api.DetectOrientationScript()
             all(self.assertIn(k, orientation) for k in ['orient_deg', 
'orient_conf', 'script_name', 'script_conf'])
             self.assertEqual(orientation['orient_deg'], 0)

commit python-tesserocr for openSUSE:Factory

Reply via email to