Hello community,
here is the log from the commit of package python-tesserocr for
openSUSE:Factory checked in at 2018-12-06 12:18:51
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-tesserocr (Old)
and /work/SRC/openSUSE:Factory/.python-tesserocr.new.19453 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-tesserocr"
Thu Dec 6 12:18:51 2018 rev:5 rq:655446 version:2.4.0
Changes:
--------
--- /work/SRC/openSUSE:Factory/python-tesserocr/python-tesserocr.changes
2018-08-15 10:37:40.500212953 +0200
+++
/work/SRC/openSUSE:Factory/.python-tesserocr.new.19453/python-tesserocr.changes
2018-12-06 12:18:55.481416268 +0100
@@ -1,0 +2,9 @@
+Wed Dec 5 23:35:00 UTC 2018 - Martin Herkt <[email protected]>
+
+- Update to version 2.4.0
+ Tesseract v4 new API methods supported:
+
+ * GetBestLSTMSymbolChoices
+ * BlanksBeforeWord
+
+-------------------------------------------------------------------
Old:
----
tesserocr-2.3.1.tar.gz
New:
----
tesserocr-2.4.0.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ python-tesserocr.spec ++++++
--- /var/tmp/diff_new_pack.BfgL89/_old 2018-12-06 12:18:56.265415426 +0100
+++ /var/tmp/diff_new_pack.BfgL89/_new 2018-12-06 12:18:56.265415426 +0100
@@ -12,13 +12,13 @@
# license that conforms to the Open Source Definition (Version 1.9)
# published by the Open Source Initiative.
-# Please submit bugfixes or comments via http://bugs.opensuse.org/
+# Please submit bugfixes or comments via https://bugs.opensuse.org/
#
%{?!python_module:%define python_module() python-%{**} python3-%{**}}
Name: python-tesserocr
-Version: 2.3.1
+Version: 2.4.0
Release: 0
Summary: A simple, Pillow-friendly, Python wrapper around tesseract-ocr
License: MIT
++++++ tesserocr-2.3.1.tar.gz -> tesserocr-2.4.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.3.1/PKG-INFO new/tesserocr-2.4.0/PKG-INFO
--- old/tesserocr-2.3.1/PKG-INFO 2018-08-13 19:35:30.000000000 +0200
+++ new/tesserocr-2.4.0/PKG-INFO 2018-12-05 15:37:32.000000000 +0100
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: tesserocr
-Version: 2.3.1
+Version: 2.4.0
Summary: A simple, Pillow-friendly, Python wrapper around tesseract-ocr API
using Cython
Home-page: https://github.com/sirfz/tesserocr
Author: Fayez Zouheiry
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.3.1/setup.py new/tesserocr-2.4.0/setup.py
--- old/tesserocr-2.3.1/setup.py 2018-08-13 17:58:16.000000000 +0200
+++ new/tesserocr-2.4.0/setup.py 2018-11-30 15:43:23.000000000 +0100
@@ -48,12 +48,32 @@
def version_to_int(version):
+ subversion = None
+ subtrahend = 0
+ # Subtracts a certain amount from the version number to differentiate
between
+ # alpha, beta and release versions.
+ if "alpha" in version:
+ version_split = version.split("alpha")
+ subversion = version_split[1]
+ subtrahend = 2
+ elif "beta" in version:
+ version_split = version.split("beta")
+ subversion = version_split[1]
+ subtrahend = 1
version = re.search(r'((?:\d+\.)+\d+)', version).group()
# Split the groups on ".", take only the first one, and print each group
with leading 0 if needed
# To be safe, also handle cases where an extra group is added to the
version string, or if one or two groups
# are dropped.
version_groups = (version.split('.') + [0, 0])[:3]
version_str = "{:02}{:02}{:02}".format(*map(int, version_groups))
+ version_str = str((int(version_str, 10)-subtrahend))
+ # Adds a 2 digit subversion number for the subversionrelease.
+ subversion_str="00"
+ if subversion is not None and subversion is not "":
+ subversion = re.search(r'(?:\d+)', subversion).group()
+ subversion_groups = (subversion.split('-') + [0, 0])[:1]
+ subversion_str = "{:02}".format(*map(int, subversion_groups))
+ version_str+=subversion_str
return int(version_str, 16)
@@ -132,7 +152,7 @@
_LOGGER.warn('pkg-config failed to find tesseract/lept libraries:
{}'.format(e))
build_args = get_tesseract_version()
- if build_args['cython_compile_time_env']['TESSERACT_VERSION'] >= 0x030502:
+ if build_args['cython_compile_time_env']['TESSERACT_VERSION'] >= 0x3050200:
_LOGGER.debug('tesseract >= 03.05.02 requires c++11 compiler support')
build_args['extra_compile_args'] = ['-std=c++11',
'-DUSE_STD_NAMESPACE']
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.3.1/tesseract.pxd
new/tesserocr-2.4.0/tesseract.pxd
--- old/tesserocr-2.3.1/tesseract.pxd 2018-02-04 20:28:35.000000000 +0100
+++ new/tesserocr-2.4.0/tesseract.pxd 2018-11-30 16:01:37.000000000 +0100
@@ -1,5 +1,8 @@
from libcpp cimport bool
+from libcpp.pair cimport pair
+from libcpp.vector cimport vector
ctypedef const char cchar_t
+ctypedef const char * cchar_tp
ctypedef const unsigned char cuchar_t
cdef extern from "leptonica/allheaders.h" nogil:
@@ -139,27 +142,51 @@
void ParagraphInfo(TessParagraphJustification *, bool *, bool *, int
*) const
cdef extern from "tesseract/ltrresultiterator.h" namespace "tesseract" nogil:
- cdef cppclass LTRResultIterator(PageIterator):
- char *GetUTF8Text(PageIteratorLevel) const
- void SetLineSeparator(cchar_t *)
- void SetParagraphSeparator(cchar_t *)
- float Confidence(PageIteratorLevel) const
- cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *,
bool *, int *, int *) const
- cchar_t *WordRecognitionLanguage() const
- StrongScriptDirection WordDirection() const
- bool WordIsFromDictionary() const
- bool WordIsNumeric() const
- bool HasBlamerInfo() const
- cchar_t *GetBlamerDebug() const
- cchar_t *GetBlamerMisadaptionDebug() const
- bool HasTruthString() const
- bool EquivalentToTruth(cchar_t *) const
- char *WordTruthUTF8Text() const
- char *WordNormedUTF8Text() const
- cchar_t *WordLattice(int *) const
- bool SymbolIsSuperscript() const
- bool SymbolIsSubscript() const
- bool SymbolIsDropcap() const
+ IF TESSERACT_VERSION >= 0x4000000:
+ cdef cppclass LTRResultIterator(PageIterator):
+ char *GetUTF8Text(PageIteratorLevel) const
+ void SetLineSeparator(cchar_t *)
+ void SetParagraphSeparator(cchar_t *)
+ float Confidence(PageIteratorLevel) const
+ cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool
*, bool *, int *, int *) const
+ cchar_t *WordRecognitionLanguage() const
+ StrongScriptDirection WordDirection() const
+ bool WordIsFromDictionary() const
+ int BlanksBeforeWord() const
+ bool WordIsNumeric() const
+ bool HasBlamerInfo() const
+ cchar_t *GetBlamerDebug() const
+ cchar_t *GetBlamerMisadaptionDebug() const
+ bool HasTruthString() const
+ bool EquivalentToTruth(cchar_t *) const
+ char *WordTruthUTF8Text() const
+ char *WordNormedUTF8Text() const
+ cchar_t *WordLattice(int *) const
+ bool SymbolIsSuperscript() const
+ bool SymbolIsSubscript() const
+ bool SymbolIsDropcap() const
+ ELSE:
+ cdef cppclass LTRResultIterator(PageIterator):
+ char *GetUTF8Text(PageIteratorLevel) const
+ void SetLineSeparator(cchar_t *)
+ void SetParagraphSeparator(cchar_t *)
+ float Confidence(PageIteratorLevel) const
+ cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool
*, bool *, int *, int *) const
+ cchar_t *WordRecognitionLanguage() const
+ StrongScriptDirection WordDirection() const
+ bool WordIsFromDictionary() const
+ bool WordIsNumeric() const
+ bool HasBlamerInfo() const
+ cchar_t *GetBlamerDebug() const
+ cchar_t *GetBlamerMisadaptionDebug() const
+ bool HasTruthString() const
+ bool EquivalentToTruth(cchar_t *) const
+ char *WordTruthUTF8Text() const
+ char *WordNormedUTF8Text() const
+ cchar_t *WordLattice(int *) const
+ bool SymbolIsSuperscript() const
+ bool SymbolIsSubscript() const
+ bool SymbolIsDropcap() const
cdef cppclass ChoiceIterator:
ChoiceIterator(const LTRResultIterator &) except +
@@ -168,8 +195,13 @@
float Confidence() const
cdef extern from "tesseract/resultiterator.h" namespace "tesseract" nogil:
- cdef cppclass ResultIterator(LTRResultIterator):
- bool ParagraphIsLtr() const
+ IF TESSERACT_VERSION >= 0x4000000:
+ cdef cppclass ResultIterator(LTRResultIterator):
+ bool ParagraphIsLtr() const
+ vector[vector[pair[cchar_tp, float]]] *GetBestLSTMSymbolChoices()
const
+ ELSE:
+ cdef cppclass ResultIterator(LTRResultIterator):
+ bool ParagraphIsLtr() const
cdef extern from "tesseract/renderer.h" namespace "tesseract" nogil:
cdef cppclass TessResultRenderer:
@@ -181,7 +213,7 @@
cdef cppclass TessHOcrRenderer(TessResultRenderer):
TessHOcrRenderer(cchar_t *, bool) except +
- IF TESSERACT_VERSION >= 0x040000:
+ IF TESSERACT_VERSION >= 0x3999800:
cdef cppclass TessPDFRenderer(TessResultRenderer):
TessPDFRenderer(cchar_t *, cchar_t *, bool) except +
ELSE:
@@ -194,7 +226,7 @@
cdef cppclass TessBoxTextRenderer(TessResultRenderer):
TessBoxTextRenderer(cchar_t *) except +
- IF TESSERACT_VERSION >= 0x030401:
+ IF TESSERACT_VERSION >= 0x3040100:
cdef cppclass TessOsdRenderer(TessResultRenderer):
TessOsdRenderer(cchar_t *) except +
@@ -213,7 +245,7 @@
cdef extern from "tesseract/baseapi.h" namespace "tesseract" nogil:
- IF TESSERACT_VERSION >= 0x040000:
+ IF TESSERACT_VERSION >= 0x3999800:
cdef enum OcrEngineMode:
OEM_TESSERACT_ONLY
OEM_LSTM_ONLY
@@ -253,7 +285,7 @@
RIL_WORD, # within a textline.
RIL_SYMBOL # character within a word.
- IF TESSERACT_VERSION >= 0x040000:
+ IF TESSERACT_VERSION >= 0x3999800:
cdef cppclass TessBaseAPI:
TessBaseAPI() except +
@staticmethod
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.3.1/tesserocr.egg-info/PKG-INFO
new/tesserocr-2.4.0/tesserocr.egg-info/PKG-INFO
--- old/tesserocr-2.3.1/tesserocr.egg-info/PKG-INFO 2018-08-13
19:35:29.000000000 +0200
+++ new/tesserocr-2.4.0/tesserocr.egg-info/PKG-INFO 2018-12-05
15:37:31.000000000 +0100
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: tesserocr
-Version: 2.3.1
+Version: 2.4.0
Summary: A simple, Pillow-friendly, Python wrapper around tesseract-ocr API
using Cython
Home-page: https://github.com/sirfz/tesserocr
Author: Fayez Zouheiry
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.3.1/tesserocr.pyx
new/tesserocr-2.4.0/tesserocr.pyx
--- old/tesserocr-2.3.1/tesserocr.pyx 2018-08-13 19:35:16.000000000 +0200
+++ new/tesserocr-2.4.0/tesserocr.pyx 2018-11-30 16:04:36.000000000 +0100
@@ -18,7 +18,7 @@
['eng', 'osd', 'equ'])
"""
-__version__ = '2.3.1'
+__version__ = '2.4.0'
import os
from io import BytesIO
@@ -48,7 +48,7 @@
cdef TessBaseAPI _api = TessBaseAPI()
_api.SetVariable('debug_file', '/dev/null') # suppress tesseract debug
messages
_api.Init(NULL, NULL)
-IF TESSERACT_VERSION >= 0x040000:
+IF TESSERACT_VERSION >= 0x3999800:
cdef _DEFAULT_PATH = _api.GetDatapath() # "tessdata/" is not appended by
tesseract since commit dba13db
ELSE:
cdef _DEFAULT_PATH = abspath(join(_api.GetDatapath(), os.pardir)) + os.sep
@@ -84,7 +84,7 @@
"""
TESSERACT_ONLY = OEM_TESSERACT_ONLY
- IF TESSERACT_VERSION >= 0x040000:
+ IF TESSERACT_VERSION >= 0x3999800:
LSTM_ONLY = OEM_LSTM_ONLY
TESSERACT_LSTM_COMBINED = OEM_TESSERACT_LSTM_COMBINED
ELSE:
@@ -896,6 +896,12 @@
"""Return True if the current word was found in a dictionary."""
return self._ltrriter.WordIsFromDictionary()
+ IF TESSERACT_VERSION >= 0x4000000:
+ def BlanksBeforeWord(self):
+ """Return True if the current word is numeric."""
+ return self._ltrriter.BlanksBeforeWord()
+
+
def WordIsNumeric(self):
"""Return True if the current word is numeric."""
return self._ltrriter.WordIsNumeric()
@@ -1017,6 +1023,17 @@
"""
return self._riter.ParagraphIsLtr()
+ IF TESSERACT_VERSION >= 0x4000000:
+ def GetBestLSTMSymbolChoices(self):
+ LSTMSymbolChoices = []
+ output = self._riter.GetBestLSTMSymbolChoices()[0]
+ for tstep in output:
+ timestep = []
+ for confpair in tstep:
+ timestep.append((confpair.first, confpair.second))
+ LSTMSymbolChoices.append(timestep)
+ return LSTMSymbolChoices
+
cdef class PyChoiceIterator:
@@ -1930,12 +1947,12 @@
cdef:
bool b
bool font_info
- IF TESSERACT_VERSION >= 0x040000:
+ IF TESSERACT_VERSION >= 0x3999800:
bool textonly
TessResultRenderer *temp
TessResultRenderer *renderer = NULL
- IF TESSERACT_VERSION >= 0x030401:
+ IF TESSERACT_VERSION >= 0x3040100:
if self._baseapi.GetPageSegMode() == PSM.OSD_ONLY:
renderer = new TessOsdRenderer(outputbase)
return renderer
@@ -1947,7 +1964,7 @@
self._baseapi.GetBoolVariable("tessedit_create_pdf", &b)
if b:
- IF TESSERACT_VERSION >= 0x040000:
+ IF TESSERACT_VERSION >= 0x3999800:
self._baseapi.GetBoolVariable("textonly_pdf", &textonly)
temp = new TessPDFRenderer(outputbase,
self._baseapi.GetDatapath(), textonly)
ELSE:
@@ -2111,6 +2128,25 @@
raise RuntimeError('Failed to recognize. No image set?')
return _free_str(text)
+ IF TESSERACT_VERSION >= 0x4000000:
+ def GetBestLSTMSymbolChoices(self):
+ """Return Symbol choices as multi-dimensional array of tupels. The
+ first dimension contains words. The second dimension contains the
LSTM
+ timesteps of the respective word. They are either accumulated over
+ characters or pure which depends on the value set in
lstm_choice_mode:
+ 1 = pure; 2 = accumulated. The third dimension contains the symbols
+ and their probability as tupels for the respective timestep.
+ Returns an empty list if :meth:`Recognize` was not called first.
+ """
+ if self.GetVariableAsString("lstm_choice_mode") == "0":
+ raise RuntimeError('lstm_choice_mode Parameter is 0. Set it to
1 or 2')
+ words = []
+ wi = self.GetIterator()
+ if wi:
+ for w in iterate_level(wi, RIL.WORD):
+ words.append(w.GetBestLSTMSymbolChoices())
+ return words
+
def GetHOCRText(self, int page_number):
"""Return a HTML-formatted string with hOCR markup from the internal
data structures.
@@ -2127,7 +2163,7 @@
raise RuntimeError('Failed to recognize. No image set?')
return _free_str(text)
- IF TESSERACT_VERSION >= 0x040000:
+ IF TESSERACT_VERSION >= 0x3999800:
def GetTSVText(self, int page_number):
"""Make a TSV-formatted string from the internal data structures.
@@ -2175,7 +2211,7 @@
raise RuntimeError('Failed to recognize. No image set?')
return _free_str(text)
- IF TESSERACT_VERSION >= 0x040000:
+ IF TESSERACT_VERSION >= 0x3999800:
def DetectOrientationScript(self):
"""Detect the orientation of the input image and apparent script
(alphabet).
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tesserocr-2.3.1/tests/test_api.py
new/tesserocr-2.4.0/tests/test_api.py
--- old/tesserocr-2.3.1/tests/test_api.py 2018-08-13 18:09:30.000000000
+0200
+++ new/tesserocr-2.4.0/tests/test_api.py 2018-11-30 16:23:48.000000000
+0100
@@ -10,12 +10,32 @@
def version_to_int(version):
+ subversion = None
+ subtrahend = 0
+ # Subtracts a certain amount from the version number to differentiate
between
+ # alpha, beta and release versions.
+ if "alpha" in version:
+ version_split = version.split("alpha")
+ subversion = version_split[1]
+ subtrahend = 2
+ elif "beta" in version:
+ version_split = version.split("beta")
+ subversion = version_split[1]
+ subtrahend = 1
version = re.search(r'((?:\d+\.)+\d+)', version).group()
# Split the groups on ".", take only the first one, and print each group
with leading 0 if needed
# To be safe, also handle cases where an extra group is added to the
version string, or if one or two groups
# are dropped.
version_groups = (version.split('.') + [0, 0])[:3]
version_str = "{:02}{:02}{:02}".format(*map(int, version_groups))
+ version_str = str((int(version_str, 10) - subtrahend))
+ # Adds a 2 digit subversion number for the subversionrelease.
+ subversion_str = "00"
+ if subversion is not None and subversion is not "":
+ subversion = re.search(r'(?:\d+)', subversion).group()
+ subversion_groups = (subversion.split('-') + [0, 0])[:1]
+ subversion_str = "{:02}".format(*map(int, subversion_groups))
+ version_str += subversion_str
return int(version_str, 16)
@@ -115,7 +135,7 @@
path = self._api.GetDatapath()
self._api.End()
self.assertRaises(RuntimeError, self._api.Init, path=(self._test_dir +
os.path.sep)) # no tessdata
- if _TESSERACT_VERSION >= 0x040000:
+ if _TESSERACT_VERSION >= 0x3999800:
new_path = path
else:
new_path = os.path.abspath(os.path.join(path, os.path.pardir)) +
os.path.sep
@@ -158,6 +178,46 @@
self.assertEqual([v[0] for v in mapped_confidences], words)
self.assertEqual([v[1] for v in mapped_confidences], confidences)
+ @unittest.skipIf(_TESSERACT_VERSION < 0x4000000, "tesseract < 4")
+ def test_LSTM_choices(self):
+ """Test GetBestLSTMSymbolChoices."""
+ self._api.SetVariable("lstm_choice_mode", "2")
+ self._api.SetImageFile(self._image_file)
+ self._api.Recognize()
+ LSTM_choices = self._api.GetBestLSTMSymbolChoices()
+ words = self._api.AllWords()
+ self.assertEqual(len(words), len(LSTM_choices))
+
+ for choice, word in zip(LSTM_choices, words):
+ chosen_word = ""
+ for timestep in choice:
+ for alternative in timestep:
+ self.assertGreaterEqual(alternative[1], 0.0)
+ self.assertLessEqual(alternative[1], 2.0)
+ chosen_symbol = timestep[0][0]
+ if chosen_symbol != " ":
+ chosen_word += chosen_symbol
+ self.assertEqual(chosen_word, word)
+
+ @unittest.skipIf(_TESSERACT_VERSION < 0x4000000, "tesseract < 4")
+ def test_result_iterator(self):
+ """Test result iterator."""
+ self._api.SetImageFile(self._image_file)
+ self._api.Recognize()
+ it = self._api.GetIterator()
+ level = tesserocr.RIL.WORD
+ for i, w in enumerate(tesserocr.iterate_level(it, level)):
+ text = w.GetUTF8Text(level)
+ blanks = w.BlanksBeforeWord()
+ if i == 0:
+ self.assertEqual(text, "The")
+ self.assertEqual(blanks, 0)
+ elif i == 1:
+ self.assertEqual(text, "(quick)")
+ self.assertEqual(blanks, 1)
+ else:
+ break
+
def test_detect_os(self):
"""Test DetectOS and DetectOrientationScript (tesseract v4+)."""
self._api.SetPageSegMode(tesserocr.PSM.OSD_ONLY)
@@ -166,7 +226,7 @@
all(self.assertIn(k, orientation) for k in ['sconfidence',
'oconfidence', 'script', 'orientation'])
self.assertEqual(orientation['orientation'], 0)
self.assertEqual(orientation['script'], 1)
- if _TESSERACT_VERSION >= 0x040000:
+ if _TESSERACT_VERSION >= 0x3999800:
orientation = self._api.DetectOrientationScript()
all(self.assertIn(k, orientation) for k in ['orient_deg',
'orient_conf', 'script_name', 'script_conf'])
self.assertEqual(orientation['orient_deg'], 0)