Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package python-pyocr for openSUSE:Factory checked in at 2021-08-28 22:29:33 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/python-pyocr (Old) and /work/SRC/openSUSE:Factory/.python-pyocr.new.1899 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-pyocr" Sat Aug 28 22:29:33 2021 rev:5 rq:914761 version:0.8 Changes: -------- --- /work/SRC/openSUSE:Factory/python-pyocr/python-pyocr.changes 2019-09-11 10:36:43.503273026 +0200 +++ /work/SRC/openSUSE:Factory/.python-pyocr.new.1899/python-pyocr.changes 2021-08-28 22:29:55.734026009 +0200 @@ -1,0 +2,9 @@ +Thu Aug 26 10:41:00 UTC 2021 - John Paul Adrian Glaubitz <[email protected]> + +- Update to 0.8: + * Replaced libtesseract.image_to_pdf() by an object-oriented API that allows + creating PDF with more than 1 page (thanks to Matthias Kraus). + * Tesseract 4 + sys.frozen=True: Fix TESSDATA_PREFIX: starting with + Tesseract 4, the path must include tessdata/ + +------------------------------------------------------------------- Old: ---- python-pyocr-0.7.2.tar.gz New: ---- python-pyocr-0.8.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-pyocr.spec ++++++ --- /var/tmp/diff_new_pack.ZVIiQh/_old 2021-08-28 22:29:56.114026431 +0200 +++ /var/tmp/diff_new_pack.ZVIiQh/_new 2021-08-28 22:29:56.118026436 +0200 @@ -1,7 +1,7 @@ # # spec file for package python-pyocr # -# Copyright (c) 2019 SUSE LINUX GmbH, Nuernberg, Germany. 
+# Copyright (c) 2021 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -16,11 +16,11 @@ # -%define sha f4b068cdf359186bfbed36959c53e9e52e2eda84 +%define sha f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc %define skip_python2 1 %{?!python_module:%define python_module() python-%{**} python3-%{**}} Name: python-pyocr -Version: 0.7.2 +Version: 0.8 Release: 0 Summary: Python wrapper for OCR engines License: GPL-3.0-or-later ++++++ python-pyocr-0.7.2.tar.gz -> python-pyocr-0.8.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/.git_archival.txt new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/.git_archival.txt --- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/.git_archival.txt 2019-06-22 20:10:54.000000000 +0200 +++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/.git_archival.txt 2021-01-01 16:59:54.000000000 +0100 @@ -1 +1 @@ -ref-names: tag: 0.7.2, refs/keep-around/f4b068cdf359186bfbed36959c53e9e52e2eda84 +ref-names: tag: 0.8, refs/keep-around/f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/.gitlab-ci.yml new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/.gitlab-ci.yml --- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/.gitlab-ci.yml 2019-06-22 20:10:54.000000000 +0200 +++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/.gitlab-ci.yml 2021-01-01 16:59:54.000000000 +0100 @@ -13,6 +13,9 @@ check: + only: + - branches@World/OpenPaperwork/pyocr + - tags@World/OpenPaperwork/pyocr tags: - linux - volatile @@ -23,12 +26,15 @@ test: + only: + - branches@World/OpenPaperwork/pyocr + - tags@World/OpenPaperwork/pyocr tags: - linux - volatile <<: *apt script: - - apt-get install -y -qq 
python-tox + - apt-get install -y -qq tox # required for Pillow - apt-get install -y -qq zlib1g-dev - apt-get install -y -qq libjpeg-dev diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/ChangeLog new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/ChangeLog --- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/ChangeLog 2019-06-22 20:10:54.000000000 +0200 +++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/ChangeLog 2021-01-01 16:59:54.000000000 +0100 @@ -1,3 +1,9 @@ +01/01/2020 - 0.8.0: +- Replaced libtesseract.image_to_pdf() by an object-oriented API that allows + creating PDF with more than 1 page (thanks to Matthias Kraus). +- Tesseract 4 + sys.frozen=True: Fix TESSDATA_PREFIX: starting with + Tesseract 4, the path must include tessdata/ + 22/06/2019 - 0.7.2: - Fix setup.py on Windows diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/README.md new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/README.md --- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/README.md 2019-06-22 20:10:54.000000000 +0200 +++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/README.md 2021-01-01 16:59:54.000000000 +0100 @@ -249,11 +249,28 @@ import PIL.Image import pyocr -pyocr.libtesseract.image_to_pdf( - PIL.Image.open("image.jpg"), - "output_filename" # .pdf will be appended -) +image = PIL.Image.open("image.jpg") +builder = pyocr.libtesseract.LibtesseractPdfBuilder() +builder.add_image(image) # multiple images are added as separate pages +builder.set_lang("deu") # optional +builder.set_output_file("output_filename") # .pdf will be appended +builder.build() +``` + +#### Add text layer to PDF + +```Python +import pyocr +import pdf2image + +images = pdf2image.convert_from_path("file.pdf", dpi=200, fmt='jpg') + +builder = 
pyocr.libtesseract.LibtesseractPdfBuilder() +for image in images: + builder.add_image(image) +builder.set_output_file("output") # .pdf will be appended +builder.build() ``` Beware this code hasn't been adapted to libtesseract 3 yet. @@ -296,7 +313,7 @@ ## Contact -* [Mailing-list](https://gitlab.gnome.org/World/OpenPaperwork/paperwork/wikis/Contact#mailing-list) +* [Forum](https://forum.openpaper.work/) * [Bug tracker](https://gitlab.gnome.org/World/OpenPaperwork/pyocr/issues) @@ -307,7 +324,7 @@ * [Paperwork](https://gitlab.gnome.org/World/OpenPaperwork/paperwork#readme) If you know of any other applications that use Pyocr, please -[tell us](https://gitlab.gnome.org/World/OpenPaperwork/paperwork/wikis/Contact#mailing-list) :-) +[tell us](https://forum.openpaper.work/) :-) ## Copyright diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/setup.py new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/setup.py --- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/setup.py 2019-06-22 20:10:54.000000000 +0200 +++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/setup.py 2021-01-01 16:59:54.000000000 +0100 @@ -54,8 +54,12 @@ setup( name="pyocr", - description=("A Python wrapper for OCR engines (Tesseract, Cuneiform," - " etc)"), + description=( + "A Python wrapper for OCR engines (Tesseract, Cuneiform, etc)" + ), + long_description=( + "A Python wrapper for OCR engines (Tesseract, Cuneiform, etc)" + ), keywords="tesseract cuneiform ocr", version=version, url="https://gitlab.gnome.org/World/OpenPaperwork/pyocr", @@ -87,7 +91,7 @@ }, data_files=[], scripts=[], - zip_safe=True, + zip_safe=(os.name != 'nt'), python_requires='>=3.4', install_requires=[ "Pillow", diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/src/pyocr/libtesseract/__init__.py 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/src/pyocr/libtesseract/__init__.py --- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/src/pyocr/libtesseract/__init__.py 2019-06-22 20:10:54.000000000 +0200 +++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/src/pyocr/libtesseract/__init__.py 2021-01-01 16:59:54.000000000 +0100 @@ -195,29 +195,96 @@ textonly: create pdf with only one invisible text layer. Defaults to False. ''' - handle = tesseract_raw.init(lang=lang) - renderer = None - try: - tesseract_raw.set_image(handle, image) - tesseract_raw.set_page_seg_mode( - handle, tesseract_raw.PageSegMode.AUTO_OSD - ) - - tesseract_raw.set_input_name(handle, input_file) - tesseract_raw.recognize(handle) - - renderer = tesseract_raw.init_pdf_renderer( - handle, output_file, textonly - ) - assert(renderer) - - tesseract_raw.begin_document(renderer, "") - tesseract_raw.add_renderer_image(handle, renderer) - tesseract_raw.end_document(renderer) - finally: - tesseract_raw.cleanup(handle) - if renderer: - tesseract_raw.cleanup(renderer) + LibtesseractPdfBuilder()\ + .set_lang(lang)\ + .set_output_file(output_file)\ + .set_text_only(textonly)\ + .add_image(image)\ + .build() + + +class LibtesseractPdfBuilder(object): + ''' + Creates a pdf file with embeded text based on OCR from one or more images. + ''' + + def __init__(self): + self.images = [] + self.output_file = None + self.lang = None + self.text_only = False + + def set_lang(self, lang): + ''' + Language to be used for ocr. + :param lang: three letter language code. For available languages see + https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages. + Defaults to None. + ''' + self.lang = lang + return self + + def set_output_file(self, output_file): + self.output_file = output_file + return self + + def set_text_only(self, text_only): + ''' + :param text_only: create pdf with only one invisible text layer. + Defaults to False. 
+ ''' + self.text_only = text_only + return self + + def add_image(self, img): + ''' + Add an image to be converted to a page in the pdf + :param img: image to convert + ''' + self.images.append(img) # or something else + return self + + def __validate(self): + if len(self.images) < 1: + raise ValueError( + "At least one image is required to build the pdf!" + ) + + if self.output_file is None: + raise ValueError("An output-file is required to build the pdf!") + + def build(self): + ''' + Create and write PDF file. + ''' + self.__validate() + + handle = tesseract_raw.init(lang=self.lang) + renderer = None + try: + tesseract_raw.set_page_seg_mode( + handle, tesseract_raw.PageSegMode.AUTO_OSD + ) + + renderer = tesseract_raw.init_pdf_renderer( + handle, self.output_file, self.text_only + ) + assert renderer + + tesseract_raw.begin_document(renderer, "") + + for image in self.images: + tesseract_raw.set_image(handle, image) + + # tesseract_raw.set_input_name(handle, input_file) + tesseract_raw.recognize(handle) + + tesseract_raw.add_renderer_image(handle, renderer) + tesseract_raw.end_document(renderer) + finally: + tesseract_raw.cleanup(handle) + if renderer: + tesseract_raw.cleanup(renderer) def is_available(): diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/src/pyocr/libtesseract/tesseract_raw.py new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/src/pyocr/libtesseract/tesseract_raw.py --- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/src/pyocr/libtesseract/tesseract_raw.py 2019-06-22 20:10:54.000000000 +0200 +++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/src/pyocr/libtesseract/tesseract_raw.py 2021-01-01 16:59:54.000000000 +0100 @@ -28,7 +28,7 @@ ) ) else: - TESSDATA_PREFIX = tessdata + TESSDATA_PREFIX = os.path.join(tessdata, "tessdata") if sys.platform[:3] == "win": # pragma: no cover diff -urN '--exclude=CVS' '--exclude=.cvsignore' 
'--exclude=.svn' '--exclude=.svnignore' old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/src/pyocr/tesseract.py new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/src/pyocr/tesseract.py --- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/src/pyocr/tesseract.py 2019-06-22 20:10:54.000000000 +0200 +++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/src/pyocr/tesseract.py 2021-01-01 16:59:54.000000000 +0100 @@ -119,8 +119,7 @@ if getattr(sys, 'frozen', False): # pragma: no cover # Pyinstaller support - path = os.environ["PATH"] - if sys._MEIPASS in path: + if 'TESSDATA_PREFIX' in os.environ: # already changed return @@ -128,15 +127,6 @@ tessprefix = os.path.join(sys._MEIPASS, "data") logger.info("Running in packaged environment") - if not os.path.exists(os.path.join(tessprefix, "tessdata")): - logger.warning( - "Running from container, but no tessdata ({}) found !".format( - tessprefix - ) - ) - else: - logger.info("TESSDATA_PREFIX set to [{}]".format(tessprefix)) - os.environ['TESSDATA_PREFIX'] = tessprefix if not os.path.exists(tesspath): logger.warning( "Running from container, but no tesseract ({}) found !".format( @@ -149,6 +139,19 @@ tesspath + os.pathsep + os.environ['PATH'] ) + if not os.path.exists(os.path.join(tessprefix, "tessdata")): + logger.warning( + "Running from container, but no tessdata ({}) found !".format( + tessprefix + ) + ) + else: + version = get_version(set_env=False) + if version[0] > 3: + tessprefix = os.path.join(tessprefix, "tessdata") + logger.info("TESSDATA_PREFIX set to [{}]".format(tessprefix)) + os.environ['TESSDATA_PREFIX'] = tessprefix + def can_detect_orientation(): version = get_version() @@ -413,7 +416,7 @@ return [lang for lang in langs if lang and lang[-1] != ':'] -def get_version(): +def get_version(set_env=True): """ Returns Tesseract version. 
@@ -428,15 +431,15 @@ if g_version is not None: return g_version - _set_environment() + if set_env: + _set_environment() command = [TESSERACT_CMD, "-v"] proc = subprocess.Popen(command, startupinfo=g_subprocess_startup_info, creationflags=g_creation_flags, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) + stdout=subprocess.PIPE) ver_string = proc.stdout.read() ver_string = ver_string.decode('utf-8') ret = proc.wait() diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/tests/tests_libtesseract.py new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/tests/tests_libtesseract.py --- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/tests/tests_libtesseract.py 2019-06-22 20:10:54.000000000 +0200 +++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/tests/tests_libtesseract.py 2021-01-01 16:59:54.000000000 +0100 @@ -1379,11 +1379,11 @@ def setUp(self): self.image = Image.new(mode="RGB", size=(1, 1)) - self.handle = randint(0, 2**32-1) + self.handle = 1234567 @patch("pyocr.libtesseract.tesseract_raw") def test_pdf(self, raw): - renderer = randint(0, 2**32-1) + renderer = 2345671 raw.init.return_value = self.handle raw.init_pdf_renderer.return_value = renderer libtesseract.image_to_pdf(self.image, "output") @@ -1393,7 +1393,6 @@ raw.set_page_seg_mode.assert_called_once_with( self.handle, raw.PageSegMode.AUTO_OSD ) - raw.set_input_name.assert_called_once_with(self.handle, "stdin") raw.recognize.assert_called_once_with(self.handle) raw.init_pdf_renderer.assert_called_once_with( self.handle, "output", False @@ -1408,6 +1407,41 @@ ) @patch("pyocr.libtesseract.tesseract_raw") + def test_multipage_pdf(self, raw): + renderer = 2345671 + raw.init.return_value = self.handle + raw.init_pdf_renderer.return_value = renderer + libtesseract.LibtesseractPdfBuilder() \ + .set_output_file("output")\ + .add_image(self.image)\ + .add_image(self.image)\ + .build() + + 
raw.init.assert_called_once_with(lang=None) + raw.set_image.assert_called_with(self.handle, self.image) + raw.set_image.assert_called_with(self.handle, self.image) + raw.set_page_seg_mode.assert_called_once_with( + self.handle, raw.PageSegMode.AUTO_OSD + ) + raw.recognize.assert_called_with(self.handle) + raw.recognize.assert_called_with(self.handle) + raw.init_pdf_renderer.assert_called_once_with( + self.handle, "output", False + ) + raw.begin_document.assert_called_once_with(renderer, "") + raw.add_renderer_image.assert_called_with( + self.handle, renderer + ) + raw.add_renderer_image.assert_called_with( + self.handle, renderer + ) + raw.end_document.assert_called_once_with(renderer) + self.assertListEqual( + raw.cleanup.call_args_list, + [call(self.handle), call(renderer)] + ) + + @patch("pyocr.libtesseract.tesseract_raw") def test_pdf_renderer_error(self, raw): renderer = None raw.init.return_value = self.handle @@ -1417,15 +1451,15 @@ libtesseract.image_to_pdf(self.image, "output") raw.init.assert_called_once_with(lang=None) - raw.set_image.assert_called_once_with(self.handle, self.image) raw.set_page_seg_mode.assert_called_once_with( self.handle, raw.PageSegMode.AUTO_OSD ) - raw.set_input_name.assert_called_once_with(self.handle, "stdin") - raw.recognize.assert_called_once_with(self.handle) raw.init_pdf_renderer.assert_called_once_with( self.handle, "output", False ) + self.assertFalse(raw.set_image.called) + self.assertFalse(raw.set_input_name.called) + self.assertFalse(raw.recognize.called) self.assertFalse(raw.begin_document.called) self.assertFalse(raw.add_renderer_image.called) self.assertFalse(raw.end_document.called) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/tests/tests_tesseract.py new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/tests/tests_tesseract.py --- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/tests/tests_tesseract.py 
2019-06-22 20:10:54.000000000 +0200 +++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/tests/tests_tesseract.py 2021-01-01 16:59:54.000000000 +0100 @@ -54,6 +54,12 @@ popen.return_value = self.stdout self.assertSequenceEqual(tesseract.get_version(), (4, 0, 0)) + # stderr must be explicitly ignored when calling 'tesseract -v'. + # See https://gitlab.gnome.org/World/OpenPaperwork/pyocr/-/issues/118 + popen.assert_called_once() + (args, kwargs) = popen.call_args + self.assertNotIn('stderr', kwargs) + @patch("subprocess.Popen") def test_version_tesseract4dev(self, popen): tesseract.g_version = None # drop cached version
