[MediaWiki-commits] [Gerrit] pywikibot/core[master]: djvu.py: added features and refactored

Mpaa (Code Review) Sat, 25 Feb 2017 14:24:07 -0800

Mpaa has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/339906 )


Change subject: djvu.py: added features and refactored
......................................................................

djvu.py: added features and refactored

Added:
- cache control decorator
- page number check decorator
- retrieval of page info from djvu file

Refactored:
- tiny wrapper of subprocess Popen() to reduce code repetition
- unicode support of filenames

Added tests.

Change-Id: Idf465abe0f9aab3d7c213098ae02335269740ecf
---
A djvu_ext.py
M pywikibot/tools/djvu.py
R tests/data/djvu/myfilé.djvu
M tests/djvu_tests.py
4 files changed, 309 insertions(+), 45 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core 
refs/changes/06/339906/1

diff --git a/djvu_ext.py b/djvu_ext.py
new file mode 100644
index 0000000..abe5e89
--- /dev/null
+++ b/djvu_ext.py
@@ -0,0 +1,95 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Wrapper around djvulibre to access djvu files properties and content."""
+from __future__ import absolute_import, unicode_literals
+
+import collections
+import os
+import re
+
+import pywikibot
+import pywikibot.bot
+import pywikibot.logging
+
+from pywikibot.tools.djvu import DjVuFile, call_cmd
+
+_logger = 'djvu_ext'
+pywikibot.bot.init_handlers()
+
+
+class MyDjVuFile(DjVuFile):
+
+    """Extend class DjVuFile(object)."""
+
+    def __init__(self, *args, **kwargs):
+        """Constructor."""
+        super(MyDjVuFile, self).__init__(*args, **kwargs)
+
+        # tmp files for creation/insertion of a white page.
+        self._white_ppm = os.path.join(self.dirname, 'white_page.ppm')
+        self._white_djvu = os.path.join(self.dirname, 'white_page.djvu')
+
+    def replace_first_page(self):
+        """Replace first page of djvu file with blank page."""
+        size, dpi = self.get_most_common_info()
+
+        # Generate white_page.
+        res, data = call_cmd(['convert', '-size', size, 'xc:white', 
self._white_ppm],
+                             lib='ImageMagik')
+        if not res:
+            return False
+
+        # Convert white_page to djvu.
+        res, data = call_cmd(['c44', self._white_ppm, '-dpi', dpi])
+        os.unlink(self._white_ppm)  # rm white_page.ppm before returning.
+        if not res:
+            return False
+
+        # Delete first page.
+        # Get ids of first two pages for later checks.
+        n = self.number_of_images()
+        id1, _ = self.page_info(1)
+        info_p2 = self.page_info(2)
+        res, data = call_cmd(['djvm', '-d', self.file, '1'])
+        if not res:
+            return False
+
+        # insert new first page
+        res, data = call_cmd(['djvm', '-i', self.file, self._white_djvu, '1'])
+        os.unlink(self._white_djvu)  # rm white_page.djvu before returning.
+        if not res:
+            return False
+
+        # Check if page processing is as expected.
+        expected_id = '{%s}' % os.path.basename(self._white_djvu)
+        assert self.number_of_images(force=True) == n
+        assert self.page_info(1) == (expected_id, (size, dpi))  # white page 
id.
+        assert self.page_info(2) == info_p2  # original 2nd page info.
+
+        return True
+
+
+file = u'Fréud_-_Reflections_on_war_and_death.djvu'
+
+args = ['djvudump', file]
+call_cmd(['djvudump', file], lib='djvulibre')
+
+args = ['djvudump', file + '0']
+call_cmd(args, lib='djvulibre')
+
+djvuf = MyDjVuFile(file)
+res = djvuf.page_info(3)
+print(res)
+
+try:
+    res = djvuf.page_info(-1)
+except ValueError as e:
+    pywikibot.output(e)
+else:
+    print(res)
+
+sd = djvuf.get_most_common_info()
+print(sd)
+djvuf.replace_first_page()
+cnt = collections.Counter(s_d for _, s_d in djvuf._get_page_info().values())
+print(cnt)
\ No newline at end of file
diff --git a/pywikibot/tools/djvu.py b/pywikibot/tools/djvu.py
index da7c595..8756fec 100644
--- a/pywikibot/tools/djvu.py
+++ b/pywikibot/tools/djvu.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 """Wrapper around djvulibre to access djvu files properties and content."""
 #
-# (C) Pywikibot team, 2015-2016
+# (C) Pywikibot team, 2015-2017
 #
 # Distributed under the terms of the MIT license.
 #
@@ -10,10 +10,55 @@
 
 __version__ = '$Id$'
 
-import os.path
+import collections
+import os
+import re
 import subprocess
+import sys
 
-from pywikibot.logging import error
+import pywikibot
+
+if sys.version_info[0] > 2:
+    basestring = (str,)
+    unicode = str
+
+
+def call_cmd(args, lib='djvulibre'):
+    """
+    Tiny wrapper around subprocess.Popen().
+
+    @param args: same as Popen()
+    @type args: sequence or string
+
+    @param lib: library to be logged in logging messages
+    @type lib: string
+
+    @param log: log process output; errors are always logged.
+    @type library: bool
+
+
+    @return: returns a tuple (res, stdoutdata), where
+        res is True if dp.returncode == 0 else False
+    """
+    if not isinstance(args, basestring):
+        # upcast if any param in args is int
+        args = [unicode(el) for el in args]
+        cmd = ' '.join(args)
+    else:
+        cmd = args
+
+    dp = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdoutdata, stderrdata = dp.communicate()
+    dp.wait()
+
+    if dp.returncode != 0:
+        pywikibot.error('{0} error; {1}'.format(lib, cmd))
+        pywikibot.error('{0}'.format(stderrdata))
+        return (False, stdoutdata)
+
+    pywikibot.log('SUCCESS: {0} (PID: {1})'.format(cmd, dp.pid))
+
+    return (True, stdoutdata)
 
 
 class DjVuFile(object):
@@ -28,39 +73,137 @@
 
     """
 
-    def __init__(self, file_djvu):
+    def __init__(self, file):
         """
         Constructor.
 
-        @param file_djvu: filename (including path) to djvu file
-        @type file_djvu: string/unicode
+        @param file: filename (including path) to djvu file
+        @type file: string/unicode
         """
-        file_djvu = os.path.expanduser(file_djvu)
+        file = os.path.expanduser(file)
+        file = os.path.abspath(file)
         # Check file exists and has read permissions.
-        with open(file_djvu):
-            self.file_djvu = file_djvu
+        with open(file):
+            self.file = file
+        self.dirname = os.path.dirname(file)
 
-    def number_of_images(self):
-        """Return the (cached) number of images in the djvu file."""
-        if not hasattr(self, '_image_count'):
-            dp = subprocess.Popen(['djvused', '-e', 'n', self.file_djvu],
-                                  stdout=subprocess.PIPE, 
stderr=subprocess.PIPE)
-            (stdoutdata, stderrdata) = dp.communicate()
-            if dp.returncode != 0:
-                error('djvulibre library error!\n%s' % stderrdata)
-            self._image_count = int(stdoutdata)
-        return self._image_count
+        # pattern for parsing of djvudump output.
+        self._pat_form = re.compile(
+            r' *?FORM:DJVU *?\[\d+\] *?(?P<id>{[^\}]*?})? *?\[P(?P<n>\d+)\]')
+        self._pat_info = re.compile(r'DjVu.*?(?P<size>\d+x\d+).*?(?P<dpi>\d+) 
dpi')
 
-    def has_text(self):
-        """Test if the djvu file has a text-layer."""
+    def check_cache(fn):  # flake8: disable=N805
+        """Decorator to check if cache shall be cleared."""
+        cache = ['_page_count', '_has_text', '_page_info']
+
+        def wrapper(obj, *args, **kwargs):
+            force = kwargs.get('force', False)
+            # import pdb; pdb.set_trace()
+            if force:
+                for el in cache:
+                    obj.__dict__.pop(el, None)
+            _res = fn(obj, *args, **kwargs)
+            return _res
+        return wrapper
+
+    def check_page_number(fn):  # flake8: disable=N805
+        """Decorator to check if page number is valid.
+
+        @raises ValueError
+        """
+        def wrapper(obj, *args, **kwargs):
+            n = args[0]
+            force = kwargs.get('force', False)
+            if not (1 <= n <= obj.number_of_images(force=force)):
+                raise ValueError('Page %d not in file %s [%d-%d]'
+                                 % (n, obj.file, n, obj.number_of_images()))
+            _res = fn(obj, *args, **kwargs)
+            return _res
+        return wrapper
+
+    @check_cache
+    def number_of_images(self, force=False):
+        """
+        Return the number of images in the djvu file.
+
+        @param force: if True, refresh the cached data
+        @type force: bool
+        """
+        if not hasattr(self, '_page_count'):
+            res, stdoutdata = call_cmd(['djvused', '-e', 'n', self.file])
+            if not res:
+                return False
+            self._page_count = int(stdoutdata)
+        return self._page_count
+
+    @check_page_number
+    def page_info(self, n, force=False):
+        """
+        Return a tuple (id, (size, dpi)) for page n of djvu file.
+
+        @param force: if True, refresh the cached data
+        @type force: bool
+        """
+        if not hasattr(self, '_page_info'):
+            self._get_page_info(force=force)
+        return self._page_info[n]
+
+    @check_cache
+    def _get_page_info(self, force=False):
+        """
+        Return a dict of tuples (id, (size, dpi)) for all pages of djvu file.
+
+        @param force: if True, refresh the cached data
+        @type force: bool
+        """
+        if not hasattr(self, '_page_info'):
+            self._page_info = {}
+
+            res, stdoutdata = call_cmd(['djvudump', self.file])
+            if not res:
+                return False
+
+            has_text = False
+            for line in stdoutdata.decode('utf-8').split('\n'):
+                if 'TXTz' in line:
+                    has_text = True
+
+                if 'FORM:DJVU' in line:
+                    m = self._pat_form.search(line)
+                    if m:
+                        key, id = int(m.group('n')), m.group('id')
+                    else:
+                        key, id = '', 1
+
+                if 'INFO' in line:
+                    m = self._pat_info.search(line)
+                    if m:
+                        size, dpi = m.group('size'), int(m.group('dpi'))
+                    else:
+                        size, dpi = None, None
+                else:
+                    continue
+
+                self._page_info[key] = (id, (size, dpi))
+            self._has_text = has_text
+        return self._page_info
+
+    def get_most_common_info(self):
+        """Return most common size and dpi for pages in djvu file."""
+        cnt = collections.Counter(s_d for _, s_d in 
self._get_page_info().values())
+        (size, dpi), _ = cnt.most_common()[0]
+        return size, dpi
+
+    @check_cache
+    def has_text(self, force=False):
+        """
+        Test if the djvu file has a text-layer.
+
+        @param force: if True, refresh the cached data
+        @type force: bool
+        """
         if not hasattr(self, '_has_text'):
-            dp = subprocess.Popen(['djvudump', self.file_djvu],
-                                  stdout=subprocess.PIPE, 
stderr=subprocess.PIPE)
-            (stdoutdata, stderrdata) = dp.communicate()
-            if dp.returncode != 0:
-                error('djvulibre library error!\n%s' % stderrdata)
-            txt = stdoutdata.decode('utf-8')
-            self._has_text = 'TXTz' in txt
+            self._get_page_info(force=force)
         return self._has_text
 
     def _remove_control_chars(self, data):
@@ -79,17 +222,23 @@
         txt = txt.strip('\x0c\n ')
         return txt
 
-    def get_page(self, n):
-        """Get page n for djvu file."""
-        if not self.has_text():
-            raise ValueError('Djvu file %s has no text layer.' % 
self.file_djvu)
-        if not (1 <= n <= self.number_of_images()):
-            raise ValueError('Requested page number %d is not in file %s'
-                             ' page range [%d-%d]'
-                             % (n, self.file_djvu, 1, self.number_of_images()))
-        dp = subprocess.Popen(['djvutxt', '--page=%d' % n, self.file_djvu],
-                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        (stdoutdata, stderrdata) = dp.communicate()
-        if dp.returncode != 0:
-            error('djvulibre library error!\n%s' % stderrdata)
+    @check_page_number
+    @check_cache
+    def get_page(self, n, force=False):
+        """
+        Get page n for djvu file.
+
+        @param force: if True, refresh the cached data
+        @type force: bool
+        """
+        if not self.has_text(force=force):
+            raise ValueError('Djvu file %s has no text layer.' % self.file)
+        res, stdoutdata = call_cmd(['djvutxt', '--page=%d' % n, self.file])
+        if not res:
+            return False
         return self._remove_control_chars(stdoutdata)
+
+    # This is to be used only if this class is subclassed and the decorators
+    # need to be used by the child.
+    check_page_number = staticmethod(check_page_number)
+    check_cache = staticmethod(check_cache)
diff --git a/tests/data/djvu/myfile.djvu "b/tests/data/djvu/myfil\303\251.djvu"
old mode 100755
new mode 100644
similarity index 100%
rename from tests/data/djvu/myfile.djvu
rename to "tests/data/djvu/myfil\303\251.djvu"
Binary files differ
diff --git a/tests/djvu_tests.py b/tests/djvu_tests.py
index ce38022..c2c412d 100644
--- a/tests/djvu_tests.py
+++ b/tests/djvu_tests.py
@@ -1,9 +1,9 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
-"""Unit tests for djvutext.py script."""
+"""Unit tests for djvu.py."""
 
 #
-# (C) Pywikibot team, 2015
+# (C) Pywikibot team, 2017
 #
 # Distributed under the terms of the MIT license.
 #
@@ -27,7 +27,7 @@
     net = False
 
     file_djvu_not_existing = join_djvu_data_path('not_existing.djvu')
-    file_djvu = join_djvu_data_path('myfile.djvu')
+    file_djvu = join_djvu_data_path('myfilé.djvu')
     file_djvu_wo_text = join_djvu_data_path('myfile_wo_text.djvu')
     test_txt = 'A file with non-ASCII characters, \nlike é or ç'
 
@@ -44,13 +44,24 @@
     def test_file_existance(self):
         """Test file existence checks."""
         djvu = DjVuFile(self.file_djvu)
-        self.assertEqual(self.file_djvu, djvu.file_djvu)
+        self.assertEqual(self.file_djvu, djvu.file)
         self.assertRaises(IOError, DjVuFile, self.file_djvu_not_existing)
 
     def test_number_of_images(self):
         """Test page number generator."""
         djvu = DjVuFile(self.file_djvu)
         self.assertEqual(djvu.number_of_images(), 4)
+
+    def test_page_info(self):
+        """Test page info retrieval."""
+        djvu = DjVuFile(self.file_djvu)
+        self.assertEqual(djvu.page_info(1),
+                         ('{myfile.djvu}', ('1092x221', 600)))
+
+    def test_get_most_common_info(self):
+        """Test retrieval of most common page size and dpi."""
+        djvu = DjVuFile(self.file_djvu)
+        self.assertEqual(djvu.get_most_common_info(), ('1092x221', 600))
 
     def test_has_text(self):
         """Test if djvu file contains text."""
@@ -78,6 +89,15 @@
         self.assertFalse(djvu.has_text())
         self.assertRaises(ValueError, djvu.get_page, 100)
 
+    def test_clear_cache(self):
+        """Test that force=True refreshes the cached data."""
+        djvu = DjVuFile(self.file_djvu)
+        self.assertTrue(djvu.has_text())
+        djvu._has_text = False
+        self.assertFalse(djvu.has_text())
+        self.assertTrue(djvu.has_text(force=True))
+
+
 if __name__ == '__main__':  # pragma: no cover
     try:
         unittest.main()

-- 
To view, visit https://gerrit.wikimedia.org/r/339906
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Idf465abe0f9aab3d7c213098ae02335269740ecf
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.w...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] pywikibot/core[master]: djvu.py: added features and refactored

Reply via email to