Mpaa has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/339906 )
Change subject: djvu.py: added features and refactored ...................................................................... djvu.py: added features and refactored Added: - cache control decorator - page number check decorator - retrieval of page info from djvu file Refactored: - tiny wrapper of subprocess Popen() to reduce code repetition - unicode support of filenames Added tests. Change-Id: Idf465abe0f9aab3d7c213098ae02335269740ecf --- A djvu_ext.py M pywikibot/tools/djvu.py R tests/data/djvu/myfilé.djvu M tests/djvu_tests.py 4 files changed, 309 insertions(+), 45 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/06/339906/1 diff --git a/djvu_ext.py b/djvu_ext.py new file mode 100644 index 0000000..abe5e89 --- /dev/null +++ b/djvu_ext.py @@ -0,0 +1,95 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +"""Wrapper around djvulibre to access djvu files properties and content.""" +from __future__ import absolute_import, unicode_literals + +import collections +import os +import re + +import pywikibot +import pywikibot.bot +import pywikibot.logging + +from pywikibot.tools.djvu import DjVuFile, call_cmd + +_logger = 'djvu_ext' +pywikibot.bot.init_handlers() + + +class MyDjVuFile(DjVuFile): + + """Extend class DjVuFile(object).""" + + def __init__(self, *args, **kwargs): + """Constructor.""" + super(MyDjVuFile, self).__init__(*args, **kwargs) + + # tmp files for creation/insertion of a white page. + self._white_ppm = os.path.join(self.dirname, 'white_page.ppm') + self._white_djvu = os.path.join(self.dirname, 'white_page.djvu') + + def replace_first_page(self): + """Replace first page of djvu file with blank page.""" + size, dpi = self.get_most_common_info() + + # Generate white_page. + res, data = call_cmd(['convert', '-size', size, 'xc:white', self._white_ppm], + lib='ImageMagik') + if not res: + return False + + # Convert white_page to djvu. 
+ res, data = call_cmd(['c44', self._white_ppm, '-dpi', dpi]) + os.unlink(self._white_ppm) # rm white_page.ppm before returning. + if not res: + return False + + # Delete first page. + # Get ids of first two pages for later checks. + n = self.number_of_images() + id1, _ = self.page_info(1) + info_p2 = self.page_info(2) + res, data = call_cmd(['djvm', '-d', self.file, '1']) + if not res: + return False + + # insert new first page + res, data = call_cmd(['djvm', '-i', self.file, self._white_djvu, '1']) + os.unlink(self._white_djvu) # rm white_page.djvu before returning. + if not res: + return False + + # Check if page processing is as expected. + expected_id = '{%s}' % os.path.basename(self._white_djvu) + assert self.number_of_images(force=True) == n + assert self.page_info(1) == (expected_id, (size, dpi)) # white page id. + assert self.page_info(2) == info_p2 # original 2nd page info. + + return True + + +file = u'Fréud_-_Reflections_on_war_and_death.djvu' + +args = ['djvudump', file] +call_cmd(['djvudump', file], lib='djvulibre') + +args = ['djvudump', file + '0'] +call_cmd(args, lib='djvulibre') + +djvuf = MyDjVuFile(file) +res = djvuf.page_info(3) +print(res) + +try: + res = djvuf.page_info(-1) +except ValueError as e: + pywikibot.output(e) +else: + print(res) + +sd = djvuf.get_most_common_info() +print(sd) +djvuf.replace_first_page() +cnt = collections.Counter(s_d for _, s_d in djvuf._get_page_info().values()) +print(cnt) \ No newline at end of file diff --git a/pywikibot/tools/djvu.py b/pywikibot/tools/djvu.py index da7c595..8756fec 100644 --- a/pywikibot/tools/djvu.py +++ b/pywikibot/tools/djvu.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """Wrapper around djvulibre to access djvu files properties and content.""" # -# (C) Pywikibot team, 2015-2016 +# (C) Pywikibot team, 2015-2017 # # Distributed under the terms of the MIT license.
# @@ -10,10 +10,55 @@ __version__ = '$Id$' -import os.path +import collections +import os +import re import subprocess +import sys -from pywikibot.logging import error +import pywikibot + +if sys.version_info[0] > 2: + basestring = (str,) + unicode = str + + +def call_cmd(args, lib='djvulibre'): + """ + Tiny wrapper around subprocess.Popen(). + + @param args: same as Popen() + @type args: sequence or string + + @param lib: library to be logged in logging messages + @type lib: string + + @param log: log process output; errors are always logged. + @type log: bool + + + @return: returns a tuple (res, stdoutdata), where + res is True if dp.returncode == 0 else False + """ + if not isinstance(args, basestring): + # upcast if any param in args is int + args = [unicode(el) for el in args] + cmd = ' '.join(args) + else: + cmd = args + + dp = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdoutdata, stderrdata = dp.communicate() + dp.wait() + + if dp.returncode != 0: + pywikibot.error('{0} error; {1}'.format(lib, cmd)) + pywikibot.error('{0}'.format(stderrdata)) + return (False, stdoutdata) + + pywikibot.log('SUCCESS: {0} (PID: {1})'.format(cmd, dp.pid)) + + return (True, stdoutdata) class DjVuFile(object): @@ -28,39 +73,137 @@ """ - def __init__(self, file_djvu): + def __init__(self, file): """ Constructor. - @param file_djvu: filename (including path) to djvu file - @type file_djvu: string/unicode + @param file: filename (including path) to djvu file + @type file: string/unicode """ - file_djvu = os.path.expanduser(file_djvu) + file = os.path.expanduser(file) + file = os.path.abspath(file) # Check file exists and has read permissions.
- with open(file_djvu): - self.file_djvu = file_djvu + with open(file): + self.file = file + self.dirname = os.path.dirname(file) - def number_of_images(self): - """Return the (cached) number of images in the djvu file.""" - if not hasattr(self, '_image_count'): - dp = subprocess.Popen(['djvused', '-e', 'n', self.file_djvu], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = dp.communicate() - if dp.returncode != 0: - error('djvulibre library error!\n%s' % stderrdata) - self._image_count = int(stdoutdata) - return self._image_count + # pattern for parsing of djvudump output. + self._pat_form = re.compile( + r' *?FORM:DJVU *?\[\d+\] *?(?P<id>{[^\}]*?})? *?\[P(?P<n>\d+)\]') + self._pat_info = re.compile(r'DjVu.*?(?P<size>\d+x\d+).*?(?P<dpi>\d+) dpi') - def has_text(self): - """Test if the djvu file has a text-layer.""" + def check_cache(fn): # flake8: disable=N805 + """Decorator to check if cache shall be cleared.""" + cache = ['_page_count', '_has_text', '_page_info'] + + def wrapper(obj, *args, **kwargs): + force = kwargs.get('force', False) + # import pdb; pdb.set_trace() + if force: + for el in cache: + obj.__dict__.pop(el, None) + _res = fn(obj, *args, **kwargs) + return _res + return wrapper + + def check_page_number(fn): # flake8: disable=N805 + """Decorator to check if page number is valid. + + @raises ValueError + """ + def wrapper(obj, *args, **kwargs): + n = args[0] + force = kwargs.get('force', False) + if not (1 <= n <= obj.number_of_images(force=force)): + raise ValueError('Page %d not in file %s [%d-%d]' + % (n, obj.file, n, obj.number_of_images())) + _res = fn(obj, *args, **kwargs) + return _res + return wrapper + + @check_cache + def number_of_images(self, force=False): + """ + Return the number of images in the djvu file. 
+ + @param force: if True, refresh the cached data + @type force: bool + """ + if not hasattr(self, '_page_count'): + res, stdoutdata = call_cmd(['djvused', '-e', 'n', self.file]) + if not res: + return False + self._page_count = int(stdoutdata) + return self._page_count + + @check_page_number + def page_info(self, n, force=False): + """ + Return a tuple (id, (size, dpi)) for page n of djvu file. + + @param force: if True, refresh the cached data + @type force: bool + """ + if not hasattr(self, '_page_info'): + self._get_page_info(force=force) + return self._page_info[n] + + @check_cache + def _get_page_info(self, force=False): + """ + Return a dict of tuples (id, (size, dpi)) for all pages of djvu file. + + @param force: if True, refresh the cached data + @type force: bool + """ + if not hasattr(self, '_page_info'): + self._page_info = {} + + res, stdoutdata = call_cmd(['djvudump', self.file]) + if not res: + return False + + has_text = False + for line in stdoutdata.decode('utf-8').split('\n'): + if 'TXTz' in line: + has_text = True + + if 'FORM:DJVU' in line: + m = self._pat_form.search(line) + if m: + key, id = int(m.group('n')), m.group('id') + else: + key, id = '', 1 + + if 'INFO' in line: + m = self._pat_info.search(line) + if m: + size, dpi = m.group('size'), int(m.group('dpi')) + else: + size, dpi = None, None + else: + continue + + self._page_info[key] = (id, (size, dpi)) + self._has_text = has_text + return self._page_info + + def get_most_common_info(self): + """Return most common size and dpi for pages in djvu file.""" + cnt = collections.Counter(s_d for _, s_d in self._get_page_info().values()) + (size, dpi), _ = cnt.most_common()[0] + return size, dpi + + @check_cache + def has_text(self, force=False): + """ + Test if the djvu file has a text-layer. 
+ + @param force: if True, refresh the cached data + @type force: bool + """ if not hasattr(self, '_has_text'): - dp = subprocess.Popen(['djvudump', self.file_djvu], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = dp.communicate() - if dp.returncode != 0: - error('djvulibre library error!\n%s' % stderrdata) - txt = stdoutdata.decode('utf-8') - self._has_text = 'TXTz' in txt + self._get_page_info(force=force) return self._has_text def _remove_control_chars(self, data): @@ -79,17 +222,23 @@ txt = txt.strip('\x0c\n ') return txt - def get_page(self, n): - """Get page n for djvu file.""" - if not self.has_text(): - raise ValueError('Djvu file %s has no text layer.' % self.file_djvu) - if not (1 <= n <= self.number_of_images()): - raise ValueError('Requested page number %d is not in file %s' - ' page range [%d-%d]' - % (n, self.file_djvu, 1, self.number_of_images())) - dp = subprocess.Popen(['djvutxt', '--page=%d' % n, self.file_djvu], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = dp.communicate() - if dp.returncode != 0: - error('djvulibre library error!\n%s' % stderrdata) + @check_page_number + @check_cache + def get_page(self, n, force=False): + """ + Get page n for djvu file. + + @param force: if True, refresh the cached data + @type force: bool + """ + if not self.has_text(force=force): + raise ValueError('Djvu file %s has no text layer.' % self.file) + res, stdoutdata = call_cmd(['djvutxt', '--page=%d' % n, self.file]) + if not res: + return False return self._remove_control_chars(stdoutdata) + + # This is to be used only if this class is subclassed and the decorators + # needs to be used by the child. 
+ check_page_number = staticmethod(check_page_number) + check_cache = staticmethod(check_cache) diff --git a/tests/data/djvu/myfile.djvu "b/tests/data/djvu/myfil\303\251.djvu" old mode 100755 new mode 100644 similarity index 100% rename from tests/data/djvu/myfile.djvu rename to "tests/data/djvu/myfil\303\251.djvu" Binary files differ diff --git a/tests/djvu_tests.py b/tests/djvu_tests.py index ce38022..c2c412d 100644 --- a/tests/djvu_tests.py +++ b/tests/djvu_tests.py @@ -1,9 +1,9 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -"""Unit tests for djvutext.py script.""" +"""Unit tests for djvu.py.""" # -# (C) Pywikibot team, 2015 +# (C) Pywikibot team, 2017 # # Distributed under the terms of the MIT license. # @@ -27,7 +27,7 @@ net = False file_djvu_not_existing = join_djvu_data_path('not_existing.djvu') - file_djvu = join_djvu_data_path('myfile.djvu') + file_djvu = join_djvu_data_path('myfilé.djvu') file_djvu_wo_text = join_djvu_data_path('myfile_wo_text.djvu') test_txt = 'A file with non-ASCII characters, \nlike é or ç' @@ -44,13 +44,24 @@ def test_file_existance(self): """Test file existence checks.""" djvu = DjVuFile(self.file_djvu) - self.assertEqual(self.file_djvu, djvu.file_djvu) + self.assertEqual(self.file_djvu, djvu.file) self.assertRaises(IOError, DjVuFile, self.file_djvu_not_existing) def test_number_of_images(self): """Test page number generator.""" djvu = DjVuFile(self.file_djvu) self.assertEqual(djvu.number_of_images(), 4) + + def test_page_info(self): + """Test page info retrieval.""" + djvu = DjVuFile(self.file_djvu) + self.assertEqual(djvu.page_info(1), + ('{myfile.djvu}', ('1092x221', 600))) + + def test_get_most_common_info(self): + """Test retrieval of most common page size and dpi.""" + djvu = DjVuFile(self.file_djvu) + self.assertEqual(djvu.get_most_common_info(), ('1092x221', 600)) def test_has_text(self): """Test if djvu file contains text.""" @@ -78,6 +89,15 @@ self.assertFalse(djvu.has_text()) self.assertRaises(ValueError, djvu.get_page, 100) + def
test_clear_cache(self): + """Test that force=True refreshes the cached data.""" + djvu = DjVuFile(self.file_djvu) + self.assertTrue(djvu.has_text()) + djvu._has_text = False + self.assertFalse(djvu.has_text()) + self.assertTrue(djvu.has_text(force=True)) + + if __name__ == '__main__': # pragma: no cover try: unittest.main() -- To view, visit https://gerrit.wikimedia.org/r/339906 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Idf465abe0f9aab3d7c213098ae02335269740ecf Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Mpaa <mpaa.w...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits