jenkins-bot has submitted this change and it was merged.

Change subject: Added DjVuFile class and djvutext.py in core
......................................................................


Added DjVuFile class and djvutext.py in core

Added:
- DjVuFile class: wrapper to access djvu file text and properties
- added tests for DjVuFile class
- ported djvutext.py functionality from compat (basing it on Bot class)

Bug: T66853
Change-Id: I88ba445fd49046430dfcb78d5b8a0ab46e2343fb
---
A pywikibot/tools/djvu.py
A scripts/djvutext.py
A tests/data/djvu/myfile.djvu
A tests/data/djvu/myfile_wo_text.djvu
A tests/djvu_tests.py
5 files changed, 390 insertions(+), 0 deletions(-)

Approvals:
  John Vandenberg: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/pywikibot/tools/djvu.py b/pywikibot/tools/djvu.py
new file mode 100644
index 0000000..f22b89a
--- /dev/null
+++ b/pywikibot/tools/djvu.py
@@ -0,0 +1,95 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Wrapper around djvulibre to access djvu files properties and content."""
+#
+# (C) Pywikibot team, 2015
+#
+# Distributed under the terms of the MIT license.
+#
+from __future__ import unicode_literals
+
+__version__ = '$Id$'
+
+import os.path
+import subprocess
+
+import pywikibot
+
+
+class DjVuFile(object):
+
+    """Wrapper around djvulibre to access djvu files properties and content.
+
+    Perform file existance checks.
+
+    Control characters in djvu text-layer are converted for convenience
+    (see http://djvu.sourceforge.net/doc/man/djvused.html for control chars
+    details).
+
+    """
+
+    def __init__(self, file_djvu):
+        """
+        Constructor.
+
+        @param file_djvu: filename (including path) to djvu file
+        @type  file_djvu: string/unicode
+        """
+        file_djvu = os.path.expanduser(file_djvu)
+        # Check file exists and has read permissions.
+        with open(file_djvu):
+            self.file_djvu = file_djvu
+
+    def number_of_images(self):
+        """Return the (cached) number of images in the djvu file."""
+        if not hasattr(self, '_image_count'):
+            dp = subprocess.Popen(['djvused', '-e', 'n', self.file_djvu],
+                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            (stdoutdata, stderrdata) = dp.communicate()
+            if dp.returncode != 0:
+                pywikibot.error('djvulibre library error!\n%s' % stderrdata)
+            self._image_count = int(stdoutdata)
+        return self._image_count
+
+    def has_text(self):
+        """Test if the djvu file has a text-layer."""
+        if not hasattr(self, '_has_text'):
+            dp = subprocess.Popen(['djvudump', self.file_djvu],
+                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            (stdoutdata, stderrdata) = dp.communicate()
+            if dp.returncode != 0:
+                pywikibot.error('djvulibre library error!\n%s' % stderrdata)
+            txt = stdoutdata.decode('utf-8')
+            self._has_text = 'TXTz' in txt
+        return self._has_text
+
+    def _remove_control_chars(self, data):
+        """Remove djvu format control characters.
+
+        See http://djvu.sourceforge.net/doc/man/djvused.html for control chars.
+        """
+        txt = data.decode('utf-8')
+        # vertical tab (\013=\x0b): remove
+        txt = txt.replace('\x0b', '')
+        # group (\035=\x1d) separator: replace with \n
+        txt = txt.replace('\x1d', '\n')
+        # unit separator (\037=\x1f): replace with \n
+        txt = txt.replace('\x1f', '\n')
+        # feed char (\f=\x0c), \n and trailing spaces: strip
+        txt = txt.strip('\x0c\n ')
+        return txt
+
+    def get_page(self, n):
+        """Get page n for djvu file."""
+        if not self.has_text():
+            raise ValueError('Djvu file %s has no text layer.' % self.file_djvu)
+        if not (1 <= n <= self.number_of_images()):
+            raise ValueError('Requested page number %d is not in file %s'
+                             ' page range [%d-%d]'
+                             % (n, self.file_djvu, 1, self.number_of_images()))
+        dp = subprocess.Popen(['djvutxt', '--page=%d' % n, self.file_djvu],
+                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        (stdoutdata, stderrdata) = dp.communicate()
+        if dp.returncode != 0:
+            pywikibot.error('djvulibre library error!\n%s' % stderrdata)
+        return self._remove_control_chars(stdoutdata)
diff --git a/scripts/djvutext.py b/scripts/djvutext.py
new file mode 100644
index 0000000..3e274af
--- /dev/null
+++ b/scripts/djvutext.py
@@ -0,0 +1,210 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+This bot uploads text from djvu files onto pages in the "Page" namespace.
+
+It is intended to be used for Wikisource.
+
+The following parameters are supported:
+
+    -index:...     name of the index page (without the Index: prefix)
+    -djvu:...      path to the djvu file, it shall be:
+                   - path to a file name
+                   - dir where a djvu file name as index is located
+                   optional, by default is current dir '.'
+    -pages:<start>-<end>,...<start>-<end>,<start>-<end>
+                   Page range to upload;
+                   optional, start=1, end=djvu file number of images.
+                   Page ranges can be specified as:
+                     A-B -> pages A until B
+                     A-  -> pages A until number of images
+                     A   -> just page A
+                     -B  -> pages 1 until B
+    -summary:      custom edit summary.
+                   Use quotes if edit summary contains spaces.
+    -force         overwrites existing text
+                   optional, default False
+    -always        don't bother asking to confirm any of the changes.
+
+"""
+#
+# (C) Pywikibot team, 2008-2015
+#
+# Distributed under the terms of the MIT license.
+#
+from __future__ import unicode_literals
+
+__version__ = '$Id$'
+
+import os.path
+
+import pywikibot
+
+from pywikibot import i18n, Bot
+from pywikibot.tools.djvu import DjVuFile
+from pywikibot.proofreadpage import ProofreadPage
+
+
+class DjVuTextBot(Bot):
+
+    """
+    A bot that uploads text-layer from djvu files to Page:namespace.
+
+    Works only on sites with Proofread Page extension installed.
+    """
+
+    def __init__(self, djvu, index, pages=None, **kwargs):
+        """
+        Constructor.
+
+        @param djvu: djvu from where to fetch the text layer
+        @type  djvu: DjVuFile object
+        @param index: index page in the Index: namespace
+        @type  index: Page object
+        @param pages: page interval to upload (start, end)
+        @type  pages: tuple
+        """
+        self.availableOptions.update({
+            'force': False,
+            'summary': None
+        })
+        super(DjVuTextBot, self).__init__(**kwargs)
+        self._djvu = djvu
+        self._index = index
+        self._prefix = self._index.title(withNamespace=False)
+
+        if not pages:
+            self._pages = (1, self._djvu.number_of_images())
+        else:
+            self._pages = pages
+
+        self.generator = self.gen()
+
+        # Get edit summary message if it's empty.
+        if not self.getOption('summary'):
+            self.options['summary'] = i18n.twntranslate(
+                self._index.site, 'djvutext-creating')
+
+    def page_number_gen(self):
+        """Generate pages numbers from specified page intervals."""
+        last = 0
+        for start, end in sorted(self._pages):
+            start = max(last, start)
+            last = end + 1
+            for page_number in range(start, last):
+                yield page_number
+
+    def gen(self):
+        """Generate pages from specified page interval."""
+        for page_number in self.page_number_gen():
+            title = '{prefix}/{number}'.format(prefix=self._prefix,
+                                               number=page_number)
+            page = ProofreadPage(self._index.site, title)
+            page.page_number = page_number  # remember page number in djvu file
+            yield page
+
+    def treat(self, page):
+        """Process one page."""
+        old_text = page.text
+
+        # Overwrite body of the page with content from djvu
+        page.body = self._djvu.get_page(page.page_number)
+
+        # Add username in header if page does not exists.
+        if not page.exists():
+            page.user = page.site.user()
+        new_text = page.text
+
+        summary = self.getOption('summary')
+        if page.exists() and not self.getOption('force'):
+            pywikibot.output('Page %s already exists, not adding!' % page)
+        else:
+            self.userPut(page, old_text, new_text,
+                         summary=summary, minor=True, botflag=True)
+
+
+def main(*args):
+    """
+    Process command line arguments and invoke bot.
+
+    If args is an empty list, sys.argv is used.
+
+    @param args: command line arguments
+    @type args: list of unicode
+    """
+    index = None
+    djvu_path = '.'  # default djvu file directory
+    pages = '1'
+    options = {}
+
+    # Parse command line arguments.
+    local_args = pywikibot.handle_args(args)
+    for arg in local_args:
+        if arg.startswith('-index:'):
+            index = arg[7:]
+        elif arg.startswith('-djvu:'):
+            djvu_path = arg[len('-djvu:'):]
+        elif arg.startswith('-pages:'):
+            pages = arg[7:]
+        elif arg.startswith('-summary:'):
+            options['summary'] = arg[len('-summary:'):]
+        elif arg == '-force':
+            options['force'] = True
+        elif arg == '-always':
+            options['always'] = True
+        else:
+            pywikibot.output('Unknown argument %s' % arg)
+
+    # index is mandatory.
+    if not index:
+        pywikibot.showHelp()
+        return False
+
+    # If djvu_path is not a fle, build djvu_path from dir+index.
+    djvu_path = os.path.expanduser(djvu_path)
+    djvu_path = os.path.abspath(djvu_path)
+    if not os.path.exists(djvu_path):
+        pywikibot.error('No such file or directory: %s' % djvu_path)
+        return False
+    if os.path.isdir(djvu_path):
+        djvu_path = os.path.join(djvu_path, index)
+
+    # Check the djvu file exists and, if so, create the DjVuFile wrapper.
+    djvu = DjVuFile(djvu_path)
+
+    if not djvu.has_text():
+        pywikibot.error('No text layer in djvu file %s' % djvu.file_djvu)
+        return False
+
+    # Parse pages param.
+    pages = pages.split(',')
+    for interval in range(len(pages)):
+        start, sep, end = pages[interval].partition('-')
+        start = 1 if not start else int(start)
+        if not sep:
+            end = start
+        else:
+            end = int(end) if end else djvu.number_of_images()
+        pages[interval] = (start, end)
+
+    site = pywikibot.Site()
+    if not site.has_extension('ProofreadPage'):
+        pywikibot.error('Site %s must have ProofreadPage extension.' % site)
+        return False
+
+    index_page = pywikibot.Page(site, index, ns=site.proofread_index_ns)
+
+    if not index_page.exists():
+        raise pywikibot.NoPage(index)
+
+    pywikibot.output('uploading text from %s to %s'
+                     % (djvu.file_djvu, index_page.title(asLink=True)))
+
+    bot = DjVuTextBot(djvu, index_page, pages, **options)
+    bot.run()
+
+if __name__ == '__main__':
+    try:
+        main()
+    except Exception:
+        pywikibot.error('Fatal error:', exc_info=True)
diff --git a/tests/data/djvu/myfile.djvu b/tests/data/djvu/myfile.djvu
new file mode 100755
index 0000000..eedbbcb
--- /dev/null
+++ b/tests/data/djvu/myfile.djvu
Binary files differ
diff --git a/tests/data/djvu/myfile_wo_text.djvu b/tests/data/djvu/myfile_wo_text.djvu
new file mode 100644
index 0000000..a40d16b
--- /dev/null
+++ b/tests/data/djvu/myfile_wo_text.djvu
Binary files differ
diff --git a/tests/djvu_tests.py b/tests/djvu_tests.py
new file mode 100644
index 0000000..a3dcc43
--- /dev/null
+++ b/tests/djvu_tests.py
@@ -0,0 +1,85 @@
+#!/usr/bin/python
+# -*- coding: utf-8  -*-
+"""Unit tests for djvutext.py script."""
+
+#
+# (C) Pywikibot team, 2015
+#
+# Distributed under the terms of the MIT license.
+#
+
+from __future__ import unicode_literals
+
+import os
+import subprocess
+
+from tests import _data_dir
+from tests.aspects import unittest, TestCase
+from pywikibot.tools.djvu import DjVuFile
+
+_djvu_dir = 'djvu'
+
+
+class TestDjVuFile(TestCase):
+
+    """Test DjVuFile class."""
+
+    net = False
+
+    file_djvu_not_existing = os.path.join(_data_dir, _djvu_dir, 'not_existing.djvu')
+    file_djvu = os.path.join(_data_dir, _djvu_dir, 'myfile.djvu')
+    file_djvu_wo_text = os.path.join(_data_dir, _djvu_dir, 'myfile_wo_text.djvu')
+    test_txt = 'A file with non-ASCII characters, \nlike é or ç'
+
+    @classmethod
+    def setUpClass(cls):
+        """SetUp tests."""
+        super(TestDjVuFile, cls).setUpClass()
+        try:
+            subprocess.Popen(['djvudump'],
+                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        except OSError:
+            raise unittest.SkipTest('djvulibre library not installed.')
+
+    def test_file_existance(self):
+        """Test file existence checks."""
+        djvu = DjVuFile(self.file_djvu)
+        self.assertEqual(self.file_djvu, djvu.file_djvu)
+        self.assertRaises(IOError, DjVuFile, self.file_djvu_not_existing)
+
+    def test_number_of_images(self):
+        """Test page number generator."""
+        djvu = DjVuFile(self.file_djvu)
+        self.assertEqual(djvu.number_of_images(), 4)
+
+    def test_has_text(self):
+        """Test if djvu file contains text."""
+        djvu = DjVuFile(self.file_djvu)
+        self.assertTrue(djvu.has_text())
+        djvu = DjVuFile(self.file_djvu_wo_text)
+        self.assertFalse(djvu.has_text())
+
+    def test_get_existing_page_number(self):
+        """Test if djvu file contains text."""
+        djvu = DjVuFile(self.file_djvu)
+        self.assertTrue(djvu.has_text())
+        txt = djvu.get_page(1)
+        self.assertEqual(txt, self.test_txt)
+
+    def test_get_not_existing_page_number(self):
+        """Test if djvu file contains text."""
+        djvu = DjVuFile(self.file_djvu)
+        self.assertTrue(djvu.has_text())
+        self.assertRaises(ValueError, djvu.get_page, 100)
+
+    def test_get_not_existing_page(self):
+        """Test if djvu file contains text."""
+        djvu = DjVuFile(self.file_djvu_wo_text)
+        self.assertFalse(djvu.has_text())
+        self.assertRaises(ValueError, djvu.get_page, 100)
+
+if __name__ == '__main__':
+    try:
+        unittest.main()
+    except SystemExit:
+        pass

-- 
To view, visit https://gerrit.wikimedia.org/r/210808
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I88ba445fd49046430dfcb78d5b8a0ab46e2343fb
Gerrit-PatchSet: 17
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Ladsgroup <[email protected]>
Gerrit-Reviewer: Merlijn van Deen <[email protected]>
Gerrit-Reviewer: Mpaa <[email protected]>
Gerrit-Reviewer: Ricordisamoa <[email protected]>
Gerrit-Reviewer: XZise <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
Pywikibot-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits

Reply via email to