jenkins-bot has submitted this change and it was merged. Change subject: Added DjVuFile class and djvutext.py in core ......................................................................
Added DjVuFile class and djvutext.py in core Added: - DjVuFile class: wrapper to access djvu file text and properties - added tests for DjVuFile class - ported djvutext.py functionality from compat (basing it on Bot class) Bug: T66853 Change-Id: I88ba445fd49046430dfcb78d5b8a0ab46e2343fb --- A pywikibot/tools/djvu.py A scripts/djvutext.py A tests/data/djvu/myfile.djvu A tests/data/djvu/myfile_wo_text.djvu A tests/djvu_tests.py 5 files changed, 390 insertions(+), 0 deletions(-) Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified diff --git a/pywikibot/tools/djvu.py b/pywikibot/tools/djvu.py new file mode 100644 index 0000000..f22b89a --- /dev/null +++ b/pywikibot/tools/djvu.py @@ -0,0 +1,95 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +"""Wrapper around djvulibre to access djvu files properties and content.""" +# +# (C) Pywikibot team, 2015 +# +# Distributed under the terms of the MIT license. +# +from __future__ import unicode_literals + +__version__ = '$Id$' + +import os.path +import subprocess + +import pywikibot + + +class DjVuFile(object): + + """Wrapper around djvulibre to access djvu files properties and content. + + Perform file existance checks. + + Control characters in djvu text-layer are converted for convenience + (see http://djvu.sourceforge.net/doc/man/djvused.html for control chars + details). + + """ + + def __init__(self, file_djvu): + """ + Constructor. + + @param file_djvu: filename (including path) to djvu file + @type file_djvu: string/unicode + """ + file_djvu = os.path.expanduser(file_djvu) + # Check file exists and has read permissions. 
+ with open(file_djvu): + self.file_djvu = file_djvu + + def number_of_images(self): + """Return the (cached) number of images in the djvu file.""" + if not hasattr(self, '_image_count'): + dp = subprocess.Popen(['djvused', '-e', 'n', self.file_djvu], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = dp.communicate() + if dp.returncode != 0: + pywikibot.error('djvulibre library error!\n%s' % stderrdata) + self._image_count = int(stdoutdata) + return self._image_count + + def has_text(self): + """Test if the djvu file has a text-layer.""" + if not hasattr(self, '_has_text'): + dp = subprocess.Popen(['djvudump', self.file_djvu], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = dp.communicate() + if dp.returncode != 0: + pywikibot.error('djvulibre library error!\n%s' % stderrdata) + txt = stdoutdata.decode('utf-8') + self._has_text = 'TXTz' in txt + return self._has_text + + def _remove_control_chars(self, data): + """Remove djvu format control characters. + + See http://djvu.sourceforge.net/doc/man/djvused.html for control chars. + """ + txt = data.decode('utf-8') + # vertical tab (\013=\x0b): remove + txt = txt.replace('\x0b', '') + # group (\035=\x1d) separator: replace with \n + txt = txt.replace('\x1d', '\n') + # unit separator (\037=\x1f): replace with \n + txt = txt.replace('\x1f', '\n') + # feed char (\f=\x0c), \n and trailing spaces: strip + txt = txt.strip('\x0c\n ') + return txt + + def get_page(self, n): + """Get page n for djvu file.""" + if not self.has_text(): + raise ValueError('Djvu file %s has no text layer.' 
% self.file_djvu) + if not (1 <= n <= self.number_of_images()): + raise ValueError('Requested page number %d is not in file %s' + ' page range [%d-%d]' + % (n, self.file_djvu, 1, self.number_of_images())) + dp = subprocess.Popen(['djvutxt', '--page=%d' % n, self.file_djvu], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = dp.communicate() + if dp.returncode != 0: + pywikibot.error('djvulibre library error!\n%s' % stderrdata) + return self._remove_control_chars(stdoutdata) diff --git a/scripts/djvutext.py b/scripts/djvutext.py new file mode 100644 index 0000000..3e274af --- /dev/null +++ b/scripts/djvutext.py @@ -0,0 +1,210 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +""" +This bot uploads text from djvu files onto pages in the "Page" namespace. + +It is intended to be used for Wikisource. + +The following parameters are supported: + + -index:... name of the index page (without the Index: prefix) + -djvu:... path to the djvu file, it shall be: + - path to a file name + - dir where a djvu file name as index is located + optional, by default is current dir '.' + -pages:<start>-<end>,...<start>-<end>,<start>-<end> + Page range to upload; + optional, start=1, end=djvu file number of images. + Page ranges can be specified as: + A-B -> pages A until B + A- -> pages A until number of images + A -> just page A + -B -> pages 1 until B + -summary: custom edit summary. + Use quotes if edit summary contains spaces. + -force overwrites existing text + optional, default False + -always don't bother asking to confirm any of the changes. + +""" +# +# (C) Pywikibot team, 2008-2015 +# +# Distributed under the terms of the MIT license. 
+# +from __future__ import unicode_literals + +__version__ = '$Id$' + +import os.path + +import pywikibot + +from pywikibot import i18n, Bot +from pywikibot.tools.djvu import DjVuFile +from pywikibot.proofreadpage import ProofreadPage + + +class DjVuTextBot(Bot): + + """ + A bot that uploads text-layer from djvu files to Page:namespace. + + Works only on sites with Proofread Page extension installed. + """ + + def __init__(self, djvu, index, pages=None, **kwargs): + """ + Constructor. + + @param djvu: djvu from where to fetch the text layer + @type djvu: DjVuFile object + @param index: index page in the Index: namespace + @type index: Page object + @param pages: page interval to upload (start, end) + @type pages: tuple + """ + self.availableOptions.update({ + 'force': False, + 'summary': None + }) + super(DjVuTextBot, self).__init__(**kwargs) + self._djvu = djvu + self._index = index + self._prefix = self._index.title(withNamespace=False) + + if not pages: + self._pages = (1, self._djvu.number_of_images()) + else: + self._pages = pages + + self.generator = self.gen() + + # Get edit summary message if it's empty. 
+ if not self.getOption('summary'): + self.options['summary'] = i18n.twntranslate( + self._index.site, 'djvutext-creating') + + def page_number_gen(self): + """Generate page numbers from specified page intervals.""" + last = 0 + for start, end in sorted(self._pages): + start = max(last, start) + last = end + 1 + for page_number in range(start, last): + yield page_number + + def gen(self): + """Generate pages from specified page interval.""" + for page_number in self.page_number_gen(): + title = '{prefix}/{number}'.format(prefix=self._prefix, + number=page_number) + page = ProofreadPage(self._index.site, title) + page.page_number = page_number # remember page number in djvu file + yield page + + def treat(self, page): + """Process one page.""" + old_text = page.text + + # Overwrite body of the page with content from djvu + page.body = self._djvu.get_page(page.page_number) + + # Add username in header if page does not exist. + if not page.exists(): + page.user = page.site.user() + new_text = page.text + + summary = self.getOption('summary') + if page.exists() and not self.getOption('force'): + pywikibot.output('Page %s already exists, not adding!' % page) + else: + self.userPut(page, old_text, new_text, + summary=summary, minor=True, botflag=True) + + +def main(*args): + """ + Process command line arguments and invoke bot. + + If args is an empty list, sys.argv is used. + + @param args: command line arguments + @type args: list of unicode + """ + index = None + djvu_path = '.' # default djvu file directory + pages = '1' + options = {} + + # Parse command line arguments. 
+ local_args = pywikibot.handle_args(args) + for arg in local_args: + if arg.startswith('-index:'): + index = arg[7:] + elif arg.startswith('-djvu:'): + djvu_path = arg[len('-djvu:'):] + elif arg.startswith('-pages:'): + pages = arg[7:] + elif arg.startswith('-summary:'): + options['summary'] = arg[len('-summary:'):] + elif arg == '-force': + options['force'] = True + elif arg == '-always': + options['always'] = True + else: + pywikibot.output('Unknown argument %s' % arg) + + # index is mandatory. + if not index: + pywikibot.showHelp() + return False + + # If djvu_path is not a file, build djvu_path from dir+index. + djvu_path = os.path.expanduser(djvu_path) + djvu_path = os.path.abspath(djvu_path) + if not os.path.exists(djvu_path): + pywikibot.error('No such file or directory: %s' % djvu_path) + return False + if os.path.isdir(djvu_path): + djvu_path = os.path.join(djvu_path, index) + + # Check the djvu file exists and, if so, create the DjVuFile wrapper. + djvu = DjVuFile(djvu_path) + + if not djvu.has_text(): + pywikibot.error('No text layer in djvu file %s' % djvu.file_djvu) + return False + + # Parse pages param. + pages = pages.split(',') + for interval in range(len(pages)): + start, sep, end = pages[interval].partition('-') + start = 1 if not start else int(start) + if not sep: + end = start + else: + end = int(end) if end else djvu.number_of_images() + pages[interval] = (start, end) + + site = pywikibot.Site() + if not site.has_extension('ProofreadPage'): + pywikibot.error('Site %s must have ProofreadPage extension.' 
% site) + return False + + index_page = pywikibot.Page(site, index, ns=site.proofread_index_ns) + + if not index_page.exists(): + raise pywikibot.NoPage(index) + + pywikibot.output('uploading text from %s to %s' + % (djvu.file_djvu, index_page.title(asLink=True))) + + bot = DjVuTextBot(djvu, index_page, pages, **options) + bot.run() + +if __name__ == '__main__': + try: + main() + except Exception: + pywikibot.error('Fatal error:', exc_info=True) diff --git a/tests/data/djvu/myfile.djvu b/tests/data/djvu/myfile.djvu new file mode 100755 index 0000000..eedbbcb --- /dev/null +++ b/tests/data/djvu/myfile.djvu Binary files differ diff --git a/tests/data/djvu/myfile_wo_text.djvu b/tests/data/djvu/myfile_wo_text.djvu new file mode 100644 index 0000000..a40d16b --- /dev/null +++ b/tests/data/djvu/myfile_wo_text.djvu Binary files differ diff --git a/tests/djvu_tests.py b/tests/djvu_tests.py new file mode 100644 index 0000000..a3dcc43 --- /dev/null +++ b/tests/djvu_tests.py @@ -0,0 +1,85 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +"""Unit tests for djvutext.py script.""" + +# +# (C) Pywikibot team, 2015 +# +# Distributed under the terms of the MIT license. 
+# + +from __future__ import unicode_literals + +import os +import subprocess + +from tests import _data_dir +from tests.aspects import unittest, TestCase +from pywikibot.tools.djvu import DjVuFile + +_djvu_dir = 'djvu' + + +class TestDjVuFile(TestCase): + + """Test DjVuFile class.""" + + net = False + + file_djvu_not_existing = os.path.join(_data_dir, _djvu_dir, 'not_existing.djvu') + file_djvu = os.path.join(_data_dir, _djvu_dir, 'myfile.djvu') + file_djvu_wo_text = os.path.join(_data_dir, _djvu_dir, 'myfile_wo_text.djvu') + test_txt = 'A file with non-ASCII characters, \nlike é or ç' + + @classmethod + def setUpClass(cls): + """SetUp tests.""" + super(TestDjVuFile, cls).setUpClass() + try: + subprocess.Popen(['djvudump'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except OSError: + raise unittest.SkipTest('djvulibre library not installed.') + + def test_file_existance(self): + """Test file existence checks.""" + djvu = DjVuFile(self.file_djvu) + self.assertEqual(self.file_djvu, djvu.file_djvu) + self.assertRaises(IOError, DjVuFile, self.file_djvu_not_existing) + + def test_number_of_images(self): + """Test page number generator.""" + djvu = DjVuFile(self.file_djvu) + self.assertEqual(djvu.number_of_images(), 4) + + def test_has_text(self): + """Test if djvu file contains text.""" + djvu = DjVuFile(self.file_djvu) + self.assertTrue(djvu.has_text()) + djvu = DjVuFile(self.file_djvu_wo_text) + self.assertFalse(djvu.has_text()) + + def test_get_existing_page_number(self): + """Test if djvu file contains text.""" + djvu = DjVuFile(self.file_djvu) + self.assertTrue(djvu.has_text()) + txt = djvu.get_page(1) + self.assertEqual(txt, self.test_txt) + + def test_get_not_existing_page_number(self): + """Test if djvu file contains text.""" + djvu = DjVuFile(self.file_djvu) + self.assertTrue(djvu.has_text()) + self.assertRaises(ValueError, djvu.get_page, 100) + + def test_get_not_existing_page(self): + """Test if djvu file contains text.""" + djvu = 
DjVuFile(self.file_djvu_wo_text) + self.assertFalse(djvu.has_text()) + self.assertRaises(ValueError, djvu.get_page, 100) + +if __name__ == '__main__': + try: + unittest.main() + except SystemExit: + pass -- To view, visit https://gerrit.wikimedia.org/r/210808 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I88ba445fd49046430dfcb78d5b8a0ab46e2343fb Gerrit-PatchSet: 17 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Mpaa <[email protected]> Gerrit-Reviewer: John Vandenberg <[email protected]> Gerrit-Reviewer: Ladsgroup <[email protected]> Gerrit-Reviewer: Merlijn van Deen <[email protected]> Gerrit-Reviewer: Mpaa <[email protected]> Gerrit-Reviewer: Ricordisamoa <[email protected]> Gerrit-Reviewer: XZise <[email protected]> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ Pywikibot-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits
