jenkins-bot has submitted this change. ( 
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/483939 )

Change subject: [IMPR] use textfile for interwiki dumps
......................................................................

[IMPR] use textfile for interwiki dumps

Also enable -restore:all option

The pickle format was introduced with the core branch for interwiki dumps,
but the text format is more practical for bot owners. Go back to the old
compat format.

- Introduce a new interwikidumps.py maintenance script
  which converts dump files from old pickle format to new txt format
- All dump handling is done by the new InterwikiDumps class
- Remove InterwikiBot.dump() method which is no longer needed
- Turn the InterwikiBot.dump_titles property into a generator yielding
  all page titles to be dumped

Bug: T74943
Bug: T213624
Change-Id: Ie7380d587aab42ace158335de4f41fe9a5709700
---
M docs/scripts/scripts.maintenance.rst
M scripts/README.rst
M scripts/interwiki.py
A scripts/maintenance/interwikidumps.py
M tox.ini
5 files changed, 245 insertions(+), 63 deletions(-)

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/docs/scripts/scripts.maintenance.rst b/docs/scripts/scripts.maintenance.rst
index b240a49..c7e5bf9 100644
--- a/docs/scripts/scripts.maintenance.rst
+++ b/docs/scripts/scripts.maintenance.rst
@@ -22,6 +22,11 @@

 .. automodule:: scripts.maintenance.compat2core

+scripts.maintenance.interwikidumps script
+-----------------------------------------
+
+.. automodule:: scripts.maintenance.interwikidumps
+
 scripts.maintenance.make\_i18n\_dict script
 -------------------------------------------

diff --git a/scripts/README.rst b/scripts/README.rst
index 5130594..c8df3da 100644
--- a/scripts/README.rst
+++ b/scripts/README.rst
@@ -281,6 +281,8 @@
     | compat2core.py         | Helper script to convert compat 1.0 scripts to the core |
     |                        | 3.0 framework. Also works for newer Pywikibot releases. |
     +------------------------+---------------------------------------------------------+
+    | interwikidumps.py      | Convert interwiki dumps from pickle to txt format.      |
+    +------------------------+---------------------------------------------------------+
     | make_i18n_dict.py      | Generate a i18n file from a given script.               |
     +------------------------+---------------------------------------------------------+
     | sorting_order.py       | Updates interwiki sorting order in family.py file.      |
diff --git a/scripts/interwiki.py b/scripts/interwiki.py
index 6cf62dd..08ab1e3 100755
--- a/scripts/interwiki.py
+++ b/scripts/interwiki.py
@@ -335,7 +335,6 @@
 #
 import codecs
 import os
-import pickle
 import re
 import socket
 import sys
@@ -350,7 +349,7 @@
 from pywikibot import config, i18n, pagegenerators, textlib, interwiki_graph
 from pywikibot import titletranslate

-from pywikibot.bot import ListOption, StandardOption
+from pywikibot.bot import OptionHandler, ListOption, StandardOption
 from pywikibot.cosmetic_changes import moved_links
 from pywikibot.tools import first_upper
 from pywikibot.tools.formatter import color_format
@@ -1935,26 +1934,8 @@

     @property
     def dump_titles(self):
-        """Return list of titles for dump file."""
-        return [s.origin.title() for s in self.subjects]
-
-    def dump(self, append=True):
-        """Write dump file."""
-        site = pywikibot.Site()
-        dumpfn = pywikibot.config.datafilepath(
-            'data',
-            'interwiki-dumps',
-            '{0}-{1}.pickle'.format(site.family.name, site.code)
-        )
-        if append:
-            mode = 'appended'
-        else:
-            mode = 'written'
-        with open(dumpfn, mode[0] + 'b') as f:
-            pickle.dump(self.dump_titles, f, protocol=config.pickle_protocol)
-        pywikibot.output('Dump {0} ({1}) {2}.'
-                         .format(site.code, site.family.name, mode))
-        return dumpfn
+        """Return generator of titles for dump file."""
+        return (s.origin.title(as_link=True) for s in self.subjects)

     def generateMore(self, number):
         """Generate more subjects.
@@ -2264,6 +2245,141 @@
     return False


+class InterwikiDumps(OptionHandler):
+
+    """Handle interwiki dumps."""
+
+    available_options = {
+        'do_continue': False,
+        'restore_all': False
+    }
+
+    FILE_PATTERN = '{site.family.name}-{site.code}.txt'
+
+    def __init__(self, **kwargs):
+        """Initializer.
+
+        @keyword do_continue: If true, continue alphabetically starting at the
+            last of the dumped pages.
+        """
+        self.site = kwargs.pop('site', pywikibot.Site())
+        super().__init__(**kwargs)
+
+        self.restored_files = set()
+        self._next_page = '!'
+        self._next_namespace = 0
+        self.path = pywikibot.config.datafilepath('data', 'interwiki-dumps')
+
+    @property
+    def next_page(self):
+        """Return next page title string for continue option."""
+        if self._next_page == '!':
+            pywikibot.output('Dump file is empty! Starting at the beginning.')
+        return self._next_page
+
+    @property
+    def next_namespace(self):
+        """Return next page namespace for continue option."""
+        return self._next_namespace
+
+    def remove(self, filename: str):
+        """Remove filename from restored files.
+
+        @param filename: A filename to be removed from restored set.
+        """
+        with suppress(KeyError):
+            self.restored_files.remove(filename)
+
+    def get_files(self, mode='txt'):
+        """Get dump files from directory."""
+        pattern = (r'(?P<file>\A(?P<fam>[a-z]+)-(?P<code>[a-z]+)\.{}\Z)'
+                   .format(mode))
+        for filename in os.listdir(self.path):
+            found = re.match(pattern, filename)
+            if found:
+                yield (found['file'],
+                       pywikibot.Site(found['code'], found['fam']))
+
+    @property
+    def files(self):
+        """Return file generator depending on restore_all option.
+
+        @rtype: generator
+        """
+        if self.opt.restore_all:
+            return self.get_files()
+        return iter([(self.FILE_PATTERN.format(site=self.site), self.site)])
+
+    def read_dump(self):
+        """Read the dump file.
+
+        @rtype: generator
+        """
+        for tail, site in self.files:
+            filename = os.path.join(self.path, tail)
+
+            if not os.path.exists(filename):
+                pywikibot.output(tail + ' does not exist.')
+            else:
+                pywikibot.output('Retrieving pages from dump file ' + tail)
+                for page in pagegenerators.TextfilePageGenerator(
+                        filename, site):
+                    if site == self.site:
+                        self._next_page = page.title(with_ns=False) + '!'
+                        self._next_namespace = page.namespace()
+                    yield page
+                else:
+                    self.restored_files.add(filename)
+
+        if self.opt.do_continue:
+            yield from self.site.allpages(start=self.next_page,
+                                          namespace=self.next_namespace,
+                                          filterredir=False)
+
+    def write_dump(self, iterable, append: bool = True):
+        """Write dump file.
+
+        @param iterable: an iterable of page titles to be dumped.
+        @type iterable: iterable
+        @param append: if a dump already exists, append the page titles to it
+            if True else overwrite it.
+        """
+        filename = os.path.join(self.path,
+                                self.FILE_PATTERN.format(site=self.site))
+        mode = 'appended' if append else 'written'
+        with codecs.open(filename, mode[0], 'utf-8') as f:
+            f.write('\r\n'.join(iterable))
+            f.write('\r\n')
+        pywikibot.output('Dump {site.code} ({site.family.name}) {mode}.'
+                         .format(site=self.site, mode=mode))
+        self.remove(filename)
+
+    def delete_dumps(self):
+        """Delete processed dumps."""
+        for filename in self.restored_files:
+            tail = os.path.split(filename)[-1]
+            try:
+                os.remove(filename)
+                pywikibot.output('Dumpfile {0} deleted'.format(tail))
+            except OSError as e:
+                pywikibot.error('Cannot delete {} due to\n{}\nDo it manually.'
+                                .format(tail, e))
+
+    def old_dumps_found(self) -> bool:
+        """Check whether dumps are in old format.
+
+        @return: True if there are dumps in pickle format, False otherwise
+        """
+        try:
+            next(self.get_files(mode='pickle'))
+        except StopIteration:
+            return False
+        pywikibot.warning(fill(
+            'The pickle format is deprecated. Use maintenance script '
+            'interwikidumps.py to convert pickle files into text files.'))
+        return True
+
+
 def main(*args):
     """
     Process command line arguments and invoke bot.
@@ -2284,8 +2400,6 @@
     hintlessPageGen = None
     optContinue = False
     optRestore = False
-    restoredFiles = []
-    dumpFileName = ''
     append = True
     newPages = None

@@ -2356,6 +2470,9 @@
     mainpagename = site.siteinfo['mainpage']
     iwconf.skip.add(pywikibot.Page(site, mainpagename))

+    dump = InterwikiDumps(site=site, do_continue=optContinue,
+                          restore_all=iwconf.restore_all)
+
     if newPages is not None:
         if len(namespaces) == 0:
             ns = 0
@@ -2374,34 +2491,10 @@
                                                                namespaces=ns)

     elif optRestore or optContinue or iwconf.restore_all:
-        dumpFileName = pywikibot.config.datafilepath(
-            'data',
-            'interwiki-dumps',
-            '{0}-{1}.pickle'.format(site.family.name, site.code)
-        )
-        try:
-            with open(dumpFileName, 'rb') as f:
-                dumpedTitles = pickle.load(f)
-        except (EOFError, IOError):
-            dumpedTitles = []
-        pages = [pywikibot.Page(site, title) for title in dumpedTitles]
-
-        hintlessPageGen = iter(pages)
-        if optContinue:
-            if pages:
-                last = pages[-1]
-                nextPage = last.title(with_ns=False) + '!'
-                namespace = last.namespace()
-            else:
-                pywikibot.output(
-                    'Dump file is empty?! Starting at the beginning.')
-                nextPage = '!'
-                namespace = 0
-            gen2 = site.allpages(start=nextPage,
-                                 namespace=namespace,
-                                 filterredir=False)
-            hintlessPageGen = chain(hintlessPageGen, gen2)
-        restoredFiles.append(dumpFileName)
+        if dump.old_dumps_found():
+            # There are dumps in pickle format; they must be converted first.
+            return
+        hintlessPageGen = dump.read_dump()

     bot = InterwikiBot(iwconf)

@@ -2426,21 +2519,14 @@
     try:
         bot.run()
     except KeyboardInterrupt:
-        dumpFileName = bot.dump(append)
+        dump.write_dump(bot.dump_titles, append)
     except Exception:
         pywikibot.exception()
-        dumpFileName = bot.dump(append)
+        dump.write_dump(bot.dump_titles, append)
     else:
         pywikibot.output('Script terminated sucessfully.')
     finally:
-        if dumpFileName:
-            with suppress(ValueError):
-                restoredFiles.remove(dumpFileName)
-        for dumpFileName in restoredFiles:
-            with suppress(OSError):
-                os.remove(dumpFileName)
-                pywikibot.output('Dumpfile {0} deleted'
-                                 .format(dumpFileName.split('\\')[-1]))
+        dump.delete_dumps()


 if __name__ == '__main__':
diff --git a/scripts/maintenance/interwikidumps.py b/scripts/maintenance/interwikidumps.py
new file mode 100644
index 0000000..7a1b5cf
--- /dev/null
+++ b/scripts/maintenance/interwikidumps.py
@@ -0,0 +1,89 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Script to convert interwiki dumps from pickle format to txt format."""
+#
+# (C) Pywikibot team, 2019-2020
+#
+# Distributed under the terms of the MIT license.
+#
+import codecs
+import os
+import pickle
+import re
+
+import pywikibot
+
+from pywikibot import config2 as config
+
+
+def pickle_files(path):
+    """Retrieve pickle files."""
+    pattern = r'(?P<old>(?P<new>\A(?P<fam>[a-z]+)-(?P<code>[a-z]+)\.)pickle\Z)'
+    for filename in os.listdir(path):
+        found = re.match(pattern, filename)
+        if not found:
+            continue
+
+        old = found['old']
+        if os.path.exists(os.path.join(path, old)):
+            yield (old, found['new'] + 'txt',
+                   pywikibot.Site(found['code'], found['fam']))
+
+
+def read_content(filename):
+    """Read content of pickle file."""
+    try:
+        with open(filename, 'rb') as f:
+            titles = pickle.load(f)
+    except (EOFError, IOError):
+        pywikibot.exception()
+        titles = None
+    return titles
+
+
+def write_content(filename, site, content):
+    """Write content to txt file."""
+    titles = [pywikibot.Page(site, title).title(as_link=True)
+              for title in content]
+    with codecs.open(filename, 'w', 'utf-8') as f:
+        f.write('\r\n'.join(titles))
+        f.write('\r\n')
+
+
+def convert_dumps():
+    """Convert interwikidump from pickle format to txt format."""
+    folder = config.datafilepath('data', 'interwiki-dumps')
+    for old_file, new_file, site in pickle_files(folder):
+        # read old file
+        pywikibot.output('\nReading {}...'.format(old_file))
+        old_filepath = os.path.join(folder, old_file)
+        titles = read_content(old_filepath)
+
+        if not titles:
+            pywikibot.error('Unable to read ' + old_file)
+            continue
+
+        # write new file
+        pywikibot.output('Writing {}...'.format(new_file))
+        write_content(os.path.join(folder, new_file), site, titles)
+
+        # delete old file
+        try:
+            os.remove(old_filepath)
+            pywikibot.output('Old dumpfile {} deleted'.format(old_file))
+        except OSError as e:
+            pywikibot.error('Cannot delete {} due to\n{}\nDo it manually.'
+                            .format(old_file, e))
+
+
+def main(*args):
+    """Main function."""
+    args = pywikibot.argvu[1:]
+    if args and args[0] == '-help':
+        pywikibot.output(__doc__)
+    else:
+        convert_dumps()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tox.ini b/tox.ini
index 6f62d87..bdac3fa 100644
--- a/tox.ini
+++ b/tox.ini
@@ -175,7 +175,7 @@
     scripts/imagerecat.py : N803, N806, N802
     scripts/imagetransfer.py : N803, N806, N816
     scripts/imageuncat.py: N802, N816
-    scripts/interwiki.py : N802, N803, N806, N816
+    scripts/interwiki.py : N802, N803, N806, N815, N816
     scripts/isbn.py : N802, N803, N806, N816
     scripts/maintenance/* : T001
     scripts/maintenance/download_dump.py : N815

--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/483939
To unsubscribe, or for help writing mail filters, visit 
https://gerrit.wikimedia.org/r/settings

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ie7380d587aab42ace158335de4f41fe9a5709700
Gerrit-Change-Number: 483939
Gerrit-PatchSet: 15
Gerrit-Owner: Xqt <[email protected]>
Gerrit-Reviewer: Dalba <[email protected]>
Gerrit-Reviewer: Dvorapa <[email protected]>
Gerrit-Reviewer: JAn Dudík <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
_______________________________________________
Pywikibot-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits

Reply via email to