jenkins-bot has submitted this change. ( 
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/776176 )

Change subject: [IMPR] Port CommonsDelinker to core
......................................................................

[IMPR] Port CommonsDelinker to core

This is an initial rewrite of compat's CommonsDelinker.
It reads the local deletion log and shared repository deletion log and
delinks local references.

Also backport image_regex to image.py

Bug: T299563
Change-Id: Ib7b7405115b485d4f404aedecc0146bb30c21468
---
M docs/scripts/unsorted.rst
M docs/scripts_ref/scripts.rst
M scripts/README.rst
A scripts/delinker.py
M scripts/image.py
M tests/script_tests.py
6 files changed, 181 insertions(+), 2 deletions(-)

Approvals:
  Rubin: Looks good to me, but someone else must approve
  Xqt: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/docs/scripts/unsorted.rst b/docs/scripts/unsorted.rst
index ec81ef4..f7686d2 100644
--- a/docs/scripts/unsorted.rst
+++ b/docs/scripts/unsorted.rst
@@ -19,6 +19,11 @@
 .. automodule:: scripts.coordinate_import
    :no-members:
 
+delinker script
+---------------
+.. automodule:: scripts.delinker
+   :no-members:
+
 djvutext script
 ---------------
 .. automodule:: scripts.djvutext
diff --git a/docs/scripts_ref/scripts.rst b/docs/scripts_ref/scripts.rst
index b40ab91..d4c262d 100644
--- a/docs/scripts_ref/scripts.rst
+++ b/docs/scripts_ref/scripts.rst
@@ -97,6 +97,11 @@

 .. automodule:: scripts.delete

+delinker script
+---------------
+
+.. automodule:: scripts.delinker
+
 djvutext script
 ---------------
 .. automodule:: scripts.djvutext
diff --git a/scripts/README.rst b/scripts/README.rst
index d8311a4..7282cc6 100644
--- a/scripts/README.rst
+++ b/scripts/README.rst
@@ -58,6 +58,8 @@
     
+------------------------+---------------------------------------------------------+
     | delete.py              | This script can be used to delete pages en 
masse.       |
     
+------------------------+---------------------------------------------------------+
+    | delinker.py            | Delink file references of deleted images.       
        |
+    
+------------------------+---------------------------------------------------------+
     | djvutext.py            | Extracts OCR text from djvu files and uploads 
onto      |
     |                        | pages in the "Page" namespace on Wikisource.    
        |
     
+------------------------+---------------------------------------------------------+
diff --git a/scripts/delinker.py b/scripts/delinker.py
new file mode 100644
index 0000000..9ee7e59
--- /dev/null
+++ b/scripts/delinker.py
@@ -0,0 +1,165 @@
+#!/usr/bin/python3
+"""Delink removed files from wiki.
+
+This script keeps track of image deletions and delinks removed files
+from current wiki in namespace 0. This script is suitable to delink
+files from an image repository as well as for local images.
+
+The following parameters are supported:
+
+-exclude:   If the deletion log contains this pattern, the file is not
+            delinked (default is 'no-delink').
+
+-localonly  Retrieve deleted File pages from local log only
+
+-since:     Start the deletion log with this timestamp given in MediaWiki
+            timestamp format. If no `-since` option is given, the start
+            timestamp is read from the settings file. If the option is empty,
+            the processing starts from the very beginning. If the script
+            stops, the last timestamp is written to the settings file and
+            the next script call starts there if no `-since` is given.
+
+.. note:: This script is a
+   :class:`ConfigParserBot <pywikibot.bot.ConfigParserBot>`. All
+   settings can be made either by giving options on the command line or
+   with a settings file which is scripts.ini by default. If you don't
+   want the default values you can add any option you want to change to
+   that settings file below the [delinker] section.
+
+.. versionadded:: 7.2
+   This script is completely rewritten from compat branch.
+"""
+#
+# (C) Pywikibot team, 2006-2022
+#
+# Distributed under the terms of the MIT license.
+#
+import configparser
+import heapq
+import re
+
+import pywikibot
+from pywikibot.backports import removeprefix
+from pywikibot.bot import (
+    ConfigParserBot,
+    AutomaticTWSummaryBot,
+    SingleSiteBot,
+    calledModuleName,
+)
+from pywikibot.textlib import case_escape, ignore_case, replaceExcept
+from pywikibot.tools.formatter import color_format
+
+
+class CommonsDelinker(SingleSiteBot, ConfigParserBot, AutomaticTWSummaryBot):
+
+    """Bot that removes links to deleted files from namespace 0 pages."""
+
+    update_options = {
+        'exclude': 'no-delink',
+        'localonly': False,
+        'since': '',
+    }
+    summary_key = 'delinker-delink'
+
+    @property
+    def generator(self):
+        """Read deletion logs and yield the oldest entry first."""
+        ts = (pywikibot.Timestamp.fromtimestampformat(self.opt.since)
+              if self.opt.since else None)
+        params = {
+            'logtype': 'delete',
+            'namespace': 6,  # File namespace
+            'reverse': True,
+            'start': ts,
+        }
+
+        iterables = [self.site.logevents(**params)]
+        repo = self.site.image_repository() if not self.opt.localonly else None
+        if repo:
+            iterables.append(repo.logevents(**params))
+
+        for entry in heapq.merge(*iterables,
+                                 key=lambda event: event.timestamp()):
+            self.last_ts = entry.timestamp()  # written back by teardown()
+            if entry['action'] == 'delete' \
+               and self.opt.exclude not in entry.get('comment', ''):
+                yield entry
+
+    def init_page(self, item) -> 'pywikibot.page.FilePage':
+        """Upcast logevent to FilePage and combine edit summary."""
+        self.summary_parameters = dict(item)
+        return pywikibot.FilePage(self.site, item['title'])
+
+    def skip_page(self, page) -> bool:
+        """Skip files that still exist locally or on the shared repository."""
+        pywikibot.output('.', newline=False)
+        if page.file_is_shared() or page.exists():
+            return True
+        return super().skip_page(page)
+
+    def treat(self, file_page):
+        """Delink the given file from all namespace 0 pages using it."""
+        # use image_regex from image.py
+        namespace = file_page.site.namespaces[6]
+        escaped = case_escape(namespace.case, file_page.title(with_ns=False))
+        # Be careful, spaces and _ have been converted to '\ ' and '\_'
+        escaped = re.sub('\\\\[_ ]', '[_ ]', escaped)
+        self.image_regex = re.compile(
+            r'\[\[ *(?:{})\s*:\s*{} *(?P<parameters>\|'
+            r'(?:[^\[\]]|\[\[[^\]]+\]\]|\[[^\]]+\])*|) *\]\]'
+            .format('|'.join(ignore_case(s) for s in namespace), escaped))
+
+        shown = False
+        for page in file_page.usingPages(content=True, namespaces=0):
+            if not shown:
+                pywikibot.output(
+                    color_format('\n>>> {lightgreen}Delinking {}{default} <<<',
+                                 file_page.title()))
+                shown = True
+            super().treat(page)
+
+    def treat_page(self):
+        """Delink a single page."""
+        new = replaceExcept(self.current_page.text, self.image_regex, '', [])
+        self.put_current(new)
+
+    def teardown(self):
+        """Save the last used logevent timestamp."""
+        if not hasattr(self, 'last_ts'):
+            return
+
+        pywikibot.output("\nUpdate 'since' to {} file".format(self.INI))
+        conf = configparser.ConfigParser(inline_comment_prefixes=[';'])
+        conf.read(self.INI)
+        section = calledModuleName()
+        if not conf.has_section(section):
+            conf.add_section(section)
+        conf.set(section, 'since', self.last_ts.totimestampformat())
+        with open(self.INI, 'w') as f:
+            conf.write(f)
+
+
+def main(*args: str) -> None:
+    """
+    Process command line arguments and invoke bot.
+
+    If args is an empty list, sys.argv is used.
+
+    :param args: command line arguments
+    """
+    options = {}
+    local_args = pywikibot.handle_args(args)
+    for arg in local_args:
+        opt, _, value = arg.partition(':')
+        opt = removeprefix(opt, '-')
+        if opt == 'localonly':
+            options[opt] = True
+        else:
+            options[opt] = value
+
+    bot = CommonsDelinker(site=pywikibot.Site(), **options)
+    bot.run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/image.py b/scripts/image.py
index 84785e7..f29c5f7 100755
--- a/scripts/image.py
+++ b/scripts/image.py
@@ -91,7 +91,8 @@
         escaped = re.sub('\\\\[_ ]', '[_ ]', escaped)
         if not self.opt.loose or not self.new_image:
             image_regex = re.compile(
-                r'\[\[ *(?:{})\s*:\s*{} *(?P<parameters>\|[^\n]+?|) *\]\]'
+                r'\[\[ *(?:{})\s*:\s*{} *(?P<parameters>\|'
+                r'(?:[^\[\]]|\[\[[^\]]+\]\]|\[[^\]]+\])*|) *\]\]'
                 .format('|'.join(ignore_case(s) for s in namespace), escaped))
         else:
             image_regex = re.compile(r'' + escaped)
diff --git a/tests/script_tests.py b/tests/script_tests.py
index 2ab29cd..5b7e1b4 100755
--- a/tests/script_tests.py
+++ b/tests/script_tests.py
@@ -79,13 +79,14 @@
     'category_redirect',
     'checkimages',
     'clean_sandbox',
+    'delinker',
     'login',
     'misspelling',
-    'revertbot',
     'noreferences',
     'nowcommons',
     'parser_function_count',
     'patrol',
+    'revertbot',
     'shell',
     'unusedfiles',
     'upload',

--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/776176
To unsubscribe, or for help writing mail filters, visit 
https://gerrit.wikimedia.org/r/settings

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ib7b7405115b485d4f404aedecc0146bb30c21468
Gerrit-Change-Number: 776176
Gerrit-PatchSet: 20
Gerrit-Owner: Xqt <[email protected]>
Gerrit-Reviewer: D3r1ck01 <[email protected]>
Gerrit-Reviewer: MarcoAurelio <[email protected]>
Gerrit-Reviewer: Rubin <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: Zabe <[email protected]>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Matěj Suchánek <[email protected]>
Gerrit-MessageType: merged
_______________________________________________
Pywikibot-commits mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to