jenkins-bot has submitted this change. (https://gerrit.wikimedia.org/r/c/pywikibot/core/+/616214)
Change subject: [4.0] remove Python 2 related code in weblinkchecker.py
......................................................................
[4.0] remove Python 2 related code in weblinkchecker.py
- also print a FutureWarning for the deprecated LinkChecker class,
  which is not used inside the script itself
- reduce nesting of flow statements in DeadLinkReportThread.run
- speed up the inner loop of treat_page
Change-Id: I8c601223ef31974bf78becfec06cccb276dd3bd4
---
M scripts/weblinkchecker.py
1 file changed, 100 insertions(+), 126 deletions(-)
Approvals:
Hazard-SJ: Looks good to me, but someone else must approve
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index 9b5e9de..8d4fd46 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -109,17 +109,32 @@
#
# Distributed under the terms of the MIT license.
#
-from __future__ import absolute_import, division, unicode_literals
-
import codecs
import datetime
+import http.client as httpclient
import pickle
import re
import socket
import threading
import time
+from contextlib import suppress
from functools import partial
+from typing import Optional, Tuple
+from urllib.parse import urlsplit
+from urllib.request import quote
+
+import requests
+
+import pywikibot
+
+from pywikibot import comms, i18n, config, pagegenerators, textlib, config2
+from pywikibot.bot import ExistingPageBot, SingleSiteBot, suggest_help
+from pywikibot.pagegenerators import (
+ XMLDumpPageGenerator as _XMLDumpPageGenerator,
+)
+from pywikibot.tools import deprecated
+from pywikibot.tools.formatter import color_format
try:
import memento_client
@@ -127,29 +142,6 @@
except ImportError as e:
memento_client = e
-import pywikibot
-
-from pywikibot import (
- comms, i18n, config, pagegenerators, textlib, config2,
-)
-
-from pywikibot.bot import ExistingPageBot, SingleSiteBot, suggest_help
-from pywikibot.pagegenerators import (
- XMLDumpPageGenerator as _XMLDumpPageGenerator,
-)
-from pywikibot.tools import deprecated, PY2
-from pywikibot.tools.formatter import color_format
-
-import requests
-
-if not PY2:
- import http.client as httplib
- import urllib.parse as urlparse
- import urllib.request as urllib
-else:
- import httplib
- import urllib
- import urlparse
docuReplacements = {'¶ms;': pagegenerators.parameterHelp} # noqa: N816
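For orientation on the consolidated imports: on Python 3 the old httplib/urlparse/urllib names live in http.client, urllib.parse and urllib.request, so the PY2 fallback block can go. A minimal sketch of the renamed modules in use; the host name and URL are illustrative only:

    import http.client
    from urllib.parse import urlsplit
    from urllib.request import quote

    conn = http.client.HTTPSConnection('example.org')     # was httplib
    parts = urlsplit('https://example.org/w?x=1#top')      # was urlparse.urlsplit
    encoded = quote('/p\u00e4th'.encode('utf-8'))          # was urllib.quote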
@@ -288,8 +280,8 @@
"""The link is not an URL."""
-@deprecated('requests', since='20160120')
-class LinkChecker(object):
+@deprecated('requests', since='20160120', future_warning=True)
+class LinkChecker:
"""
Check links.
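For readers unfamiliar with the decorator: future_warning=True makes the deprecation emit a FutureWarning, which Python displays by default, whereas a plain DeprecationWarning is normally suppressed outside __main__. A minimal sketch of the idea, not pywikibot's actual implementation (which also takes the since argument shown above):

    import warnings
    from functools import wraps

    def deprecated(replacement, future_warning=False):
        """Illustrative stand-in for pywikibot.tools.deprecated."""
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                warnings.warn(
                    '{} is deprecated; use {} instead.'.format(
                        func.__name__, replacement),
                    FutureWarning if future_warning else DeprecationWarning,
                    stacklevel=2)
                return func(*args, **kwargs)
            return wrapper
        return decorator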
@@ -342,16 +334,16 @@
def getConnection(self):
"""Get a connection."""
if self.scheme == 'http':
- return httplib.HTTPConnection(self.host)
+ return httpclient.HTTPConnection(self.host)
elif self.scheme == 'https':
- return httplib.HTTPSConnection(self.host)
+ return httpclient.HTTPSConnection(self.host)
else:
raise NotAnURLError(self.url)
def getEncodingUsedByServer(self):
"""Get encodung used by server."""
if not self.serverEncoding:
- try:
+ with suppress(Exception):
pywikibot.output(
'Contacting server %s to find out its default encoding...'
% self.host)
@@ -359,8 +351,7 @@
conn.request('HEAD', '/', None, self.header)
self.response = conn.getresponse()
self.readEncodingFromResponse(self.response)
- except Exception:
- pass
+
if not self.serverEncoding:
# TODO: We might also load a page, then check for an encoding
            # definition in an HTML meta tag.
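contextlib.suppress behaves exactly like the try/except/pass it replaces, just one nesting level flatter. The equivalence in isolation (risky() is a stand-in for the HEAD request above):

    from contextlib import suppress

    def risky():
        raise OSError('server unreachable')  # stand-in failure

    # Before: ignore the error with an explicit handler
    try:
        risky()
    except Exception:
        pass

    # After: identical semantics, less indentation
    with suppress(Exception):
        risky()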
@@ -373,20 +364,18 @@
def readEncodingFromResponse(self, response):
"""Read encoding from response."""
if not self.serverEncoding:
- try:
+ with suppress(Exception):
ct = response.getheader('Content-Type')
charsetR = re.compile('charset=(.+)')
charset = charsetR.search(ct).group(1)
self.serverEncoding = charset
- except Exception:
- pass
def changeUrl(self, url):
"""Change url."""
self.url = url
# we ignore the fragment
(self.scheme, self.host, self.path, self.query,
- self.fragment) = urlparse.urlsplit(self.url)
+ self.fragment) = urlsplit(self.url)
if not self.path:
self.path = '/'
if self.query:
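urlsplit returns its parts in exactly the order changeUrl unpacks them: scheme, netloc (host), path, query, fragment. For example:

    from urllib.parse import urlsplit

    scheme, host, path, query, fragment = urlsplit(
        'https://example.org/wiki/Foo?action=history#top')
    # scheme='https', host='example.org', path='/wiki/Foo',
    # query='action=history', fragment='top'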
@@ -399,17 +388,15 @@
self.query.encode('ascii')
except UnicodeEncodeError:
encoding = self.getEncodingUsedByServer()
- self.path = urllib.quote(self.path.encode(encoding))
- self.query = urllib.quote(self.query.encode(encoding), '=&')
+ self.path = quote(self.path.encode(encoding))
+ self.query = quote(self.query.encode(encoding), '=&')
- def resolveRedirect(self, useHEAD=False):
+ def resolveRedirect(self, useHEAD=False) -> Optional[str]:
"""
        Return the redirect target URL as a string, if it is an HTTP redirect.
If useHEAD is true, uses the HTTP HEAD method, which saves bandwidth
by not downloading the body. Otherwise, the HTTP GET method is used.
-
- @rtype: str or None
"""
conn = self.getConnection()
try:
@@ -422,7 +409,7 @@
self.response = conn.getresponse()
# read the server's encoding, in case we need it later
self.readEncodingFromResponse(self.response)
- except httplib.BadStatusLine:
+ except httpclient.BadStatusLine:
# Some servers don't seem to handle HEAD requests properly,
# e.g. http://www.radiorus.ru/ which is running on a very old
# Apache server. Using GET instead works on these (but it uses
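The surrounding logic, condensed: try HEAD first and retry as GET when the server answers with a malformed status line. A simplified sketch of the pattern; fetch_headers and make_conn are illustrative helpers, not names from the script:

    import http.client as httpclient

    def fetch_headers(make_conn, path, headers, use_head=True):
        """Try HEAD; fall back to GET for servers that mishandle HEAD."""
        conn = make_conn()  # fresh connection per attempt
        method = 'HEAD' if use_head else 'GET'
        try:
            conn.request(method, path, None, headers)
            return conn.getresponse()
        except httpclient.BadStatusLine:
            if use_head:
                return fetch_headers(make_conn, path, headers,
                                     use_head=False)
            raise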
@@ -467,18 +454,14 @@
else:
return False # not a redirect
- def check(self, useHEAD=False):
- """
- Return True and the server status message if the page is alive.
-
- @rtype: tuple of (bool, unicode)
- """
+ def check(self, useHEAD=False) -> Tuple[bool, str]:
+ """Return True and the server status message if the page is alive."""
try:
wasRedirected = self.resolveRedirect(useHEAD=useHEAD)
except UnicodeError as error:
return False, 'Encoding Error: {0} ({1})'.format(
error.__class__.__name__, error)
- except httplib.error as error:
+ except httpclient.error as error:
return False, 'HTTP Error: {}'.format(error.__class__.__name__)
except socket.error as error:
# https://docs.python.org/3/library/socket.html :
@@ -542,7 +525,7 @@
else:
try:
conn = self.getConnection()
- except httplib.error as error:
+ except httpclient.error as error:
return False, 'HTTP Error: {0}'.format(
error.__class__.__name__)
try:
@@ -573,7 +556,7 @@
def __init__(self, page, url, history, HTTPignore, day):
"""Initializer."""
- threading.Thread.__init__(self)
+ super().__init__()
self.page = page
self.url = url
self.history = history
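Zero-argument super() is the Python 3 idiom: it resolves the class and instance from the enclosing scope, so spelling out threading.Thread.__init__(self) is unnecessary. In isolation (Worker is an illustrative name):

    import threading

    class Worker(threading.Thread):

        def __init__(self, task):
            super().__init__()  # replaces threading.Thread.__init__(self)
            self.task = task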
@@ -625,7 +608,7 @@
config.weblink_dead_days)
-class History(object):
+class History:
"""
Store previously found dead links.
@@ -736,15 +719,11 @@
"""
if url in self.historyDict:
with self.semaphore:
- try:
+ with suppress(KeyError): # Not sure why this can happen
del self.historyDict[url]
- except KeyError:
- # Not sure why this can happen, but I guess we can
- # ignore this.
- pass
return True
- else:
- return False
+
+ return False
def save(self):
"""Save the .dat file to disk."""
@@ -791,65 +770,64 @@
break
else:
time.sleep(0.1)
- else:
- with self.semaphore:
- url, errorReport, containingPage, archiveURL = \
- self.queue[0]
- self.queue = self.queue[1:]
- talkPage = containingPage.toggleTalkPage()
- pywikibot.output(color_format(
- '{lightaqua}** Reporting dead link on '
- '{0}...{default}',
- talkPage.title(as_link=True)))
- try:
- content = talkPage.get() + '\n\n\n'
- if url in content:
- pywikibot.output(color_format(
- '{lightaqua}** Dead link seems to have '
- 'already been reported on {0}{default}',
- talkPage.title(as_link=True)))
- continue
- except (pywikibot.NoPage, pywikibot.IsRedirectPage):
- content = ''
+ continue
- if archiveURL:
- archiveMsg = '\n' + \
- i18n.twtranslate(
- containingPage.site,
- 'weblinkchecker-archive_msg',
- {'URL': archiveURL})
- else:
- archiveMsg = ''
- # The caption will default to "Dead link". But if there
- # is already such a caption, we'll use "Dead link 2",
- # "Dead link 3", etc.
- caption = i18n.twtranslate(containingPage.site,
- 'weblinkchecker-caption')
- i = 1
- count = ''
- # Check if there is already such a caption on
- # the talk page.
- while re.search('= *{0}{1} *='.format(caption, count),
- content) is not None:
- i += 1
- count = ' ' + str(i)
- caption += count
- content += '== {0} ==\n\n{1}\n\n{2}{3}\n--~~~~'.format(
- caption, i18n.twtranslate(containingPage.site,
- 'weblinkchecker-report'),
- errorReport, archiveMsg)
-
- comment = '[[{0}#{1}|→]] {2}'.format(
- talkPage.title(), caption,
- i18n.twtranslate(containingPage.site,
- 'weblinkchecker-summary'))
- try:
- talkPage.put(content, comment)
- except pywikibot.SpamblacklistError as error:
+ with self.semaphore:
+ url, errorReport, containingPage, archiveURL = self.queue[0]
+ self.queue = self.queue[1:]
+ talkPage = containingPage.toggleTalkPage()
+ pywikibot.output(color_format(
+ '{lightaqua}** Reporting dead link on {}...{default}',
+ talkPage))
+ try:
+ content = talkPage.get() + '\n\n\n'
+ if url in content:
pywikibot.output(color_format(
- '{lightaqua}** SpamblacklistError while trying to '
- 'change {0}: {1}{default}',
- talkPage.title(as_link=True), error.url))
+ '{lightaqua}** Dead link seems to have '
+ 'already been reported on {}{default}',
+ talkPage))
+ continue
+ except (pywikibot.NoPage, pywikibot.IsRedirectPage):
+ content = ''
+
+ if archiveURL:
+ archiveMsg = '\n' + \
+ i18n.twtranslate(
+ containingPage.site,
+ 'weblinkchecker-archive_msg',
+ {'URL': archiveURL})
+ else:
+ archiveMsg = ''
+ # The caption will default to "Dead link". But if there
+ # is already such a caption, we'll use "Dead link 2",
+ # "Dead link 3", etc.
+ caption = i18n.twtranslate(containingPage.site,
+ 'weblinkchecker-caption')
+ i = 1
+ count = ''
+ # Check if there is already such a caption on
+ # the talk page.
+ while re.search('= *{0}{1} *='
+ .format(caption, count), content) is not None:
+ i += 1
+ count = ' ' + str(i)
+ caption += count
+ content += '== {0} ==\n\n{3}\n\n{1}{2}\n--~~~~'.format(
+ caption, errorReport, archiveMsg,
+ i18n.twtranslate(containingPage.site,
+ 'weblinkchecker-report'))
+
+ comment = '[[{0}#{1}|→]] {2}'.format(
+ talkPage.title(), caption,
+ i18n.twtranslate(containingPage.site,
+ 'weblinkchecker-summary'))
+ try:
+ talkPage.put(content, comment)
+ except pywikibot.SpamblacklistError as error:
+ pywikibot.output(color_format(
+ '{lightaqua}** SpamblacklistError while trying to '
+ 'change {0}: {1}{default}',
+ talkPage, error.url))
class WeblinkCheckerRobot(SingleSiteBot, ExistingPageBot):
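The net effect of the run() hunk above: the empty-queue branch now ends in continue, so the whole report-building body moves out of its else branch and loses one indentation level. The shape of the refactoring, with illustrative stubs:

    import time

    queue = []
    killed = True

    def report():
        pass

    # Before: the work hangs off an else branch
    while True:
        if not queue:
            if killed:
                break
            time.sleep(0.1)
        else:
            report()

    # After: early continue keeps report() at the top level
    while True:
        if not queue:
            if killed:
                break
            time.sleep(0.1)
            continue
        report()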
@@ -862,8 +840,7 @@
def __init__(self, generator, HTTPignore=None, day=7, site=True):
"""Initializer."""
- super(WeblinkCheckerRobot, self).__init__(
- generator=generator, site=site)
+ super().__init__(generator=generator, site=site)
if config.report_dead_links_on_talk:
pywikibot.log('Starting talk page thread')
@@ -885,11 +862,10 @@
page = self.current_page
text = page.get()
for url in weblinksIn(text):
- ignoreUrl = False
for ignoreR in ignorelist:
if ignoreR.match(url):
- ignoreUrl = True
- if not ignoreUrl:
+ break
+            else:  # url not ignored
# Limit the number of threads started at the same time. Each
# thread will check one page, then die.
while threading.activeCount() >= config.max_external_links:
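The for...else replaces the ignoreUrl flag: the else suite runs only when the inner loop finished without break, i.e. when no ignore pattern matched, which saves a flag assignment on every URL. The pattern in isolation, with illustrative data:

    import re

    ignorelist = [re.compile(r'https?://ignored\.example/')]
    urls = ['https://ignored.example/a', 'https://kept.example/b']

    for url in urls:
        for ignore_re in ignorelist:
            if ignore_re.match(url):
                break       # matched an ignore pattern; skip the else
        else:               # no break: check this URL
            print('checking', url)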
@@ -921,12 +897,11 @@
yield page
-def countLinkCheckThreads():
+def countLinkCheckThreads() -> int:
"""
Count LinkCheckThread threads.
@return: number of LinkCheckThread threads
- @rtype: int
"""
i = 0
for thread in threading.enumerate():
@@ -935,11 +910,10 @@
return i
-@deprecated('requests', since='20160120')
+@deprecated('requests', since='20160120', future_warning=True)
def check(url):
"""DEPRECATED: Use requests instead. Perform a check on URL."""
- c = LinkChecker(url)
- return c.check()
+ return LinkChecker(url).check()
def main(*args):
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/616214
To unsubscribe, or for help writing mail filters, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I8c601223ef31974bf78becfec06cccb276dd3bd4
Gerrit-Change-Number: 616214
Gerrit-PatchSet: 7
Gerrit-Owner: Xqt <[email protected]>
Gerrit-Reviewer: D3r1ck01 <[email protected]>
Gerrit-Reviewer: Hazard-SJ <[email protected]>
Gerrit-Reviewer: JJMC89 <[email protected]>
Gerrit-Reviewer: Matěj Suchánek <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Zhuyifei1999 <[email protected]>
Gerrit-MessageType: merged