jenkins-bot has submitted this change. (https://gerrit.wikimedia.org/r/c/pywikibot/core/+/616214)
Change subject: [4.0] remove Python 2 related code in weblinkchecker.py
......................................................................
[4.0] remove Python 2 related code in weblinkchecker.py
- also print a FutureWarning for the deprecated LinkChecker class,
  which is not used inside the script itself
- reduce nesting of flow statements in DeadLinkReportThread.run
- speed up the inner loop of treat_page
Change-Id: I8c601223ef31974bf78becfec06cccb276dd3bd4
---
M scripts/weblinkchecker.py
1 file changed, 100 insertions(+), 126 deletions(-)
Approvals:
Hazard-SJ: Looks good to me, but someone else must approve
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index 9b5e9de..8d4fd46 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -109,17 +109,32 @@
#
# Distributed under the terms of the MIT license.
#
-from __future__ import absolute_import, division, unicode_literals
-
import codecs
import datetime
+import http.client as httpclient
import pickle
import re
import socket
import threading
import time
+from contextlib import suppress
from functools import partial
+from typing import Optional, Tuple
+from urllib.parse import urlsplit
+from urllib.request import quote
+
+import requests
+
+import pywikibot
+
+from pywikibot import comms, i18n, config, pagegenerators, textlib, config2
+from pywikibot.bot import ExistingPageBot, SingleSiteBot, suggest_help
+from pywikibot.pagegenerators import (
+ XMLDumpPageGenerator as _XMLDumpPageGenerator,
+)
+from pywikibot.tools import deprecated
+from pywikibot.tools.formatter import color_format
try:
import memento_client
@@ -127,29 +142,6 @@
except ImportError as e:
memento_client = e
-import pywikibot
-
-from pywikibot import (
- comms, i18n, config, pagegenerators, textlib, config2,
-)
-
-from pywikibot.bot import ExistingPageBot, SingleSiteBot, suggest_help
-from pywikibot.pagegenerators import (
- XMLDumpPageGenerator as _XMLDumpPageGenerator,
-)
-from pywikibot.tools import deprecated, PY2
-from pywikibot.tools.formatter import color_format
-
-import requests
-
-if not PY2:
- import http.client as httplib
- import urllib.parse as urlparse
- import urllib.request as urllib
-else:
- import httplib
- import urllib
- import urlparse
docuReplacements = {'¶ms;': pagegenerators.parameterHelp} # noqa: N816
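For orientation on the consolidated imports: on Python 3 the old httplib/urlparse/urllib names live in http.client, urllib.parse and urllib.request, so the PY2 fallback block can go. A minimal sketch of the renamed modules in use; the host name and URL are illustrative only:

    import http.client
    from urllib.parse import urlsplit
    from urllib.request import quote

    conn = http.client.HTTPSConnection('example.org')     # was httplib
    parts = urlsplit('https://example.org/w?x=1#top')      # was urlparse.urlsplit
    encoded = quote('/p\u00e4th'.encode('utf-8'))          # was urllib.quote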
@@ -288,8 +280,8 @@
"""The link is not an URL."""
-@deprecated('requests', since='20160120')
-class LinkChecker(object):
+@deprecated('requests', since='20160120', future_warning=True)
+class LinkChecker:
"""
Check links.
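For readers unfamiliar with the decorator: future_warning=True makes the deprecation emit a FutureWarning, which Python displays by default, whereas a plain DeprecationWarning is normally suppressed outside __main__. A minimal sketch of the idea, not pywikibot's actual implementation (which also takes the since argument shown above):

    import warnings
    from functools import wraps

    def deprecated(replacement, future_warning=False):
        """Illustrative stand-in for pywikibot.tools.deprecated."""
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                warnings.warn(
                    '{} is deprecated; use {} instead.'.format(
                        func.__name__, replacement),
                    FutureWarning if future_warning else DeprecationWarning,
                    stacklevel=2)
                return func(*args, **kwargs)
            return wrapper
        return decorator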
@@ -342,16 +334,16 @@
def getConnection(self):
"""Get a connection."""
if self.scheme == 'http':
- return httplib.HTTPConnection(self.host)
+ return httpclient.HTTPConnection(self.host)
elif self.scheme == 'https':
- return httplib.HTTPSConnection(self.host)
+ return httpclient.HTTPSConnection(self.host)
else:
raise NotAnURLError(self.url)
def getEncodingUsedByServer(self):
"""Get encodung used by server."""
if not self.serverEncoding:
- try:
+ with suppress(Exception):
pywikibot.output(
'Contacting server %s to find out its default encoding...'
% self.host)
@@ -359,8 +351,7 @@
conn.request('HEAD', '/', None, self.header)
self.response = conn.getresponse()
self.readEncodingFromResponse(self.response)
- except Exception:
- pass
+
if not self.serverEncoding:
# TODO: We might also load a page, then check for an encoding
            # definition in an HTML meta tag.
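contextlib.suppress behaves exactly like the try/except/pass it replaces, just one nesting level flatter. The equivalence in isolation (risky() is a stand-in for the HEAD request above):

    from contextlib import suppress

    def risky():
        raise OSError('server unreachable')  # stand-in failure

    # Before: ignore the error with an explicit handler
    try:
        risky()
    except Exception:
        pass

    # After: identical semantics, less indentation
    with suppress(Exception):
        risky()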
@@ -373,20 +364,18 @@
def readEncodingFromResponse(self, response):
"""Read encoding from response."""
if not self.serverEncoding:
- try:
+ with suppress(Exception):
ct = response.getheader('Content-Type')
charsetR = re.compile('charset=(.+)')
charset = charsetR.search(ct).group(1)
self.serverEncoding = charset
- except Exception:
- pass
def changeUrl(self, url):
"""Change url."""
self.url = url
# we ignore the fragment
(self.scheme, self.host, self.path, self.query,
- self.fragment) = urlparse.urlsplit(self.url)
+ self.fragment) = urlsplit(self.url)
if not self.path:
self.path = '/'
if self.query:
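urlsplit returns its parts in exactly the order changeUrl unpacks them: scheme, netloc (host), path, query, fragment. For example:

    from urllib.parse import urlsplit

    scheme, host, path, query, fragment = urlsplit(
        'https://example.org/wiki/Foo?action=history#top')
    # scheme='https', host='example.org', path='/wiki/Foo',
    # query='action=history', fragment='top'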
@@ -399,17 +388,15 @@
self.query.encode('ascii')
except UnicodeEncodeError:
encoding = self.getEncodingUsedByServer()
- self.path = urllib.quote(self.path.encode(encoding))
- self.query = urllib.quote(self.query.encode(encoding), '=&')
+ self.path = quote(self.path.encode(encoding))
+ self.query = quote(self.query.encode(encoding), '=&')
- def resolveRedirect(self, useHEAD=False):
+ def resolveRedirect(self, useHEAD=False) -> Optional[str]:
"""
        Return the redirect target URL as a string, if it is an HTTP redirect.
If useHEAD is true, uses the HTTP HEAD method, which saves bandwidth
by not downloading the body. Otherwise, the HTTP GET method is used.
-
- @rtype: str or None
"""
conn = self.getConnection()
try:
@@ -422,7 +409,7 @@
self.response = conn.getresponse()
# read the server's encoding, in case we need it later
self.readEncodingFromResponse(self.response)
- except httplib.BadStatusLine:
+ except httpclient.BadStatusLine:
# Some servers don't seem to handle HEAD requests properly,
# e.g. http://www.radiorus.ru/ which is running on a very old
# Apache server. Using GET instead works on these (but it uses
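The surrounding logic, condensed: try HEAD first and retry as GET when the server answers with a malformed status line. A simplified sketch of the pattern; fetch_headers and make_conn are illustrative helpers, not names from the script:

    import http.client as httpclient

    def fetch_headers(make_conn, path, headers, use_head=True):
        """Try HEAD; fall back to GET for servers that mishandle HEAD."""
        conn = make_conn()  # fresh connection per attempt
        method = 'HEAD' if use_head else 'GET'
        try:
            conn.request(method, path, None, headers)
            return conn.getresponse()
        except httpclient.BadStatusLine:
            if use_head:
                return fetch_headers(make_conn, path, headers,
                                     use_head=False)
            raise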
@@ -467,18 +454,14 @@
else:
return False # not a redirect
- def check(self, useHEAD=False):
- """
- Return True and the server status message if the page is alive.
-
- @rtype: tuple of (bool, unicode)
- """
+ def check(self, useHEAD=False) -> Tuple[bool, str]:
+ """Return True and the server status message if the page is alive."""
try:
wasRedirected = self.resolveRedirect(useHEAD=useHEAD)
except UnicodeError as error:
return False, 'Encoding Error: {0} ({1})'.format(
error.__class__.__name__, error)
- except httplib.error as error:
+ except httpclient.error as error:
return False, 'HTTP Error: {}'.format(error.__class__.__name__)
except socket.error as error:
# https://docs.python.org/3/library/socket.html :
@@ -542,7 +525,7 @@
else:
try:
conn = self.getConnection()
- except httplib.error as error:
+ except httpclient.error as error:
return False, 'HTTP Error: {0}'.format(
error.__class__.__name__)
try:
@@ -573,7 +556,7 @@
def __init__(self, page, url, history, HTTPignore, day):
"""Initializer."""
- threading.Thread.__init__(self)
+ super().__init__()
self.page = page
self.url = url
self.history = history
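Zero-argument super() is the Python 3 idiom: it resolves the class and instance from the enclosing scope, so spelling out threading.Thread.__init__(self) is unnecessary. In isolation (Worker is an illustrative name):

    import threading

    class Worker(threading.Thread):

        def __init__(self, task):
            super().__init__()  # replaces threading.Thread.__init__(self)
            self.task = task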
@@ -625,7 +608,7 @@
config.weblink_dead_days)
-class History(object):
+class History:
"""
Store previously found dead links.
@@ -736,15 +719,11 @@
"""
if url in self.historyDict:
with self.semaphore:
- try:
+ with suppress(KeyError): # Not sure why this can happen
del self.historyDict[url]
- except KeyError:
- # Not sure why this can happen, but I guess we can
- # ignore this.
- pass
return True
- else:
- return False
+
+ return False
def save(self):
"""Save the .dat file to disk."""
@@ -791,65 +770,64 @@
break
else:
time.sleep(0.1)
- else:
- with self.semaphore:
- url, errorReport, containingPage, archiveURL = \
- self.queue[0]
- self.queue = self.queue[1:]
- talkPage = containingPage.toggleTalkPage()
- pywikibot.output(color_format(
- '{lightaqua}** Reporting dead link on '
- '{0}...{default}',
- talkPage.title(as_link=True)))
- try:
- content = talkPage.get() + '\n\n\n'
- if url in content:
- pywikibot.output(color_format(
- '{lightaqua}** Dead link seems to have '
- 'already been reported on {0}{default}',
- talkPage.title(as_link=True)))
- continue
- except (pywikibot.NoPage, pywikibot.IsRedirectPage):
- content = ''
+ continue
- if archiveURL:
- archiveMsg = '\n' + \
- i18n.twtranslate(
- containingPage.site,
- 'weblinkchecker-archive_msg',
- {'URL': archiveURL})
- else:
- archiveMsg = ''
- # The caption will default to "Dead link". But if there
- # is already such a caption, we'll use "Dead link 2",
- # "Dead link 3", etc.
- caption = i18n.twtranslate(containingPage.site,
- 'weblinkchecker-caption')
- i = 1
- count = ''
- # Check if there is already such a caption on
- # the talk page.
- while re.search('= *{0}{1} *='.format(caption, count),
- content) is not None:
- i += 1
- count = ' ' + str(i)
- caption += count
- content += '== {0} ==\n\n{1}\n\n{2}{3}\n--~~~~'.format(
- caption, i18n.twtranslate(containingPage.site,
- 'weblinkchecker-report'),
- errorReport, archiveMsg)
-
- comment = '[[{0}#{1}|→]] {2}'.format(
- talkPage.title(), caption,
- i18n.twtranslate(containingPage.site,
- 'weblinkchecker-summary'))
- try:
- talkPage.put(content, comment)
- except pywikibot.SpamblacklistError as error:
+ with self.semaphore:
+ url, errorReport, containingPage, archiveURL = self.queue[0]
+ self.queue = self.queue[1:]
+ talkPage = containingPage.toggleTalkPage()
+ pywikibot.output(color_format(
+ '{lightaqua}** Reporting dead link on {}...{default}',
+ talkPage))
+ try:
+ content = talkPage.get() + '\n\n\n'
+ if url in content:
pywikibot.output(color_format(
- '{lightaqua}** SpamblacklistError while trying to '
- 'change {0}: {1}{default}',
- talkPage.title(as_link=True), error.url))
+ '{lightaqua}** Dead link seems to have '
+ 'already been reported on {}{default}',
+ talkPage))
+ continue
+ except (pywikibot.NoPage, pywikibot.IsRedirectPage):
+ content = ''
+
+ if archiveURL:
+ archiveMsg = '\n' + \
+ i18n.twtranslate(
+ containingPage.site,
+ 'weblinkchecker-archive_msg',
+ {'URL': archiveURL})
+ else:
+ archiveMsg = ''
+ # The caption will default to "Dead link". But if there
+ # is already such a caption, we'll use "Dead link 2",
+ # "Dead link 3", etc.
+ caption = i18n.twtranslate(containingPage.site,
+ 'weblinkchecker-caption')
+ i = 1
+ count = ''
+ # Check if there is already such a caption on
+ # the talk page.
+ while re.search('= *{0}{1} *='
+ .format(caption, count), content) is not None:
+ i += 1
+ count = ' ' + str(i)
+ caption += count
+ content += '== {0} ==\n\n{3}\n\n{1}{2}\n--~~~~'.format(
+ caption, errorReport, archiveMsg,
+ i18n.twtranslate(containingPage.site,
+ 'weblinkchecker-report'))
+
+ comment = '[[{0}#{1}|→]] {2}'.format(
+ talkPage.title(), caption,
+ i18n.twtranslate(containingPage.site,
+ 'weblinkchecker-summary'))
+ try:
+ talkPage.put(content, comment)
+ except pywikibot.SpamblacklistError as error:
+ pywikibot.output(color_format(
+ '{lightaqua}** SpamblacklistError while trying to '
+ 'change {0}: {1}{default}',
+ talkPage, error.url))
class WeblinkCheckerRobot(SingleSiteBot, ExistingPageBot):
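The net effect of the run() hunk above: the empty-queue branch now ends in continue, so the whole report-building body moves out of its else branch and loses one indentation level. The shape of the refactoring, with illustrative stubs:

    import time

    queue = []
    killed = True

    def report():
        pass

    # Before: the work hangs off an else branch
    while True:
        if not queue:
            if killed:
                break
            time.sleep(0.1)
        else:
            report()

    # After: early continue keeps report() at the top level
    while True:
        if not queue:
            if killed:
                break
            time.sleep(0.1)
            continue
        report()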
@@ -862,8 +840,7 @@
def __init__(self, generator, HTTPignore=None, day=7, site=True):
"""Initializer."""
- super(WeblinkCheckerRobot, self).__init__(
- generator=generator, site=site)
+ super().__init__(generator=generator, site=site)
if config.report_dead_links_on_talk:
pywikibot.log('Starting talk page thread')
@@ -885,11 +862,10 @@
page = self.current_page
text = page.get()
for url in weblinksIn(text):
- ignoreUrl = False
for ignoreR in ignorelist:
if ignoreR.match(url):
- ignoreUrl = True
- if not ignoreUrl:
+ break
+            else:  # url not ignored
# Limit the number of threads started at the same time. Each
# thread will check one page, then die.
while threading.activeCount() >= config.max_external_links:
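The for...else replaces the ignoreUrl flag: the else suite runs only when the inner loop finished without break, i.e. when no ignore pattern matched, which saves a flag assignment on every URL. The pattern in isolation, with illustrative data:

    import re

    ignorelist = [re.compile(r'https?://ignored\.example/')]
    urls = ['https://ignored.example/a', 'https://kept.example/b']

    for url in urls:
        for ignore_re in ignorelist:
            if ignore_re.match(url):
                break       # matched an ignore pattern; skip the else
        else:               # no break: check this URL
            print('checking', url)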
@@ -921,12 +897,11 @@
yield page
-def countLinkCheckThreads():
+def countLinkCheckThreads() -> int:
"""
Count LinkCheckThread threads.
@return: number of LinkCheckThread threads
- @rtype: int
"""
i = 0
for thread in threading.enumerate():
@@ -935,11 +910,10 @@
return i
-@deprecated('requests', since='20160120')
+@deprecated('requests', since='20160120', future_warning=True)
def check(url):
"""DEPRECATED: Use requests instead. Perform a check on URL."""
- c = LinkChecker(url)
- return c.check()
+ return LinkChecker(url).check()
def main(*args):
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/616214
To unsubscribe, or for help writing mail filters, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I8c601223ef31974bf78becfec06cccb276dd3bd4
Gerrit-Change-Number: 616214
Gerrit-PatchSet: 7
Gerrit-Owner: Xqt <[email protected]>
Gerrit-Reviewer: D3r1ck01 <[email protected]>
Gerrit-Reviewer: Hazard-SJ <[email protected]>
Gerrit-Reviewer: JJMC89 <[email protected]>
Gerrit-Reviewer: Matěj Suchánek <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Zhuyifei1999 <[email protected]>
Gerrit-MessageType: merged