jenkins-bot has submitted this change. (
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/632195 )
Change subject: [bugfix] decode byte-like object meta_content.group() in
reflinks.py
......................................................................
[bugfix] decode byte-like object meta_content.group() in reflinks.py
- meta_content.group() is bytes but str is needed for the remaining
code; decode it.
- Derive ReferencesRobot from SingleSiteBot
- setup dead_links in setup() method
- use new opt options
- simplify format strings
- use new opt options in noreferences.py
Bug: T264575
Change-Id: Ib076abaa58a963634d9ee3d31cd2f05ed878a50b
---
M scripts/noreferences.py
M scripts/reflinks.py
2 files changed, 45 insertions(+), 43 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/noreferences.py b/scripts/noreferences.py
index 8e93134..253fdbc 100755
--- a/scripts/noreferences.py
+++ b/scripts/noreferences.py
@@ -496,7 +496,7 @@
def __init__(self, generator, **kwargs) -> None:
"""Initializer."""
- self.availableOptions.update({
+ self.available_options.update({
'verbose': True,
})
super().__init__(**kwargs)
@@ -524,24 +524,24 @@
oldTextCleaned = textlib.removeDisabledParts(text)
if self.referencesR.search(oldTextCleaned) or \
self.referencesTagR.search(oldTextCleaned):
- if self.getOption('verbose'):
+ if self.opt.verbose:
pywikibot.output('No changes necessary: references tag found.')
return False
if self.referencesTemplates:
templateR = '{{(' + '|'.join(self.referencesTemplates) + ')'
if re.search(templateR, oldTextCleaned, re.IGNORECASE):
- if self.getOption('verbose'):
+ if self.opt.verbose:
pywikibot.output(
'No changes necessary: references template found.')
return False
if not self.refR.search(oldTextCleaned):
- if self.getOption('verbose'):
+ if self.opt.verbose:
pywikibot.output('No changes necessary: no ref tags found.')
return False
- if self.getOption('verbose'):
+ if self.opt.verbose:
pywikibot.output('Found ref without references.')
return True
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 76ae2ff..da863b7 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -57,7 +57,8 @@
import pywikibot
-from pywikibot import comms, i18n, pagegenerators, textlib, Bot
+from pywikibot import comms, i18n, pagegenerators, textlib
+from pywikibot.bot import SingleSiteBot
from pywikibot import config2 as config
from pywikibot.pagegenerators import (
XMLDumpPageGenerator as _XMLDumpPageGenerator,
@@ -350,7 +351,7 @@
id += 1
for (g, d) in found_refs.items():
if g:
- group = 'group=\"{0}\" '.format(group)
+ group = 'group=\"{}\" '.format(group)
else:
group = ''
@@ -359,11 +360,11 @@
continue
name = v[0]
if not name:
- name = '"{0}{1}"'.format(self.autogen, id)
+ name = '"{}{}"'.format(self.autogen, id)
id += 1
elif v[2]:
- name = '"{0}"'.format(name)
- named = '<ref {0}name={1}>{2}</ref>'.format(group, name, k)
+ name = '"{}"'.format(name)
+ named = '<ref {}name={}>{}</ref>'.format(group, name, k)
text = text.replace(v[1][0], named, 1)
# make sure that the first (named ref) is not
@@ -372,7 +373,7 @@
header = text[:pos]
end = text[pos:]
- unnamed = '<ref {0}name={1} />'.format(group, name)
+ unnamed = '<ref {}name={} />'.format(group, name)
for ref in v[1][1:]:
end = end.replace(ref, unnamed)
text = header + end
@@ -381,7 +382,7 @@
# TODO : Support ref groups
name = v[0]
if v[1]:
- name = '"{0}"'.format(name)
+ name = '"{}"'.format(name)
text = re.sub(
'<ref name\\s*=\\s*(?P<quote>"?)\\s*{}\\s*(?P=quote)\\s*/>'
.format(k),
@@ -389,21 +390,19 @@
return text
-class ReferencesRobot(Bot):
+class ReferencesRobot(SingleSiteBot):
"""References bot."""
- def __init__(self, generator, **kwargs):
+ def __init__(self, **kwargs):
"""- generator : Page generator."""
- self.availableOptions.update({
+ self.available_options.update({
'ignorepdf': False, # boolean
- 'limit': None, # int, stop after n modified pages
- 'summary': None,
+ 'limit': 0, # int, stop after n modified pages
+ 'summary': '',
})
super().__init__(**kwargs)
- self.generator = generator
- self.site = pywikibot.Site()
self._use_fake_user_agent = config.fake_user_agent_default.get(
'reflinks', False)
# Check
@@ -414,11 +413,11 @@
code = alt
break
if code:
- manual += '/{0}'.format(code)
- if self.getOption('summary') is None:
+ manual += '/{}'.format(code)
+ if not self.opt.summary:
self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
else:
- self.msg = self.getOption('summary')
+ self.msg = self.opt.summary
local = i18n.translate(self.site, badtitles)
if local:
@@ -435,7 +434,7 @@
if self.stop_page.exists():
self.stop_page_rev_id = self.stop_page.latest_revision_id
else:
- pywikibot.warning('The stop page {0} does not exist'
+ pywikibot.warning('The stop page {} does not exist'
.format(self.stop_page.title(as_link=True)))
# Regex to grasp content-type meta HTML tag in HTML source
@@ -493,10 +492,11 @@
urlobj.close()
os.unlink(infile)
- def run(self):
- """Run the Bot."""
+ def setup(self):
+ """Read dead links from file."""
try:
- dead_links = codecs.open(listof404pages, 'r', 'latin_1').read()
+ self.dead_links = codecs.open(
+ listof404pages, 'r', 'latin_1').read()
except IOError:
raise NotImplementedError(
'404-links.txt is required for reflinks.py\n'
@@ -504,6 +504,8 @@
'http://www.twoevils.org/files/wikipedia/404-links.txt.gz\n'
'and to unzip it in the same directory')
+ def run(self):
+ """Run the Bot."""
editedpages = 0
for page in self.generator:
try:
@@ -543,7 +545,7 @@
content_type = f.response_headers.get('content-type')
if content_type and not self.MIME.search(content_type):
if ref.link.lower().endswith('.pdf') and \
- not self.getOption('ignorepdf'):
+ not self.opt.ignorepdf:
# If file has a PDF suffix
self.getPDFTitle(ref, f)
else:
@@ -584,15 +586,15 @@
continue
if f.status != codes.ok:
- pywikibot.output('HTTP error ({0}) for {1} on {2}'
+ pywikibot.output('HTTP error ({}) for {} on {}'
.format(f.status, ref.url,
page.title(as_link=True)),
toStdout=True)
# 410 Gone, indicates that the resource has been
# purposely removed
- if f.status == 410 or \
- (f.status == 404 and ('\t{}\t'.format(ref.url)
- in dead_links)):
+ if f.status == 410 \
+ or (f.status == 404 and ('\t{}\t'.format(ref.url)
+ in self.dead_links)):
repl = ref.refDead()
new_text = new_text.replace(match.group(), repl)
continue
@@ -612,7 +614,7 @@
httplib.error,
pywikibot.FatalServerError,
pywikibot.Server504Error) as e:
- pywikibot.output("Can't retrieve page {0} : {1}"
+ pywikibot.output("Can't retrieve page {} : {}"
.format(ref.url, e))
continue
@@ -626,15 +628,15 @@
# use charset from http header
s = self.CHARSET.search(content_type)
if meta_content:
- tag = meta_content.group()
+ tag = meta_content.group().decode()
# Prefer the contentType from the HTTP header :
if not content_type:
content_type = tag
if not s:
# use charset from html
- s = self.CHARSET.search(str(tag))
+ s = self.CHARSET.search(tag)
if s:
- tmp = s.group('enc').strip("\"' ").lower()
+ tmp = s.group('enc').strip('"\' ').lower()
naked = re.sub(r'[ _\-]', '', tmp)
# Convert to python correct encoding names
if naked == 'gb2312':
@@ -648,10 +650,12 @@
enc.append(tmp)
else:
pywikibot.output('No charset found for ' + ref.link)
+
if not content_type:
pywikibot.output('No content-type found for ' + ref.link)
continue
- elif not self.MIME.search(content_type):
+
+ if not self.MIME.search(content_type):
pywikibot.output(color_format(
'{lightyellow}WARNING{default} : media : {0} ',
ref.link))
@@ -696,15 +700,14 @@
if not ref.title:
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
- pywikibot.output('{0} : No title found...'
- .format(ref.link))
+ pywikibot.output('{} : No title found...'.format(ref.link))
continue
# XXX Ugly hack
if 'é' in ref.title:
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
- pywikibot.output('{0} : Hybrid encoding...'
+ pywikibot.output('{} : Hybrid encoding...'
.format(ref.link))
continue
@@ -740,10 +743,9 @@
else:
editedpages += 1
- if self.getOption('limit') \
- and editedpages >= self.getOption('limit'):
+ if self.opt.limit and editedpages >= self.opt.limit:
pywikibot.output('Edited {} pages, stopping.'
- .format(self.getOption('limit')))
+ .format(self.opt.limit))
return
if self.site_stop_page and editedpages % 20 == 0:
@@ -810,7 +812,7 @@
if not gen_factory.nopreload:
generator = pagegenerators.PreloadingGenerator(generator)
generator = pagegenerators.RedirectFilterPageGenerator(generator)
- bot = ReferencesRobot(generator, **options)
+ bot = ReferencesRobot(generator=generator, **options)
bot.run()
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/632195
To unsubscribe, or for help writing mail filters, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ib076abaa58a963634d9ee3d31cd2f05ed878a50b
Gerrit-Change-Number: 632195
Gerrit-PatchSet: 3
Gerrit-Owner: Xqt <[email protected]>
Gerrit-Reviewer: D3r1ck01 <[email protected]>
Gerrit-Reviewer: Rubin <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
_______________________________________________
Pywikibot-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits