Strainu has submitted this change and it was merged.
Change subject: Ported reflinks.py to core
......................................................................
Ported reflinks.py to core
Bug: 56900
Change-Id: I7355849f997c8615fac2e77df6d9cbfc9c5d1e19
---
M pywikibot/__init__.py
M pywikibot/page.py
A scripts/reflinks.py
3 files changed, 868 insertions(+), 1 deletion(-)
Approvals:
Strainu: Verified; Looks good to me, approved
jenkins-bot: Checked
diff --git a/pywikibot/__init__.py b/pywikibot/__init__.py
index 8f53758..13b3264 100644
--- a/pywikibot/__init__.py
+++ b/pywikibot/__init__.py
@@ -311,7 +311,7 @@
from page import Page, ImagePage, Category, Link, User, ItemPage, PropertyPage, Claim
-from page import html2unicode, url2unicode
+from page import html2unicode, url2unicode, unicode2html
link_regex = re.compile(r'\[\[(?P<title>[^\]|[<>{}]*)(\|.*?)?\]\]')
diff --git a/pywikibot/page.py b/pywikibot/page.py
index 9307f86..a533304 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -3486,6 +3486,22 @@
return result
+def unicode2html(x, encoding):
+ """
+Ensure unicode string is encodable, or else convert to ASCII for HTML.
+
+Arguments are a unicode string and an encoding. Attempt to encode the
+string into the desired format; if that doesn't work, encode the unicode
+into html &#; entities. If it does work, return it unchanged.
+
+"""
+ try:
+ x.encode(encoding)
+ except UnicodeError:
+ x = UnicodeToAsciiHtml(x)
+ return x
+
+
def url2unicode(title, site, site2=None):
"""Convert url-encoded text to unicode using site's encoding.
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
new file mode 100644
index 0000000..6a024cd
--- /dev/null
+++ b/scripts/reflinks.py
@@ -0,0 +1,851 @@
+# -*- coding: utf-8 -*-
+"""
+This bot will search for references which are only made of a link without title,
+(i.e. <ref>[http://www.google.fr/]</ref> or <ref>http://www.google.fr/</ref>)
+and will fetch the html title from the link to use it as the title of the wiki
+link in the reference, i.e.
+<ref>[http://www.google.fr/search?q=test test - Google Search]</ref>
+
+The bot checks every 20 edits a special stop page : if the page has been edited,
+it stops.
+
+DumZiBoT is running that script on en: & fr: at every new dump, running it on
+de: is not allowed anymore.
+
+As it uses it, you need to configure noreferences.py for your wiki, or it will
+not work.
+
+pdfinfo is needed for parsing pdf titles.
+
+See [[:en:User:DumZiBoT/refLinks]] for more information on the bot.
+
+&params;
+
+-limit:n Stops after n edits
+
+-xml:dump.xml Should be used instead of a simple page fetching method from
+ pagegenerators.py for performance and load issues
+
+-xmlstart Page to start with when using an XML dump
+
+-ignorepdf Do not handle PDF files (handy if you use Windows and can't
+ get pdfinfo)
+"""
+# (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ )
+# (C) Pywikipedia bot team, 2008-2013
+#
+# Distributed under the terms of the GPL
+#
+__version__ = '$Id$'
+#
+
+import sys
+import re
+import urllib2
+import httplib
+import socket
+import codecs
+import ftplib
+import subprocess
+import tempfile
+import os
+import gzip
+import StringIO
+
+import pywikibot
+from pywikibot import pagegenerators
+import noreferences
+from pywikibot import i18n
+
+docuReplacements = {
+ '&params;': pagegenerators.parameterHelp
+}
+
+localized_msg = ('fr', ) # localized message at mediawik
+
+# localized message at specific wikipedia site
+# should be moved to mediawiki pywikibot manual
+L10N_msg = {
+ 'it': u'Utente:Marco27Bot/refLinks.py',
+ 'pl': u'Wikipedysta:MastiBot/refLinks',
+}
+
+
+stopPage = {
+ 'fr': u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
+ 'da': u'Bruger:DumZiBoT/EditThisPageToStopMe',
+ 'de': u'Benutzer:DumZiBoT/EditThisPageToStopMe',
+ 'fa': u'کاربر:Amirobot/EditThisPageToStopMe',
+ 'it': u'Utente:Marco27Bot/EditThisPageToStopMe',
+ 'ko': u'사용자:GrassnBreadRefBot/EditThisPageToStopMe1',
+ 'he': u'User:Matanyabot/EditThisPageToStopMe',
+ 'hu': u'User:Damibot/EditThisPageToStopMe',
+ 'en': u'User:DumZiBoT/EditThisPageToStopMe',
+ 'pl': u'Wikipedysta:MastiBot/EditThisPageToStopMe',
+ 'ru': u'User:Rubinbot/EditThisPageToStopMe',
+ 'zh': u'User:Sz-iwbot',
+}
+
+deadLinkTag = {
+ 'fr': u'[%s] {{lien mort}}',
+ 'da': u'[%s] {{dødt link}}',
+ 'de': u'',
+ 'fa': u'[%s] {{پیوند مرده}}',
+ 'he': u'{{קישור שבור}}',
+ 'hu': u'[%s] {{halott link}}',
+ 'ko': u'[%s] {{죽은 바깥 고리}}',
+ 'es': u'{{enlace roto2|%s}}',
+ 'it': u'{{Collegamento interrotto|%s}}',
+ 'en': u'[%s] {{dead link}}',
+ 'pl': u'[%s] {{Martwy link}}',
+ 'ru': u'[%s] {{subst:dead}}',
+}
+
+
+soft404 = re.compile(
+ ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog',
+ re.IGNORECASE)
+# matches an URL at the index of a website
+dirIndex = re.compile(
+ ur'^\w+://[^/]+/((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$',
+ re.IGNORECASE)
+# Extracts the domain name
+domain = re.compile(ur'^(\w+)://(?:www.|)([^/]+)')
+
+globalbadtitles = """
+# is
+(test|
+# starts with
+ ^\W*(
+ register
+ |registration
+ |(sign|log)[ \-]?in
+ |subscribe
+ |sign[ \-]?up
+ |log[ \-]?on
+ |untitled[ ]?(document|page|\d+|$)
+ |404[ ]
+ ).*
+# anywhere
+ |.*(
+ 403[ ]forbidden
+ |(404|page|file|information|resource).*not([ ]*be)?[ ]*(available|found)
+ |site.*disabled
+ |error[ ]404
+ |error.+not[ ]found
+ |not[ ]found.+error
+ |404[ ]error
+ |\D404\D
+ |check[ ]browser[ ]settings
+ |log[ \-]?(on|in)[ ]to
+ |site[ ]redirection
+ ).*
+# ends with
+ |.*(
+ register
+ |registration
+ |(sign|log)[ \-]?in
+ |subscribe|sign[ \-]?up
+ |log[ \-]?on
+ )\W*$
+)
+"""
+# Language-specific bad titles
+badtitles = {
+ 'en': '',
+ 'fr': '.*(404|page|site).*en +travaux.*',
+ 'es': '.*sitio.*no +disponible.*',
+ 'it': '((pagina|sito) (non trovata|inesistente)|accedi)',
+ 'ru': u'.*(Страница|страница).*(не[ ]*найдена|осутствует).*',
+}
+
+# Regex that match bare references
+linksInRef = re.compile(
+ # bracketed URLs
+ ur'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https|ftp)://(?:' +
+ # unbracketed with()
+ ur'^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|' +
+ # unbracketed without ()
+ ur'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))[!?,\s]*\]?\s*</ref>')
+
+# Download this file :
+# http://www.twoevils.org/files/wikipedia/404-links.txt.gz
+# ( maintained by User:Dispenser )
+listof404pages = '404-links.txt'
+
+
+class XmlDumpPageGenerator:
+ """Xml generator that yiels pages containing bare references"""
+
+ def __init__(self, xmlFilename, xmlStart, namespaces):
+ self.xmlStart = xmlStart
+ self.namespaces = namespaces
+ self.skipping = bool(xmlStart)
+ self.site = pywikibot.getSite()
+
+ import xmlreader
+ dump = xmlreader.XmlDump(xmlFilename)
+ self.parser = dump.parse()
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ while True:
+ try:
+ entry = self.parser.next()
+ except StopIteration:
+ raise
+ if self.skipping:
+ if entry.title != self.xmlStart:
+ continue
+ self.skipping = False
+ page = pywikibot.Page(self.site, entry.title)
+ if not self.namespaces == []:
+ if page.namespace() not in self.namespaces:
+ continue
+ if linksInRef.search(entry.text):
+ return page
+
+
+class RefLink:
+ """Container to handle a single bare reference"""
+
+ def __init__(self, link, name):
+ self.refname = name
+ self.link = link
+ self.site = pywikibot.getSite()
+ self.linkComment = i18n.twtranslate(self.site, 'reflinks-comment')
+ self.url = re.sub(u'#.*', '', self.link)
+ self.title = None
+
+ def refTitle(self):
+ """Returns the <ref> with its new title"""
+ return '<ref%s>[%s %s<!-- %s -->]</ref>' % (self.refname, self.link,
+ self.title,
+ self.linkComment)
+
+ def refLink(self):
+ """No title has been found, return the unbracketed link"""
+ return '<ref%s>%s</ref>' % (self.refname, self.link)
+
+ def refDead(self):
+ """Dead link, tag it with a {{dead link}}"""
+ tag = pywikibot.translate(self.site, deadLinkTag) % self.link
+ return '<ref%s>%s</ref>' % (self.refname, tag)
+
+ def transform(self, ispdf=False):
+ """Normalize the title"""
+ #convert html entities
+ if not ispdf:
+ self.title = pywikibot.html2unicode(self.title)
+ self.title = re.sub(r'-+', '-', self.title)
+ #remove formatting, i.e long useless strings
+ self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
+ #remove \n and \r and Unicode spaces from titles
+ self.title = re.sub(r'(?u)\s', ' ', self.title)
+ self.title = re.sub(r'[\n\r\t]', ' ', self.title)
+ #remove extra whitespaces
+ #remove leading and trailing ./;/,/-/_/+/ /
+ self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))
+
+ self.avoid_uppercase()
+ #avoid closing the link before the end
+ self.title = self.title.replace(']', '&#93;')
+ #avoid multiple } being interpreted as a template inclusion
+ self.title = self.title.replace('}}', '}&#125;')
+ #prevent multiple quotes being interpreted as '' or '''
+ self.title = self.title.replace('\'\'', '\'&#39;')
+ self.title = pywikibot.unicode2html(self.title, self.site.encoding())
+ # TODO : remove HTML when both opening and closing tags are included
+
+ def avoid_uppercase(self):
+ """ If title has more than 6 characters and has 60% of uppercase
+ characters, capitalize() it
+
+ """
+ if len(self.title) <= 6:
+ return
+ nb_upper = 0
+ nb_letter = 0
+ for letter in self.title:
+ if letter.isupper():
+ nb_upper += 1
+ if letter.isalpha():
+ nb_letter += 1
+ if letter.isdigit():
+ return
+ if float(nb_upper) / (nb_letter + 1) > .70:
+ self.title = self.title.title()
+
+
+class DuplicateReferences:
+ """ When some references are duplicated in an article,
+ name the first, and remove the content of the others
+
+ """
+ def __init__(self):
+ # Match references
+ self.REFS = re.compile(
+ u'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
+ self.NAMES = re.compile(
+ u'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*')
+ self.GROUPS = re.compile(
+ u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+)\s*(?P=quote).*')
+ self.autogen = i18n.twtranslate(pywikibot.getSite(), 'reflinks-autogen')
+
+ def process(self, text):
+ # keys are ref groups
+ # values are a dict where :
+ # keys are ref content
+ # values are [name, [list of full ref matches],
+ # quoted, need_to_change]
+ foundRefs = {}
+ foundRefNames = {}
+ # Replace key by [value, quoted]
+ namedRepl = {}
+
+ for match in self.REFS.finditer(text):
+ content = match.group('content')
+ if not content.strip():
+ continue
+
+ params = match.group('params')
+ group = self.GROUPS.match(params)
+ if not group in foundRefs:
+ foundRefs[group] = {}
+
+ groupdict = foundRefs[group]
+ if content in groupdict:
+ v = groupdict[content]
+ v[1].append(match.group())
+ else:
+ v = [None, [match.group()], False, False]
+ name = self.NAMES.match(params)
+ if name:
+ quoted = name.group('quote') == '"'
+ name = name.group('name')
+ if v[0]:
+ if v[0] != name:
+ namedRepl[name] = [v[0], v[2]]
+ else:
+ # First name associated with this content
+
+ if name == 'population':
+ pywikibot.output(content)
+ if not name in foundRefNames:
+ # first time ever we meet this name
+ if name == 'population':
+ print "in"
+ v[2] = quoted
+ v[0] = name
+ else:
+ # if has_key, means that this name is used
+ # with another content. We'll need to change it
+ v[3] = True
+ foundRefNames[name] = 1
+ groupdict[content] = v
+
+ id = 1
+ while self.autogen + str(id) in foundRefNames:
+ id += 1
+ for (g, d) in foundRefs.iteritems():
+ if g:
+ group = u"group=\"%s\" " % group
+ else:
+ group = u""
+
+ for (k, v) in d.iteritems():
+ if len(v[1]) == 1 and not v[3]:
+ continue
+ name = v[0]
+ if not name:
+ name = self.autogen + str(id)
+ id += 1
+ elif v[2]:
+ name = u'"%s"' % name
+ named = u'<ref %sname=%s>%s</ref>' % (group, name, k)
+ text = text.replace(v[1][0], named, 1)
+
+ # make sure that the first (named ref) is not
+ # removed later :
+ pos = text.index(named) + len(named)
+ header = text[:pos]
+ end = text[pos:]
+
+ unnamed = u'<ref %sname=%s />' % (group, name)
+ for ref in v[1][1:]:
+ end = end.replace(ref, unnamed)
+ text = header + end
+
+ for (k, v) in namedRepl.iteritems():
+ # TODO : Support ref groups
+ name = v[0]
+ if v[1]:
+ name = u'"%s"' % name
+ text = re.sub(
+ u'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k,
+ u'<ref name=%s />' % name, text)
+ return text
+
+
+class ReferencesRobot:
+
+ def __init__(self, generator, acceptall=False, limit=None, ignorepdf=False):
+ """
+ - generator : Page generator
+ - acceptall : boolean, is -always on ?
+ - limit : int, stop after n modified pages
+ - ignorepdf : boolean
+
+ """
+ self.generator = generator
+ self.acceptall = acceptall
+ self.limit = limit
+ self.ignorepdf = ignorepdf
+ self.site = pywikibot.getSite()
+ # Check
+ manual = 'mw:Manual:Pywikibot/refLinks'
+ if self.site.family.name == 'wikipedia':
+ manual = pywikibot.translate(self.site.code, manual)
+ else:
+ code = None
+ for alt in [self.site.code] + i18n._altlang(self.site.code):
+ if alt in localized_msg:
+ code = alt
+ break
+ if code:
+ manual += '/%s' % code
+ self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
+ self.stopPage = pywikibot.Page(self.site,
+ pywikibot.translate(self.site, stopPage))
+
+ local = pywikibot.translate(self.site, badtitles)
+ if local:
+ bad = '(' + globalbadtitles + '|' + local + ')'
+ else:
+ bad = globalbadtitles
+ self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
+ self.norefbot = noreferences.NoReferencesBot(None)
+ self.deduplicator = DuplicateReferences()
+ try:
+ self.stopPageRevId = self.stopPage.latestRevision()
+ except pywikibot.NoPage:
+ pywikibot.output(u'The stop page %s does not exist'
+ % self.stopPage.title(asLink=True))
+ raise
+
+ # Regex to grasp content-type meta HTML tag in HTML source
+ self.META_CONTENT = re.compile(ur'(?i)<meta[^>]*content\-type[^>]*>')
+ # Extract the encoding from a charset property (from content-type !)
+ self.CHARSET = re.compile(ur'(?i)charset\s*=\s*(?P<enc>[^\'";>/]*)')
+ # Extract html title from page
+ self.TITLE = re.compile(ur'(?is)(?<=<title>).*?(?=</title>)')
+ # Matches content inside <script>/<style>/HTML comments
+ self.NON_HTML = re.compile(
+ ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
+
+ # Authorized mime types for HTML pages
+ self.MIME = re.compile(
+ ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
+
+ def put_page(self, page, new):
+ """ Prints diffs between orginal and new (text), puts new text for page
+
+ """
+ pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+ % page.title())
+ pywikibot.showDiff(page.get(), new)
+ if not self.acceptall:
+ choice = pywikibot.inputChoice(u'Do you want to accept ' +
+ u'these changes?',
+ ['Yes', 'No', 'All'],
+ ['y', 'N', 'a'], 'N')
+ if choice == 'a':
+ self.acceptall = True
+ if choice == 'y':
+ page.put_async(new, self.msg)
+ if self.acceptall:
+ try:
+ page.put(new, self.msg)
+ except pywikibot.EditConflict:
+ pywikibot.output(u'Skipping %s because of edit conflict'
+ % (page.title(),))
+ except pywikibot.SpamfilterError, e:
+ pywikibot.output(
+ u'Cannot change %s because of blacklist entry %s'
+ % (page.title(), e.url))
+ except pywikibot.PageNotSaved, error:
+ pywikibot.error(u'putting page: %s' % (error.args,))
+ except pywikibot.LockedPage:
+ pywikibot.output(u'Skipping %s (locked page)'
+ % (page.title(),))
+ except pywikibot.ServerError, e:
+ pywikibot.output(u'Server Error : %s' % e)
+
+ def httpError(self, err_num, link, pagetitleaslink):
+ """Log HTTP Error"""
+ pywikibot.output(u'HTTP error (%s) for %s on %s'
+ % (err_num, link, pagetitleaslink), toStdout=True)
+
+ def getPDFTitle(self, ref, f):
+ """ Use pdfinfo to retrieve title from a PDF.
+ Unix-only, I'm afraid.
+
+ """
+ pywikibot.output(u'PDF file.')
+ fd, infile = tempfile.mkstemp()
+ urlobj = os.fdopen(fd, 'r+w')
+ urlobj.write(f.read())
+ try:
+ pdfinfo_out = subprocess.Popen([r"pdfinfo", "/dev/stdin"],
+ stdin=urlobj, stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ shell=False).communicate()[0]
+ for aline in pdfinfo_out.splitlines():
+ if aline.lower().startswith('title'):
+ ref.title = aline.split(None)[1:]
+ ref.title = ' '.join(ref.title)
+ if ref.title != '':
+ pywikibot.output(u'title: %s' % ref.title)
+ pywikibot.output(u'PDF done.')
+ except ValueError:
+ pywikibot.output(u'pdfinfo value error.')
+ except OSError:
+ pywikibot.output(u'pdfinfo OS error.')
+ except: # Ignore errors
+ pywikibot.output(u'PDF processing error.')
+ pass
+ finally:
+ urlobj.close()
+ os.unlink(infile)
+
+ def run(self):
+ """ Runs the Bot """
+ try:
+ deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
+ except IOError:
+ pywikibot.output(
+ 'You need to download '
+ 'http://www.twoevils.org/files/wikipedia/404-links.txt.gz '
+ 'and to ungzip it in the same directory')
+ raise
+ socket.setdefaulttimeout(30)
+ editedpages = 0
+ for page in self.generator:
+ try:
+ # Load the page's text from the wiki
+ new_text = page.get()
+ if not page.canBeEdited():
+ pywikibot.output(u"You can't edit page %s"
+ % page.title(asLink=True))
+ continue
+ except pywikibot.NoPage:
+ pywikibot.output(u'Page %s not found' % page.title(asLink=True))
+ continue
+ except pywikibot.IsRedirectPage:
+ pywikibot.output(u'Page %s is a redirect'
+ % page.title(asLink=True))
+ continue
+
+ # for each link to change
+ for match in linksInRef.finditer(
+ pywikibot.removeDisabledParts(page.get())):
+
+ link = match.group(u'url')
+ #debugging purpose
+ #print link
+ if u'jstor.org' in link:
+ #TODO: Clean URL blacklist
+ continue
+
+ ref = RefLink(link, match.group('name'))
+ f = None
+ try:
+ socket.setdefaulttimeout(20)
+ try:
+ f = urllib2.urlopen(ref.url.decode("utf8"))
+ except UnicodeError:
+ ref.url = urllib2.quote(ref.url.encode("utf8"), "://")
+ f = urllib2.urlopen(ref.url)
+ #Try to get Content-Type from server
+ headers = f.info()
+ contentType = headers.getheader('Content-Type')
+ if contentType and not self.MIME.search(contentType):
+ if ref.link.lower().endswith('.pdf') and \
+ not self.ignorepdf:
+ # If file has a PDF suffix
+ self.getPDFTitle(ref, f)
+ else:
+ pywikibot.output(
+ u'\03{lightyellow}WARNING\03{default} : '
+ u'media : %s ' % ref.link)
+ if ref.title:
+ if not re.match(
+ u'(?i) *microsoft (word|excel|visio)',
+ ref.title):
+ ref.transform(ispdf=True)
+ repl = ref.refTitle()
+ else:
+ pywikibot.output(
+ u'\03{lightyellow}WARNING\03{default} : '
+ u'PDF title blacklisted : %s ' % ref.title)
+ repl = ref.refLink()
+ else:
+ repl = ref.refLink()
+ new_text = new_text.replace(match.group(), repl)
+ continue
+ # Get the real url where we end (http redirects !)
+ redir = f.geturl()
+ if redir != ref.link and \
+ domain.findall(redir) == domain.findall(link):
+ if soft404.search(redir) and \
+ not soft404.search(ref.link):
+ pywikibot.output(
+ u'\03{lightyellow}WARNING\03{default} : '
+ u'Redirect 404 : %s ' % ref.link)
+ continue
+ if dirIndex.match(redir) and \
+ not dirIndex.match(ref.link):
+ pywikibot.output(
+ u'\03{lightyellow}WARNING\03{default} : '
+ u'Redirect to root : %s ' % ref.link)
+ continue
+
+ # uncompress if necessary
+ if headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
+ # XXX: small issue here: the whole page is downloaded
+ # through f.read(). It might fetch big files/pages.
+ # However, truncating an encoded gzipped stream is not
+ # an option, for unzipping will fail.
+ compressed = StringIO.StringIO(f.read())
+ f = gzip.GzipFile(fileobj=compressed)
+
+ # Read the first 1,000,000 bytes (0.95 MB)
+ linkedpagetext = f.read(1000000)
+ socket.setdefaulttimeout(None)
+
+ except UnicodeError:
+ # example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
+ # in [[fr:Cyanure]]
+ pywikibot.output(
+ u'\03{lightred}Bad link\03{default} : %s in %s'
+ % (ref.url, page.title(asLink=True)))
+ continue
+ except urllib2.HTTPError, e:
+ pywikibot.output(u'HTTP error (%s) for %s on %s'
+ % (e.code, ref.url,
+ page.title(asLink=True)),
+ toStdout=True)
+ # 410 Gone, indicates that the resource has been purposely
+ # removed
+ if e.code == 410 or \
+ (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
+ repl = ref.refDead()
+ new_text = new_text.replace(match.group(), repl)
+ continue
+ except (urllib2.URLError,
+ socket.error,
+ IOError,
+ httplib.error), e:
+ pywikibot.output(u'Can\'t retrieve page %s : %s'
+ % (ref.url, e))
+ continue
+ except ValueError:
+ # Known bug of httplib, google for :
+ # "httplib raises ValueError reading chunked content"
+ continue
+ finally:
+ if f:
+ f.close()
+
+ #remove <script>/<style>/comments/CDATA tags
+ linkedpagetext = self.NON_HTML.sub('', linkedpagetext)
+
+ meta_content = self.META_CONTENT.search(linkedpagetext)
+ enc = []
+ s = None
+ if contentType:
+ # use charset from http header
+ s = self.CHARSET.search(contentType)
+ if meta_content:
+ tag = meta_content.group()
+ # Prefer the contentType from the HTTP header :
+ if not contentType:
+ contentType = tag
+ if not s:
+ # use charset from html
+ s = self.CHARSET.search(tag)
+ if s:
+ tmp = s.group('enc').strip("\"' ").lower()
+ naked = re.sub('[ _\-]', '', tmp)
+ # Convert to python correct encoding names
+ if naked == "gb2312":
+ enc.append("gbk")
+ elif naked == "shiftjis":
+ enc.append("shift jis 2004")
+ enc.append("cp932")
+ elif naked == "xeucjp":
+ enc.append("euc-jp")
+ else:
+ enc.append(tmp)
+ else:
+ pywikibot.output(u'No charset found for %s' % ref.link)
+## continue # do not process pages without charset
+ if not contentType:
+ pywikibot.output(u'No content-type found for %s' % ref.link)
+ continue
+ elif not self.MIME.search(contentType):
+ pywikibot.output(
+ u'\03{lightyellow}WARNING\03{default} : media : %s '
+ % ref.link)
+ repl = ref.refLink()
+ new_text = new_text.replace(match.group(), repl)
+ continue
+
+ # Ugly hacks to try to survive when both server and page
+ # return no encoding.
+ # Uses most used encodings for each national suffix
+ if u'.ru' in ref.link or u'.su' in ref.link:
+ # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
+ # encoding, no page encoding
+ enc = enc + ['koi8-r', 'windows-1251']
+ elif u'.jp' in ref.link:
+ enc.append("shift jis 2004")
+ enc.append("cp932")
+ elif u'.kr' in ref.link:
+ enc.append("euc-kr")
+ enc.append("cp949")
+ elif u'.zh' in ref.link:
+ enc.append("gbk")
+
+ u = linkedpagetext
+
+ # Retrieves the first non empty string inside <title> tags
+ for m in self.TITLE.finditer(u):
+ t = m.group()
+ if t:
+ ref.title = t
+ ref.transform()
+ if ref.title:
+ break
+
+ if not ref.title:
+ repl = ref.refLink()
+ new_text = new_text.replace(match.group(), repl)
+ pywikibot.output(u'%s : No title found...' % ref.link)
+ continue
+
+ # XXX Ugly hack
+ if u'Ã©' in ref.title:
+ repl = ref.refLink()
+ new_text = new_text.replace(match.group(), repl)
+ pywikibot.output(u'%s : Hybrid encoding...' % ref.link)
+ continue
+
+ if self.titleBlackList.match(ref.title):
+ repl = ref.refLink()
+ new_text = new_text.replace(match.group(), repl)
+ pywikibot.output(u'\03{lightred}WARNING\03{default} %s : '
+ u'Blacklisted title (%s)'
+ % (ref.link, ref.title))
+ continue
+
+ # Truncate long titles. 175 is arbitrary
+ if len(ref.title) > 175:
+ ref.title = ref.title[:175] + "..."
+
+ repl = ref.refTitle()
+ new_text = new_text.replace(match.group(), repl)
+
+ # Add <references/> when needed, but ignore templates !
+ if page.namespace != 10:
+ if self.norefbot.lacksReferences(new_text, verbose=False):
+ new_text = self.norefbot.addReferences(new_text)
+
+ new_text = self.deduplicator.process(new_text)
+
+ if new_text == page.get():
+ pywikibot.output('No changes were necessary in %s'
+ % page.title(asLink=True))
+ continue
+
+ editedpages += 1
+ self.put_page(page, new_text)
+
+ if self.limit and editedpages >= self.limit:
+ pywikibot.output('Edited %s pages, stopping.' % self.limit)
+ return
+
+ if editedpages % 20 == 0:
+ pywikibot.output(
+ '\03{lightgreen}Checking stop page...\03{default}')
+ actualRev = self.stopPage.latestRevision()
+ if actualRev != self.stopPageRevId:
+ pywikibot.output(
+ u'[[%s]] has been edited : Someone wants us to stop.'
+ % self.stopPage)
+ return
+
+
+def main():
+ genFactory = pagegenerators.GeneratorFactory()
+
+ PageTitles = []
+ xmlFilename = None
+ always = False
+ ignorepdf = False
+ limit = None
+ namespaces = []
+ generator = None
+ for arg in pywikibot.handleArgs():
+ if arg.startswith('-namespace:'):
+ try:
+ namespaces.append(int(arg[11:]))
+ except ValueError:
+ namespaces.append(arg[11:])
+ elif arg.startswith('-summary:'):
+ pywikibot.setAction(arg[9:])
+ elif arg == '-always':
+ always = True
+ elif arg == '-ignorepdf':
+ ignorepdf = True
+ elif arg.startswith('-limit:'):
+ limit = int(arg[7:])
+ elif arg.startswith('-xmlstart'):
+ if len(arg) == 9:
+ xmlStart = pywikibot.input(
+ u'Please enter the dumped article to start with:')
+ else:
+ xmlStart = arg[10:]
+ elif arg.startswith('-xml'):
+ if len(arg) == 4:
+ xmlFilename = pywikibot.input(
+ u'Please enter the XML dump\'s filename:')
+ else:
+ xmlFilename = arg[5:]
+ else:
+ genFactory.handleArg(arg)
+
+ if xmlFilename:
+ try:
+ xmlStart
+ except NameError:
+ xmlStart = None
+ generator = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
+ if not generator:
+ generator = genFactory.getCombinedGenerator()
+ if not generator:
+ # syntax error, show help text from the top of this file
+ pywikibot.showHelp('reflinks')
+ return
+ generator = pagegenerators.PreloadingGenerator(generator, pageNumber=50)
+ generator = pagegenerators.RedirectFilterPageGenerator(generator)
+ bot = ReferencesRobot(generator, always, limit, ignorepdf)
+ bot.run()
+
+if __name__ == "__main__":
+ try:
+ main()
+ finally:
+ pywikibot.stopme()
--
To view, visit https://gerrit.wikimedia.org/r/96976
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I7355849f997c8615fac2e77df6d9cbfc9c5d1e19
Gerrit-PatchSet: 11
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mayankmadan <[email protected]>
Gerrit-Reviewer: Ladsgroup <[email protected]>
Gerrit-Reviewer: Legoktm <[email protected]>
Gerrit-Reviewer: Mayankmadan <[email protected]>
Gerrit-Reviewer: Merlijn van Deen <[email protected]>
Gerrit-Reviewer: Strainu <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits