[MediaWiki-commits] [Gerrit] Ported reflinks.py to core - change (pywikibot/core)

Strainu (Code Review) Mon, 25 Nov 2013 05:51:48 -0800

Strainu has submitted this change and it was merged.

Change subject: Ported reflinks.py to core
......................................................................



Ported reflinks.py to core

Bug: 56900
Change-Id: I7355849f997c8615fac2e77df6d9cbfc9c5d1e19
---
M pywikibot/__init__.py
M pywikibot/page.py
A scripts/reflinks.py
3 files changed, 868 insertions(+), 1 deletion(-)

Approvals:
  Strainu: Verified; Looks good to me, approved
  jenkins-bot: Checked



diff --git a/pywikibot/__init__.py b/pywikibot/__init__.py
index 8f53758..13b3264 100644
--- a/pywikibot/__init__.py
+++ b/pywikibot/__init__.py
@@ -311,7 +311,7 @@
 
 
 from page import Page, ImagePage, Category, Link, User, ItemPage, 
PropertyPage, Claim
-from page import html2unicode, url2unicode
+from page import html2unicode, url2unicode, unicode2html
 
 
 link_regex = re.compile(r'\[\[(?P<title>[^\]|[<>{}]*)(\|.*?)?\]\]')
diff --git a/pywikibot/page.py b/pywikibot/page.py
index 9307f86..a533304 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -3486,6 +3486,22 @@
     return result
 
 
+def unicode2html(x, encoding):
+    """
+    Return x unchanged when it is encodable, else an HTML-entity version.
+
+    @param x: the unicode string to check
+    @param encoding: name of the codec x has to be encodable with
+    @return: x itself when x.encode(encoding) succeeds; otherwise the
+        result of UnicodeToAsciiHtml(x), i.e. html &#; entities
+    """
+    try:
+        x.encode(encoding)
+    except UnicodeError:
+        return UnicodeToAsciiHtml(x)
+    return x
+
+
 def url2unicode(title, site, site2=None):
     """Convert url-encoded text to unicode using site's encoding.
 
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
new file mode 100644
index 0000000..6a024cd
--- /dev/null
+++ b/scripts/reflinks.py
@@ -0,0 +1,851 @@
+# -*- coding: utf-8 -*-
+"""
+This bot will search for references which are only made of a link without 
title,
+(i.e. <ref>[http://www.google.fr/]</ref> or <ref>http://www.google.fr/</ref>)
+and will fetch the html title from the link to use it as the title of the wiki
+link in the reference, i.e.
+<ref>[http://www.google.fr/search?q=test test - Google Search]</ref>
+
+The bot checks every 20 edits a special stop page : if the page has been 
edited,
+it stops.
+
+DumZiBoT is running that script on en: & fr: at every new dump, running it on
+de: is not allowed anymore.
+
+As it uses it, you need to configure noreferences.py for your wiki, or it will
+not work.
+
+pdfinfo is needed for parsing pdf titles.
+
+See [[:en:User:DumZiBoT/refLinks]] for more information on the bot.
+
+&params;
+
+-limit:n          Stops after n edits
+
+-xml:dump.xml     Should be used instead of a simple page fetching method from
+                  pagegenerators.py for performance and load issues
+
+-xmlstart         Page to start with when using an XML dump
+
+-ignorepdf        Do not handle PDF files (handy if you use Windows and can't
+                  get pdfinfo)
+"""
+# (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ )
+# (C) Pywikipedia bot team, 2008-2013
+#
+# Distributed under the terms of the GPL
+#
+__version__ = '$Id$'
+#
+
+import sys
+import re
+import urllib2
+import httplib
+import socket
+import codecs
+import ftplib
+import subprocess
+import tempfile
+import os
+import gzip
+import StringIO
+
+import pywikibot
+from pywikibot import pagegenerators
+import noreferences
+from pywikibot import i18n
+
+# Maps the '&params;' placeholder (it appears in the module docstring above)
+# to the generic page generator help text.
+# NOTE(review): presumably consumed by pywikibot's help output - confirm.
+docuReplacements = {
+    '&params;': pagegenerators.parameterHelp
+}
+
+# Language codes for which a localized manual page exists on mediawiki;
+# ReferencesRobot.__init__ appends '/<code>' to the manual title for them.
+localized_msg = ('fr', )
+
+# localized message at specific wikipedia site
+# should be moved to mediawiki pywikibot manual
+L10N_msg = {
+    'it': u'Utente:Marco27Bot/refLinks.py',
+    'pl': u'Wikipedysta:MastiBot/refLinks',
+}
+
+
+# Per-language title of the "stop page". ReferencesRobot records its latest
+# revision at start-up and re-checks it every 20 edited pages; a changed
+# revision makes the bot stop.
+stopPage = {
+    'fr': u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
+    'da': u'Bruger:DumZiBoT/EditThisPageToStopMe',
+    'de': u'Benutzer:DumZiBoT/EditThisPageToStopMe',
+    'fa': u'کاربر:Amirobot/EditThisPageToStopMe',
+    'it': u'Utente:Marco27Bot/EditThisPageToStopMe',
+    'ko': u'사용자:GrassnBreadRefBot/EditThisPageToStopMe1',
+    'he': u'User:Matanyabot/EditThisPageToStopMe',
+    'hu': u'User:Damibot/EditThisPageToStopMe',
+    'en': u'User:DumZiBoT/EditThisPageToStopMe',
+    'pl': u'Wikipedysta:MastiBot/EditThisPageToStopMe',
+    'ru': u'User:Rubinbot/EditThisPageToStopMe',
+    'zh': u'User:Sz-iwbot',
+}
+
+# Per-language wikitext used by RefLink.refDead() to tag a dead link; the
+# entry is %-formatted with the link.
+# NOTE(review): the 'de' and 'he' entries contain no %s placeholder, so
+# formatting them with the link cannot succeed - confirm the intent.
+deadLinkTag = {
+    'fr': u'[%s] {{lien mort}}',
+    'da': u'[%s] {{dødt link}}',
+    'de': u'',
+    'fa': u'[%s] {{پیوند مرده}}',
+    'he': u'{{קישור שבור}}',
+    'hu': u'[%s] {{halott link}}',
+    'ko': u'[%s] {{죽은 바깥 고리}}',
+    'es': u'{{enlace roto2|%s}}',
+    'it': u'{{Collegamento interrotto|%s}}',
+    'en': u'[%s] {{dead link}}',
+    'pl': u'[%s] {{Martwy link}}',
+    'ru': u'[%s] {{subst:dead}}',
+}
+
+
+# Heuristic for "soft 404" URLs: an isolated 404 or a typical error-page word.
+# run() applies it to the URL a request was redirected to.
+soft404 = re.compile(
+    ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog',
+    re.IGNORECASE)
+# matches an URL at the index of a website
+dirIndex = re.compile(
+    ur'^\w+://[^/]+/((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$',
+    re.IGNORECASE)
+# Extracts the scheme and the domain name, dropping an optional "www." prefix.
+# The dot is escaped: an unescaped "www." also swallowed the first four
+# characters of hosts such as "wwwx.example.org".
+domain = re.compile(ur'^(\w+)://(?:www\.|)([^/]+)')
+
+# Titles that must never be used as a link title, for every language.
+# The pattern is compiled with re.X (see ReferencesRobot.__init__), so
+# whitespace outside a character class is ignored: a literal space has to be
+# written as "[ ]" and the "# ..." lines below are regex comments.
+globalbadtitles = """
+# is
+(test|
+# starts with
+    ^\W*(
+            register
+            |registration
+            |(sign|log)[ \-]?in
+            |subscribe
+            |sign[ \-]?up
+            |log[ \-]?on
+            |untitled[ ]?(document|page|\d+|$)
+            |404[ ]
+        ).*
+# anywhere
+    |.*(
+            403[ ]forbidden
+            |(404|page|file|information|resource).*not([ ]*be)?[ ]*(available|found)
+            |site.*disabled
+            |error[ ]404
+            |error.+not[ ]found
+            |not[ ]found.+error
+            |404[ ]error
+            |\D404\D
+            |check[ ]browser[ ]settings
+            |log[ \-]?(on|in)[ ]to
+            |site[ ]redirection
+     ).*
+# ends with
+    |.*(
+            register
+            |registration
+            |(sign|log)[ \-]?in
+            |subscribe|sign[ \-]?up
+            |log[ \-]?on
+        )\W*$
+)
+"""
+# Language-specific bad titles
+# These are OR-ed with globalbadtitles and compiled with re.X as well, so
+# spaces are spelled "[ ]" here too; a bare space would be dropped from the
+# pattern and the entry could never match a title containing one.
+badtitles = {
+    'en': '',
+    'fr': '.*(404|page|site).*en[ ]+travaux.*',
+    'es': '.*sitio.*no[ ]+disponible.*',
+    'it': '((pagina|sito)[ ](non[ ]trovata|inesistente)|accedi)',
+    'ru': u'.*(Страница|страница).*(не[ ]*найдена|отсутствует).*',
+}
+
+# Regex that matches bare references, i.e. a <ref> whose only content is an
+# optionally bracketed http/https/ftp URL. Group "name" holds the attributes
+# of the <ref> tag, group "url" the link itself.
+linksInRef = re.compile(
+    # bracketed URLs
+    ur'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https|ftp)://(?:' +
+    # unbracketed with()
+    # (the character class needs its opening "[": without it the "^" is an
+    # anchor in mid-pattern and this alternative can never match)
+    ur'[^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|' +
+    # unbracketed without ()
+    ur'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))[!?,\s]*\]?\s*</ref>')
+
+# Download this file :
+# http://www.twoevils.org/files/wikipedia/404-links.txt.gz
+# ( maintained by User:Dispenser )
+listof404pages = '404-links.txt'
+
+
+class XmlDumpPageGenerator:
+    """Iterator over the pages of an XML dump that contain bare references.
+
+    Entries are read from the dump one by one; everything before the entry
+    titled xmlStart is skipped (when xmlStart is set), pages outside the
+    requested namespaces are dropped, and only pages whose text matches
+    linksInRef are returned.
+    """
+
+    def __init__(self, xmlFilename, xmlStart, namespaces):
+        self.namespaces = namespaces
+        self.xmlStart = xmlStart
+        # skip entries until the one titled xmlStart has been reached
+        self.skipping = bool(xmlStart)
+        self.site = pywikibot.getSite()
+
+        import xmlreader
+        self.parser = xmlreader.XmlDump(xmlFilename).parse()
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        """Return the next page with a bare reference.
+
+        StopIteration raised by the dump parser is simply propagated.
+        """
+        while True:
+            entry = self.parser.next()
+            if self.skipping:
+                if entry.title != self.xmlStart:
+                    continue
+                self.skipping = False
+            page = pywikibot.Page(self.site, entry.title)
+            if self.namespaces != [] and \
+               page.namespace() not in self.namespaces:
+                continue
+            if linksInRef.search(entry.text):
+                return page
+
+
+class RefLink:
+    """Container to handle a single bare reference.
+
+    Attributes set by the constructor:
+     - refname : attribute text of the <ref> tag, re-emitted verbatim
+     - link : the link as found in the reference
+     - url : the link without its #fragment
+     - title : title to use for the link, None until one has been found
+     - linkComment : translated 'reflinks-comment' text, appended to the
+                     title as an HTML comment
+    """
+
+    def __init__(self, link, name):
+        self.refname = name
+        self.link = link
+        self.site = pywikibot.getSite()
+        self.linkComment = i18n.twtranslate(self.site, 'reflinks-comment')
+        self.url = re.sub(u'#.*', '', self.link)
+        self.title = None
+
+    def refTitle(self):
+        """Returns the <ref> with its new title"""
+        return '<ref%s>[%s %s<!-- %s -->]</ref>' % (self.refname, self.link,
+                                                    self.title,
+                                                    self.linkComment)
+
+    def refLink(self):
+        """No title has been found, return the unbracketed link"""
+        return '<ref%s>%s</ref>' % (self.refname, self.link)
+
+    def refDead(self):
+        """Dead link, tag it with a {{dead link}}
+
+        Falls back to the plain unbracketed link when no tag is available
+        for the site (missing or empty deadLinkTag entry). A tag without a
+        %s placeholder is appended after the link instead of being
+        %-formatted, which would raise a TypeError.
+        """
+        tag = pywikibot.translate(self.site, deadLinkTag)
+        if not tag:
+            return self.refLink()
+        if '%s' in tag:
+            tag = tag % self.link
+        else:
+            # never drop the link itself: keep it in front of the bare tag
+            tag = u'%s %s' % (self.link, tag)
+        return '<ref%s>%s</ref>' % (self.refname, tag)
+
+    def transform(self, ispdf=False):
+        """Normalize the title
+
+        - ispdf : boolean, the title comes from a PDF, so HTML entities are
+                  not decoded
+        """
+        #convert html entities
+        if not ispdf:
+            self.title = pywikibot.html2unicode(self.title)
+        self.title = re.sub(r'-+', '-', self.title)
+        #remove formatting, i.e long useless strings
+        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
+        #remove \n and \r and Unicode spaces from titles
+        self.title = re.sub(r'(?u)\s', ' ', self.title)
+        self.title = re.sub(r'[\n\r\t]', ' ', self.title)
+        #remove extra whitespaces
+        #remove leading and trailing ./;/,/-/_/+/ /
+        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))
+
+        self.avoid_uppercase()
+        #avoid closing the link before the end
+        self.title = self.title.replace(']', '&#93;')
+        #avoid multiple } being interpreted as a template inclusion
+        self.title = self.title.replace('}}', '}&#125;')
+        #prevent multiple quotes being interpreted as '' or '''
+        self.title = self.title.replace('\'\'', '\'&#39;')
+        self.title = pywikibot.unicode2html(self.title, self.site.encoding())
+        # TODO : remove HTML when both opening and closing tags are included
+
+    def avoid_uppercase(self):
+        """ If title has more than 6 characters, contains no digit and more
+        than 70% of uppercase characters, title() it
+
+        """
+        title = self.title
+        if len(title) <= 6:
+            return
+        # titles containing a digit are left alone
+        if any(letter.isdigit() for letter in title):
+            return
+        nb_upper = sum(1 for letter in title if letter.isupper())
+        nb_letter = sum(1 for letter in title if letter.isalpha())
+        # the "+ 1" keeps the division defined for titles without letters
+        if float(nb_upper) / (nb_letter + 1) > .70:
+            self.title = title.title()
+
+
+class DuplicateReferences:
+    """ When some references are duplicated in an article,
+    name the first, and remove the content of the others
+
+    References are compared per reference group: two references are
+    duplicates when they have the same group and the same content.
+
+    """
+    # Match references
+    REFS = re.compile(
+        u'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
+    # Extract the name / the group from the attributes of a <ref> tag.
+    # The value is matched non-greedily and has to be followed by whitespace
+    # or the end of the attributes, so that a following attribute does not
+    # end up inside the value.
+    NAMES = re.compile(
+        u'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+?)\s*(?P=quote)'
+        u'(?=\s|$)')
+    GROUPS = re.compile(
+        u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+?)\s*(?P=quote)'
+        u'(?=\s|$)')
+
+    def __init__(self):
+        # prefix of the names given to references that have none
+        self.autogen = i18n.twtranslate(pywikibot.getSite(),
+                                        'reflinks-autogen')
+
+    def process(self, text):
+        """Return text with its duplicated references merged
+
+        The first reference of a set of duplicates keeps its content and
+        gets a name if it has none; the others are replaced by an empty
+        <ref name=... /> tag pointing to it.
+
+        """
+        # keys are ref groups (the group name, None for ungrouped refs)
+        # values are a dict where :
+        #   keys are ref content
+        #   values are [name, [list of full ref matches],
+        #               quoted, need_to_change]
+        foundRefs = {}
+        foundRefNames = {}
+        # Replace key by [value, quoted]
+        namedRepl = {}
+
+        for match in self.REFS.finditer(text):
+            content = match.group('content')
+            if not content.strip():
+                continue
+
+            params = match.group('params')
+            # Key on the group *name*: a match object is never equal to
+            # another one, which would put every grouped ref in a group
+            # of its own.
+            groupmatch = self.GROUPS.match(params)
+            if groupmatch:
+                group = groupmatch.group('group')
+            else:
+                group = None
+            groupdict = foundRefs.setdefault(group, {})
+
+            if content in groupdict:
+                v = groupdict[content]
+                v[1].append(match.group())
+            else:
+                v = [None, [match.group()], False, False]
+            name = self.NAMES.match(params)
+            if name:
+                quoted = name.group('quote') == '"'
+                name = name.group('name')
+                if v[0]:
+                    if v[0] != name:
+                        namedRepl[name] = [v[0], v[2]]
+                else:
+                    # First name associated with this content
+                    if name not in foundRefNames:
+                        # first time ever we meet this name
+                        v[2] = quoted
+                        v[0] = name
+                    else:
+                        # this name is already used with another content.
+                        # We'll need to change it
+                        v[3] = True
+                foundRefNames[name] = 1
+            groupdict[content] = v
+
+        refid = 0
+        for (g, d) in foundRefs.items():
+            if g:
+                group = u"group=\"%s\" " % g
+            else:
+                group = u""
+
+            for (k, v) in d.items():
+                if len(v[1]) == 1 and not v[3]:
+                    continue
+                name = v[0]
+                if not name:
+                    # pick the next generated name that is still unused
+                    refid += 1
+                    while self.autogen + str(refid) in foundRefNames:
+                        refid += 1
+                    name = self.autogen + str(refid)
+                elif v[2]:
+                    name = u'"%s"' % name
+                named = u'<ref %sname=%s>%s</ref>' % (group, name, k)
+                text = text.replace(v[1][0], named, 1)
+
+                # make sure that the first (named ref) is not
+                # removed later :
+                pos = text.index(named) + len(named)
+                header = text[:pos]
+                end = text[pos:]
+
+                unnamed = u'<ref %sname=%s />' % (group, name)
+                for ref in v[1][1:]:
+                    end = end.replace(ref, unnamed)
+                text = header + end
+
+        for (k, v) in namedRepl.items():
+            # TODO : Support ref groups
+            name = v[0]
+            if v[1]:
+                name = u'"%s"' % name
+            # the old name is literal text, not a pattern
+            text = re.sub(
+                u'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>'
+                % re.escape(k),
+                u'<ref name=%s />' % name, text)
+        return text
+
+
+class ReferencesRobot:
+
+    def __init__(self, generator, acceptall=False, limit=None, 
ignorepdf=False):
+        """
+        Set up the bot: edit summary, stop page, title blacklist and the
+        regular expressions used to analyze the fetched pages.
+
+        - generator : Page generator
+        - acceptall : boolean, is -always on ?
+        - limit : int, stop after n modified pages
+        - ignorepdf : boolean
+
+        Raises pywikibot.NoPage when the stop page does not exist.
+
+        """
+        self.generator = generator
+        self.acceptall = acceptall
+        self.limit = limit
+        self.ignorepdf = ignorepdf
+        self.site = pywikibot.getSite()
+        # Check
+        # Title of the manual page. Do not rename this local: locals() is
+        # handed to twtranslate below.
+        # NOTE(review): presumably the 'reflinks-msg' text refers to it as
+        # %(manual)s - confirm against the i18n message.
+        manual = 'mw:Manual:Pywikibot/refLinks'
+        if self.site.family.name == 'wikipedia':
+            # NOTE(review): a plain string is passed where the other
+            # translate() calls of this file pass a dict (L10N_msg is not
+            # used anywhere in the visible code) - confirm the intent.
+            manual = pywikibot.translate(self.site.code, manual)
+        else:
+            # use the localized manual of the site language or of one of
+            # its fallback languages, if there is any
+            code = None
+            for alt in [self.site.code] + i18n._altlang(self.site.code):
+                if alt in localized_msg:
+                    code = alt
+                    break
+            if code:
+                manual += '/%s' % code
+        # edit summary used by put_page()
+        self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
+        self.stopPage = pywikibot.Page(self.site,
+                                       pywikibot.translate(self.site, 
stopPage))
+
+        # Blacklist of titles: the global one, OR-ed with the one of the
+        # site language if there is any. Compiled in verbose mode (re.X).
+        local = pywikibot.translate(self.site, badtitles)
+        if local:
+            bad = '(' + globalbadtitles + '|' + local + ')'
+        else:
+            bad = globalbadtitles
+        self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
+        # helper bots: run() uses the first to add a missing <references/>
+        # and the second to merge duplicated references
+        self.norefbot = noreferences.NoReferencesBot(None)
+        self.deduplicator = DuplicateReferences()
+        # Remember the current revision of the stop page; run() compares
+        # it with the latest one to find out whether the page was edited.
+        try:
+            self.stopPageRevId = self.stopPage.latestRevision()
+        except pywikibot.NoPage:
+            pywikibot.output(u'The stop page %s does not exist'
+                             % self.stopPage.title(asLink=True))
+            raise
+
+        # Regex to grasp content-type meta HTML tag in HTML source
+        self.META_CONTENT = re.compile(ur'(?i)<meta[^>]*content\-type[^>]*>')
+        # Extract the encoding from a charset property (from content-type !)
+        self.CHARSET = re.compile(ur'(?i)charset\s*=\s*(?P<enc>[^\'";>/]*)')
+        # Extract html title from page
+        self.TITLE = re.compile(ur'(?is)(?<=<title>).*?(?=</title>)')
+        # Matches content inside <script>/<style>/HTML comments
+        self.NON_HTML = re.compile(
+            
ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
+
+        # Authorized mime types for HTML pages
+        self.MIME = re.compile(
+            ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
+
+    def put_page(self, page, new):
+        """ Prints diffs between original and new (text), puts new text for
+        page
+
+        - page : the page to save
+        - new : the new text of the page
+
+        Unless -always is on, the user is asked for confirmation; answering
+        'All' turns -always on for the rest of the run. Save errors are
+        reported and the page is skipped.
+
+        """
+        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+                         % page.title())
+        pywikibot.showDiff(page.get(), new)
+        if not self.acceptall:
+            choice = pywikibot.inputChoice(u'Do you want to accept ' +
+                                           u'these changes?',
+                                           ['Yes', 'No', 'All'],
+                                           ['y', 'N', 'a'], 'N')
+            if choice == 'a':
+                self.acceptall = True
+            if choice == 'y':
+                page.put_async(new, self.msg)
+        if self.acceptall:
+            try:
+                page.put(new, self.msg)
+            except pywikibot.EditConflict:
+                pywikibot.output(u'Skipping %s because of edit conflict'
+                                  % (page.title(),))
+            except pywikibot.SpamfilterError, e:
+                pywikibot.output(
+                    u'Cannot change %s because of blacklist entry %s'
+                    % (page.title(), e.url))
+            # LockedPage has to be handled before PageNotSaved: if it is a
+            # subclass of PageNotSaved, the more general handler would
+            # catch it first and this one would never run.
+            except pywikibot.LockedPage:
+                pywikibot.output(u'Skipping %s (locked page)'
+                                  % (page.title(),))
+            except pywikibot.PageNotSaved, error:
+                pywikibot.error(u'putting page: %s' % (error.args,))
+            except pywikibot.ServerError, e:
+                pywikibot.output(u'Server Error : %s' % e)
+
+    def httpError(self, err_num, link, pagetitleaslink):
+        """Report the HTTP error err_num met for link on the given page"""
+        message = u'HTTP error (%s) for %s on %s' % (err_num, link,
+                                                     pagetitleaslink)
+        pywikibot.output(message, toStdout=True)
+
+    def getPDFTitle(self, ref, f):
+        """ Use pdfinfo to retrieve title from a PDF.
+        Unix-only, I'm afraid.
+
+        - ref : RefLink, its title is set when pdfinfo reports one
+        - f : file-like object, the PDF content is read from it
+
+        The content is stored in a temporary file which is given to
+        pdfinfo as its standard input. Errors of pdfinfo are reported and
+        otherwise ignored (ref.title is then left as it was); errors while
+        reading f are propagated to the caller.
+
+        """
+        pywikibot.output(u'PDF file.')
+        # fetch the data first: a download error must neither be swallowed
+        # below nor leave a temporary file behind
+        data = f.read()
+        fd, infile = tempfile.mkstemp()
+        urlobj = os.fdopen(fd, 'w+b')
+        try:
+            urlobj.write(data)
+            # pdfinfo reads through the file descriptor: push the buffered
+            # data to the file and rewind, or it would see no or truncated
+            # content
+            urlobj.flush()
+            urlobj.seek(0)
+            pdfinfo_out = subprocess.Popen([r"pdfinfo", "/dev/stdin"],
+                                           stdin=urlobj,
+                                           stdout=subprocess.PIPE,
+                                           stderr=subprocess.PIPE,
+                                           shell=False).communicate()[0]
+            for aline in pdfinfo_out.splitlines():
+                if aline.lower().startswith('title'):
+                    # drop the leading "Title:" field name
+                    ref.title = aline.split(None)[1:]
+                    ref.title = ' '.join(ref.title)
+                    if ref.title != '':
+                        pywikibot.output(u'title: %s' % ref.title)
+            pywikibot.output(u'PDF done.')
+        except ValueError:
+            pywikibot.output(u'pdfinfo value error.')
+        except OSError:
+            pywikibot.output(u'pdfinfo OS error.')
+        except Exception:  # Ignore errors, but let Ctrl-C through
+            pywikibot.output(u'PDF processing error.')
+        finally:
+            urlobj.close()
+            os.unlink(infile)
+
+    def run(self):
+        """ Runs the Bot
+
+        For every page of the generator, each bare reference is looked up:
+        the linked document is fetched and its title is used as the title
+        of the link. Dead links are tagged, media and untitled documents
+        are left as plain links. A missing <references/> is then added,
+        duplicated references are merged and the page is saved.
+
+        The bot stops when the generator is exhausted, after self.limit
+        edited pages, or when the stop page has been edited (checked every
+        20 edited pages).
+
+        Raises IOError when the list of known 404 pages cannot be read.
+
+        """
+        try:
+            deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
+        except IOError:
+            pywikibot.output(
+                'You need to download '
+                'http://www.twoevils.org/files/wikipedia/404-links.txt.gz '
+                'and to ungzip it in the same directory')
+            raise
+        socket.setdefaulttimeout(30)
+        editedpages = 0
+        for page in self.generator:
+            try:
+                # Load the page's text from the wiki
+                new_text = page.get()
+                if not page.canBeEdited():
+                    pywikibot.output(u"You can't edit page %s"
+                                      % page.title(asLink=True))
+                    continue
+            except pywikibot.NoPage:
+                pywikibot.output(u'Page %s not found'
+                                 % page.title(asLink=True))
+                continue
+            except pywikibot.IsRedirectPage:
+                pywikibot.output(u'Page %s is a redirect'
+                                 % page.title(asLink=True))
+                continue
+
+            # for each link to change
+            for match in linksInRef.finditer(
+                    pywikibot.removeDisabledParts(page.get())):
+
+                link = match.group(u'url')
+                #debugging purpose
+                #print link
+                if u'jstor.org' in link:
+                    #TODO: Clean URL blacklist
+                    continue
+
+                ref = RefLink(link, match.group('name'))
+                f = None
+                try:
+                    socket.setdefaulttimeout(20)
+                    try:
+                        f = urllib2.urlopen(ref.url.decode("utf8"))
+                    except UnicodeError:
+                        ref.url = urllib2.quote(ref.url.encode("utf8"), "://")
+                        f = urllib2.urlopen(ref.url)
+                    #Try to get Content-Type from server
+                    headers = f.info()
+                    contentType = headers.getheader('Content-Type')
+                    if contentType and not self.MIME.search(contentType):
+                        if ref.link.lower().endswith('.pdf') and \
+                           not self.ignorepdf:
+                            # If file has a PDF suffix
+                            self.getPDFTitle(ref, f)
+                        else:
+                            pywikibot.output(
+                                u'\03{lightyellow}WARNING\03{default} : '
+                                u'media : %s ' % ref.link)
+                        if ref.title:
+                            if not re.match(
+                                    u'(?i) *microsoft (word|excel|visio)',
+                                    ref.title):
+                                ref.transform(ispdf=True)
+                                repl = ref.refTitle()
+                            else:
+                                pywikibot.output(
+                                    u'\03{lightyellow}WARNING\03{default} : '
+                                    u'PDF title blacklisted : %s ' % ref.title)
+                                repl = ref.refLink()
+                        else:
+                            repl = ref.refLink()
+                        new_text = new_text.replace(match.group(), repl)
+                        continue
+                    # Get the real url where we end (http redirects !)
+                    redir = f.geturl()
+                    if redir != ref.link and \
+                       domain.findall(redir) == domain.findall(link):
+                        if soft404.search(redir) and \
+                           not soft404.search(ref.link):
+                            pywikibot.output(
+                                u'\03{lightyellow}WARNING\03{default} : '
+                                u'Redirect 404 : %s ' % ref.link)
+                            continue
+                        if dirIndex.match(redir) and \
+                           not dirIndex.match(ref.link):
+                            pywikibot.output(
+                                u'\03{lightyellow}WARNING\03{default} : '
+                                u'Redirect to root : %s ' % ref.link)
+                            continue
+
+                    # uncompress if necessary
+                    if headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
+                        # XXX: small issue here: the whole page is downloaded
+                        # through f.read(). It might fetch big files/pages.
+                        # However, truncating an encoded gzipped stream is not
+                        # an option, for unzipping will fail.
+                        compressed = StringIO.StringIO(f.read())
+                        f = gzip.GzipFile(fileobj=compressed)
+
+                    # Read the first 1,000,000 bytes (0.95 MB)
+                    linkedpagetext = f.read(1000000)
+                    socket.setdefaulttimeout(None)
+
+                except UnicodeError:
+                    # example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
+                    # in [[fr:Cyanure]]
+                    pywikibot.output(
+                        u'\03{lightred}Bad link\03{default} : %s in %s'
+                        % (ref.url, page.title(asLink=True)))
+                    continue
+                except urllib2.HTTPError, e:
+                    self.httpError(e.code, ref.url, page.title(asLink=True))
+                    # 410 Gone, indicates that the resource has been purposely
+                    # removed
+                    if e.code == 410 or \
+                       (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
+                        repl = ref.refDead()
+                        new_text = new_text.replace(match.group(), repl)
+                    continue
+                except (urllib2.URLError,
+                        socket.error,
+                        IOError,
+                        httplib.error), e:
+                    pywikibot.output(u'Can\'t retrieve page %s : %s'
+                                     % (ref.url, e))
+                    continue
+                except ValueError:
+                    # Known bug of httplib, google for :
+                    # "httplib raises ValueError reading chunked content"
+                    continue
+                finally:
+                    if f:
+                        f.close()
+
+                #remove <script>/<style>/comments/CDATA tags
+                linkedpagetext = self.NON_HTML.sub('', linkedpagetext)
+
+                meta_content = self.META_CONTENT.search(linkedpagetext)
+                # NOTE(review): the encodings collected in enc are not used
+                # anywhere below in this method - confirm whether decoding
+                # of linkedpagetext is still to be done.
+                enc = []
+                s = None
+                if contentType:
+                    # use charset from http header
+                    s = self.CHARSET.search(contentType)
+                if meta_content:
+                    tag = meta_content.group()
+                    # Prefer the contentType from the HTTP header :
+                    if not contentType:
+                        contentType = tag
+                    if not s:
+                        # use charset from html
+                        s = self.CHARSET.search(tag)
+                if s:
+                    tmp = s.group('enc').strip("\"' ").lower()
+                    naked = re.sub('[ _\-]', '', tmp)
+                    # Convert to python correct encoding names
+                    if naked == "gb2312":
+                        enc.append("gbk")
+                    elif naked == "shiftjis":
+                        enc.append("shift jis 2004")
+                        enc.append("cp932")
+                    elif naked == "xeucjp":
+                        enc.append("euc-jp")
+                    else:
+                        enc.append(tmp)
+                else:
+                    pywikibot.output(u'No charset found for %s' % ref.link)
+##                    continue  # do not process pages without charset
+                if not contentType:
+                    pywikibot.output(u'No content-type found for %s'
+                                     % ref.link)
+                    continue
+                elif not self.MIME.search(contentType):
+                    pywikibot.output(
+                        u'\03{lightyellow}WARNING\03{default} : media : %s '
+                        % ref.link)
+                    repl = ref.refLink()
+                    new_text = new_text.replace(match.group(), repl)
+                    continue
+
+                # Ugly hacks to try to survive when both server and page
+                # return no encoding.
+                # Uses most used encodings for each national suffix
+                if u'.ru' in ref.link or u'.su' in ref.link:
+                    # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
+                    # encoding, no page encoding
+                    enc = enc + ['koi8-r', 'windows-1251']
+                elif u'.jp' in ref.link:
+                    enc.append("shift jis 2004")
+                    enc.append("cp932")
+                elif u'.kr' in ref.link:
+                    enc.append("euc-kr")
+                    enc.append("cp949")
+                elif u'.zh' in ref.link:
+                    enc.append("gbk")
+
+                u = linkedpagetext
+
+                # Retrieves the first non empty string inside <title> tags
+                for m in self.TITLE.finditer(u):
+                    t = m.group()
+                    if t:
+                        ref.title = t
+                        ref.transform()
+                        if ref.title:
+                            break
+
+                if not ref.title:
+                    repl = ref.refLink()
+                    new_text = new_text.replace(match.group(), repl)
+                    pywikibot.output(u'%s : No title found...' % ref.link)
+                    continue
+
+                # XXX Ugly hack
+                if u'Ã©' in ref.title:
+                    repl = ref.refLink()
+                    new_text = new_text.replace(match.group(), repl)
+                    pywikibot.output(u'%s : Hybrid encoding...' % ref.link)
+                    continue
+
+                if self.titleBlackList.match(ref.title):
+                    repl = ref.refLink()
+                    new_text = new_text.replace(match.group(), repl)
+                    pywikibot.output(u'\03{lightred}WARNING\03{default} %s : '
+                                     u'Blacklisted title (%s)'
+                                     % (ref.link, ref.title))
+                    continue
+
+                # Truncate long titles. 175 is arbitrary
+                if len(ref.title) > 175:
+                    ref.title = ref.title[:175] + "..."
+
+                repl = ref.refTitle()
+                new_text = new_text.replace(match.group(), repl)
+
+            # Add <references/> when needed, but ignore templates !
+            # namespace is a method (see XmlDumpPageGenerator.next): it has
+            # to be called, comparing the method itself with 10 is always
+            # true.
+            if page.namespace() != 10:
+                if self.norefbot.lacksReferences(new_text, verbose=False):
+                    new_text = self.norefbot.addReferences(new_text)
+
+            new_text = self.deduplicator.process(new_text)
+
+            if new_text == page.get():
+                pywikibot.output('No changes were necessary in %s'
+                                 % page.title(asLink=True))
+                continue
+
+            editedpages += 1
+            self.put_page(page, new_text)
+
+            if self.limit and editedpages >= self.limit:
+                pywikibot.output('Edited %s pages, stopping.' % self.limit)
+                return
+
+            if editedpages % 20 == 0:
+                pywikibot.output(
+                    '\03{lightgreen}Checking stop page...\03{default}')
+                actualRev = self.stopPage.latestRevision()
+                if actualRev != self.stopPageRevId:
+                    pywikibot.output(
+                        u'[[%s]] has been edited : Someone wants us to stop.'
+                        % self.stopPage)
+                    return
+
+
+def main():
+    """Parse the command line, build a page generator and run the bot.
+
+    Arguments not recognised here are passed to the GeneratorFactory.
+    """
+    genFactory = pagegenerators.GeneratorFactory()
+    xmlFilename = None
+    xmlStart = None  # stays None unless -xmlstart is given
+    always = False
+    ignorepdf = False
+    limit = None
+    namespaces = []
+    generator = None
+    for arg in pywikibot.handleArgs():
+        if arg.startswith('-namespace:'):
+            try:
+                namespaces.append(int(arg[11:]))
+            except ValueError:
+                namespaces.append(arg[11:])
+        elif arg.startswith('-summary:'):
+            pywikibot.setAction(arg[9:])
+        elif arg == '-always':
+            always = True
+        elif arg == '-ignorepdf':
+            ignorepdf = True
+        elif arg.startswith('-limit:'):
+            limit = int(arg[7:])
+        elif arg.startswith('-xmlstart'):
+            # Tested before '-xml', which is a prefix of '-xmlstart'.
+            if len(arg) == 9:
+                xmlStart = pywikibot.input(
+                    u'Please enter the dumped article to start with:')
+            else:
+                xmlStart = arg[10:]
+        elif arg.startswith('-xml'):
+            if len(arg) == 4:
+                xmlFilename = pywikibot.input(
+                    u'Please enter the XML dump\'s filename:')
+            else:
+                xmlFilename = arg[5:]
+        else:
+            genFactory.handleArg(arg)
+
+    if xmlFilename:
+        generator = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
+    if not generator:
+        generator = genFactory.getCombinedGenerator()
+    if not generator:
+        # syntax error, show help text from the top of this file
+        pywikibot.showHelp('reflinks')
+        return
+    generator = pagegenerators.PreloadingGenerator(generator, pageNumber=50)
+    generator = pagegenerators.RedirectFilterPageGenerator(generator)
+    bot = ReferencesRobot(generator, always, limit, ignorepdf)
+    bot.run()
+
+if __name__ == "__main__":  # only when executed as a script, not on import
+    try:
+        main()
+    finally:  # runs whether main() returns normally or raises
+        pywikibot.stopme()

-- 
To view, visit https://gerrit.wikimedia.org/r/96976
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I7355849f997c8615fac2e77df6d9cbfc9c5d1e19
Gerrit-PatchSet: 11
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mayankmadan <[email protected]>
Gerrit-Reviewer: Ladsgroup <[email protected]>
Gerrit-Reviewer: Legoktm <[email protected]>
Gerrit-Reviewer: Mayankmadan <[email protected]>
Gerrit-Reviewer: Merlijn van Deen <[email protected]>
Gerrit-Reviewer: Strainu <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Ported reflinks.py to core - change (pywikibot/core)

Reply via email to