Revision: 7920
Author:   xqt
Date:     2010-02-09 14:34:01 +0000 (Tue, 09 Feb 2010)

Log Message:
-----------
family/wikipedia: read redirect tags via API (dict removed)
cc: add fixArabicLetters
solve_disambiguation: update exception list
pywikibot: bugfixes

Modified Paths:
--------------
    trunk/pywikipedia/cosmetic_changes.py
    trunk/pywikipedia/family.py
    trunk/pywikipedia/pywikibot/__init__.py
    trunk/pywikipedia/pywikibot/textlib.py
    trunk/pywikipedia/solve_disambiguation.py
    trunk/pywikipedia/wikipedia.py

Modified: trunk/pywikipedia/cosmetic_changes.py
===================================================================
--- trunk/pywikipedia/cosmetic_changes.py       2010-02-08 15:37:02 UTC (rev 7919)
+++ trunk/pywikipedia/cosmetic_changes.py       2010-02-09 14:34:01 UTC (rev 7920)
@@ -275,6 +275,7 @@
         text = self.fixHtml(text)
         text = self.fixStyle(text)
         text = self.fixTypo(text)
+        text = self.fixArabicLetters(text)
         try:
             text = isbn.hyphenateIsbnNumbers(text)
         except isbn.InvalidIsbnException, error:
@@ -679,6 +680,40 @@
         text = pywikibot.replaceExcept(text, ur'º([CF])', ur'°\1', exceptions)
         return text
 
+    def fixArabicLetters(self, text):
+        if self.site.lang=='ckb':
+            exceptions = [
+                'gallery',
+                'hyperlink',
+                'interwiki',
+                'link',
+                'math',
+                'pre',
+                'template',
+                'timeline',
+                'ref',
+                'source',
+                'startspace',
+            ]
+            text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
+            text = pywikibot.replaceExcept(text, ur'ه([.، ])', ur'ە\1', exceptions)
+            text = pywikibot.replaceExcept(text, u'ه‌', u'ە', exceptions)
+            text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
+            text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
+            text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
+            # replace persian digits
+            for i in range(0,10):
+                text = pywikibot.replaceExcept(text, u'۰۱۲۳۴۵۶۷۸۹'[i], u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
+            # do not change digits in class, style and table params
+            pattern = re.compile(u'=".*?"', re.UNICODE)
+            exceptions.append(pattern)
+            # do not change digits inside html-tags
+            pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
+            exceptions.append(pattern)
+            for i in range(0,10):
+                text = pywikibot.replaceExcept(text, str(i), u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
+        return text
+
 class CosmeticChangesBot:
     def __init__(self, generator, acceptall = False, comment=u'Robot: Cosmetic changes'):
         self.generator = generator

Modified: trunk/pywikipedia/family.py
===================================================================
--- trunk/pywikipedia/family.py 2010-02-08 15:37:02 UTC (rev 7919)
+++ trunk/pywikipedia/family.py 2010-02-09 14:34:01 UTC (rev 7920)
@@ -3449,178 +3449,6 @@
     def category_namespaces(self, code):
         return self.namespace(code, 14, all = True)
 
-    # Localised magic words for language code 'xyz' can be found in
-    # the MediaWiki source code in the file
-    # /mediawiki/trunk/phase3/languages/messages/MessagesXyz.php
-    # in the 'magicwords' array
-    
-    # Localised redirect codes
-
-    # Note that redirect codes are case-insensitive, so it is enough
-    # to enter the code in lowercase here.
-
-    # When creating a redirect page, only the first item is looked for.
-    # When matching for redirects, default 'redirect' is always inserted
-    # => if default redirect keyword used for a language is not 'redirect',
-    #    it is not necessary to add 'redirect' at the end of the list
-    redirect = {
-        'ab': [u'перенаправление', u'перенапр', u'redirect'],
-        'ace': [u'alih'],
-        'af': [u'aanstuur'],
-        'aln': [u'ridrejto'],
-        'als': [u'weiterleitung'],
-        'an': [u'redirección'],
-        'ar': [u'تحويل'],
-        'arn': [u'redirección'],
-        'arz': [u'تحويل'],
-        'av': [u'перенаправление', u'перенапр'],
-        'ay': [u'redirección'],
-        'ba': [u'перенаправление', u'перенапр'],
-        'bar': [u'weiterleitung'],
-        'bat-smg': [u'peradresavimas'],
-        'bcc': [u'تغییرمسیر'],
-        'be-tarask': [u'перанакіраваньне'],
-        'be-x-old': [u'перанакіраваньне'],
-        'bg': [u'виж', u'пренасочване'],
-        'bm': [u'redirection'],
-        'bqi': [u'تغییرمسیر'],
-        'br': [u'adkas'],
-        'bug': [u'alih'],
-        'bs': [u'preusmjeri'],
-        'cbk-zam': [u'redirección'],
-        'ce': [u'перенаправление', u'перенапр'],
-        'cs': [u'přesměruj'],
-        'cu': [u'прѣнаправлєниѥ'],
-        'cv': [u'перенаправление', u'перенапр'],
-        'cy': [u'ail-cyfeirio', u'ailgyfeirio'],
-        'de': [u'weiterleitung'],
-        'de-at': [u'weiterleitung'],
-        'de-ch': [u'weiterleitung'],
-        'de-formal': [u'weiterleitung'],
-        'dsb': [u'weiterleitung'],
-        'el': [u'ανακατευθυνση'],
-        'eml': [u'rinvia', u'rinvio'],
-        'eo': [u'alidirektu'],
-        'es': [u'redirección'],
-        'et': [u'suuna'],
-        'eu': [u'birzuzendu'],
-        'fa': [u'تغییرمسیر'],
-        'ff': [u'redirection'],
-        'fi': [u'ohjaus', u'uudelleenohjaus'],
-        'fiu-vro': [u'saadaq'],
-        'fr': [u'redirection'],
-        'frp': [u'redirèccion', u'redirection'],
-        'fur': [u'rinvia', u'rinvio'],
-        'ga': [u'athsheoladh'],
-        'gag': [u'yönlendirme'],
-        'gl': [u'redirección'],
-        'glk': [u'تغییرمسیر'],
-        'gn': [u'redirección'],
-        'gsw': [u'weiterleitung'],
-        'he': [u'הפניה'],
-        'hr': [u'preusmjeri'],
-        'hsb': [u'weiterleitung'],
-        'ht': [u'redirection'],
-        'hu': [u'átirányítás'],
-        'hy': [u'վերահղում'],
-        'id': [u'alih'],
-        'inh': [u'перенаправление', u'перенапр'],
-        'is': [u'tilvísun'],
-        'it': [u'rinvia', u'rinvio'],
-        'ja': [u'転送', u'リダイレクト'],
-        'jv': [u'alih'],
-        'ka': [u'გადამისამართება'],
-        'kaa': [u'aýdaw', u'айдау'],
-        'kk': [u'айдау'],
-        'kk-arab': [u'ايداۋ'],
-        'kk-cyrl': [u'АЙДАУ'],
-        'kk-latn': [u'aýdaw', u'айдау'],
-        'km': [u'\u1794\u1789\u17d2\u1787\u17bc\u1793\u1794\u1793\u17d2\u178f',
-               u'\u1794\u17d2\u178f\u17bc\u179a\u1791\u17b8\u178f\u17b6\u17c6\u1784',
-               u'\u1794\u17d2\u178a\u17bc\u179a\u1785\u17c6\u178e\u1784\u1787\u17be\u1784',
-               u'ប្តូរទីតាំងទៅ'],
-        'ko': [u'넘겨주기'],
-        'ksh': [u'ömleide op', u'ömleidung'],
-        'kv': [u'перенаправление', u'перенапр'],
-        'lad': [u'redirección'],
-        'lb': [u'weiterleitung'],
-        'lbe': [u'перенаправление', u'перенапр'],
-        'li': [u'doorverwijzing'],
-        'lij': [u'rinvia', u'rinvio'],
-        'lld': [u'rinvia', u'rinvio'],
-        'lmo': [u'rinvia', u'rinvio'],
-        'ln': [u'redirection'],
-        'lt': [u'peradresavimas'],
-        'map-bms': [u'alih'],
-        'mg': [u'redirection'],
-        'mhr': [u'перенаправление', u'перенапр'],
-        'mk': [u'пренасочување', u'види'],
-        'ml': [u'തിരിച്ചുവിടുക', u'തിരിച്ചുവിടല്‍'],
-        'mo': [u'redirecteaza'],
-        'mr': [u'पुनर्निर्देशन'],
-        'mt': [u'rindirizza'],
-        'mwl': [u'ancaminar'],
-        'myv': [u'перенаправление', u'перенапр'],
-        'mzn': [u'تغییرمسیر'],
-        'nah': [u'redirección'],
-        'nap': [u'rinvia'],
-        'nds': [u'wiederleiden', u'weiterleitung'],
-        'nds-nl': [u'deurverwiezing', u'doorverwijzing'],
-        'new': [u'पुनर्निर्देश'],
-        'nl': [u'doorverwijzing'],
-        'nn': [u'omdiriger'],
-        'no': [u'omdirigering'],
-        'oc': [u'redireccion'],
-        'os': [u'рарвыст', u'перенаправление', u'перенапр'],
-        'pdc': [u'weiterleitung'],
-        'pl': [u'patrz', u'przekieruj', u'tam'],
-        'pms': [u'rinvia', u'rinvio'],
-        'pt': [u'redirecionamento'],
-        'pt-br': [u'redirecionamento'],
-        'qu': [u'pusapuna', u'redirección'],
-        'rmy': [u'redirecteaza'],
-        'ro': [u'redirecteaza'],
-        'ru': [u'перенаправление', u'перенапр'],
-        'sa': [u'पुनर्निदेशन'],
-        'sah': [u'перенаправление', u'перенапр'],
-        'scn': [u'rinvia', u'rinvio'],
-        'sd': [u'چوريو'],
-        'sg': [u'redirection'],
-        'shi': [u'تحويل'],
-        'si': [u'යළියොමුව'],
-        'sk': [u'presmeruj'],
-        'sl': [u'preusmeritev'],
-        'sli': [u'weiterleitung'],
-        'sq': [u'ridrejto'],
-        'sr': [u'преусмери', u'преусмери'],
-        'sr-ec': [u'преусмери'],
-        'sr-el': [u'preusmeri'],
-        'srn': [u'stir', u'doorverwijzing'],
-        'stq': [u'weiterleitung'],
-        'su': [u'alih'],
-        'sv': [u'omdirigering'],
-        'szl': [u'patrz', u'przekieruj', u'tam'],
-        'ta': [u'வழிமாற்று'],
-        'te': [u'దారిమార్పు'],
-        'th': [u'เปลี่ยนทาง'],
-        'tr': [u'yönlendirme'],
-        'tt': [u'yünältü'],
-        'tt-latn': [u'yünältü'],
-        'tt-cyrl': [u'перенаправление', u'перенапр'],
-        'ty': [u'redirection'],
-        'udm': [u'перенаправление', u'перенапр'],
-        'uk': [u'перенаправлення', u'перенаправление', u'перенапр'],
-        'vec': [u'rinvia', u'rinvio'],
-        'vep': [u'suuna'],
-        'vi': [u'đổi', u'đổi'],
-        'vls': [u'doorverwijzing'],
-        'vro': [u'saadaq', u'suuna'],
-        'wa': [u'redirection'],
-        'wo': [u'redirection'],
-        'yi': [u'ווייטערפירן', u'הפניה'],
-        'zea': [u'doorverwijzing']
-    }
-
     # So can be pagename code
     pagename = {
         'bg': [u'СТРАНИЦА'],

Modified: trunk/pywikipedia/pywikibot/__init__.py
===================================================================
--- trunk/pywikipedia/pywikibot/__init__.py     2010-02-08 15:37:02 UTC (rev 7919)
+++ trunk/pywikipedia/pywikibot/__init__.py     2010-02-09 14:34:01 UTC (rev 7920)
@@ -16,7 +16,9 @@
 
 import wikipedia
 
+link_regex = re.compile(r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]')
 
+
 def showDiff(oldtext, newtext):
     """
     Output a string showing the differences between oldtext and newtext.

Modified: trunk/pywikipedia/pywikibot/textlib.py
===================================================================
--- trunk/pywikipedia/pywikibot/textlib.py      2010-02-08 15:37:02 UTC (rev 7919)
+++ trunk/pywikipedia/pywikibot/textlib.py      2010-02-09 14:34:01 UTC (rev 7920)
@@ -198,14 +198,19 @@
     'parts' parameter, which defaults to all.
     """
     regexes = {
-            'comments' :   r'<!--.*?-->',
-            'includeonly': r'<includeonly>.*?</includeonly>',
-            'nowiki':      r'<nowiki>.*?</nowiki>',
-            'pre':         r'<pre>.*?</pre>',
-            'source':      r'<source .*?</source>',
+            'comments' :       r'<!--.*?-->',
+            'includeonly':     r'<includeonly>.*?</includeonly>',
+            'nowiki':          r'<nowiki>.*?</nowiki>',
+            'pre':             r'<pre>.*?</pre>',
+            'source':          r'<source .*?</source>',
+            'syntaxhighlight': r'<syntaxhighlight .*?</syntaxhighlight>',
     }
     if '*' in tags:
         tags = regexes.keys()
+    # add alias
+    tags = set(tags)
+    if 'source' in tags:
+        tags.add('syntaxhighlight')
     toRemoveR = re.compile('|'.join([regexes[tag] for tag in tags]),
                            re.IGNORECASE | re.DOTALL)
     return toRemoveR.sub('', text)
@@ -254,9 +259,9 @@
         marker = text[firstinseparator:firstinmarker] + marker
     return marker
 
-
+#-------------------------------------------------
 # Functions dealing with interwiki language links
-
+#-------------------------------------------------
 # Note - MediaWiki supports two kinds of interwiki links; interlanguage and
 #        interproject.  These functions only deal with links to a
 #        corresponding page in another language on the same project (e.g.,
@@ -302,8 +307,8 @@
             site = insite.getSite(code = lang)
             try:
                 result[site] = pywikibot.Page(site, pagetitle, insite = insite)
-            except InvalidTitle:
-                output(
+            except pywikibot.InvalidTitle:
+                pywikibot.output(
         u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
                            % (lang, pagetitle))
                 continue
@@ -486,8 +491,9 @@
         sites = insite.interwiki_putfirst_doubled(sites) + sites
     return sites
 
-
+#---------------------------------------
 # Functions dealing with category links
+#---------------------------------------
 
 def getCategoryLinks(text, site):
     import catlib
@@ -665,6 +671,9 @@
     #catLinks.sort()
     return sep.join(catLinks) + '\r\n'
 
+#---------------------------------------
+# Functions dealing with external links
+#---------------------------------------
 
 def compileLinkR(withoutBracketed=False, onlyBracketed=False):
     """Return a regex that matches external links."""
@@ -695,6 +704,9 @@
     linkR = re.compile(regex)
     return linkR
 
+#----------------------------------
+# Functions dealing with templates
+#----------------------------------
 
 def extract_templates_and_params(text, get_redirect=False):
     """Return list of template calls found in text.
@@ -805,7 +817,9 @@
             result.append((name, params))
     return result
 
+#----------------
 # I18N functions
+#----------------
 
 # Languages to use for comment text after the actual language but before
 # en:. For example, if for language 'xx', you want the preference of

Modified: trunk/pywikipedia/solve_disambiguation.py
===================================================================
--- trunk/pywikipedia/solve_disambiguation.py   2010-02-08 15:37:02 UTC (rev 7919)
+++ trunk/pywikipedia/solve_disambiguation.py   2010-02-09 14:34:01 UTC (rev 7920)
@@ -266,12 +266,12 @@
             u'Benutzer:SrbBot.*',
             u'Benutzer:PortalBot/.+',
             u'Benutzer:Xqbot/.+',
-            u'Benutzer Diskussion:.+',
             u'Lehnwort',
             u'Liste griechischer Wortstämme in deutschen Fremdwörtern',
             u'Liste von Gräzismen',
             u'Portal:Abkürzungen/.+',
             u'Portal:Astronomie/Moves',
+            u'Portal:Astronomie/Index/.+',
             u'Wikipedia:Administratoren/Anfragen',
             u'Wikipedia:Archiv/.+',
             u'Wikipedia:Artikelwünsche/Ding-Liste/[A-Z]',

Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py      2010-02-08 15:37:02 UTC (rev 7919)
+++ trunk/pywikipedia/wikipedia.py      2010-02-09 14:34:01 UTC (rev 7920)
@@ -7437,10 +7437,12 @@
         special redirect tag.
 
         """
-        if default:
-            return self.family.redirect.get(self.lang, [u"REDIRECT"])[0]
-        else:
-            return self.family.redirect.get(self.lang, None)
+        tag = self.siteinfo('magicwords').get('redirect')[0][1:]
+        if tag:
+            # remove first "#" letter
+            return tag[0][1:]
+        elif default:
+            return u'REDIRECT'
 
     def redirectRegex(self):
         """Return a compiled regular expression matching on redirect pages.
@@ -7448,24 +7450,23 @@
         Group 1 in the regex match object will be the target title.
 
         """
-
+        #NOTE: this is needed, since the API can give false positives!
+        default = 'REDIRECT'
         try:
-            redirKeywords = [u'redirect'] + self.family.redirect[self.lang]
-            redirKeywordsR = r'(?:' + '|'.join(redirKeywords) + ')'
+            keywords = self.siteinfo('magicwords')['redirect']
+            pattern = r'(?:' + '|'.join(keywords) + ')'
         except KeyError:
             # no localized keyword for redirects
-            redirKeywordsR = r'redirect'
-
-        # A redirect starts with hash (#), followed by a keyword, then
-        # arbitrary stuff, then a wikilink. The wikilink may contain
-        # a label, although this is not useful.
-
+            pattern = r'#%s' % default
         if self.versionnumber() > 12:
             # in MW 1.13 (at least) a redirect directive can follow whitespace
             prefix = r'\s*'
         else:
             prefix = r'[\r\n]*'
-        return re.compile(prefix + '#' + redirKeywordsR
+        # A redirect starts with hash (#), followed by a keyword, then
+        # arbitrary stuff, then a wikilink. The wikilink may contain
+        # a label, although this is not useful.
+        return re.compile(prefix + pattern
                                  + '\s*:?\s*\[\[(.+?)(?:\|.*?)?\]\]',
                           re.IGNORECASE | re.UNICODE | re.DOTALL)
 



_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn

Reply via email to