Revision: 8576
Author:   xqt
Date:     2010-09-19 17:10:15 +0000 (Sun, 19 Sep 2010)

Log Message:
-----------
update cc from trunk

Modified Paths:
--------------
    branches/rewrite/scripts/cosmetic_changes.py

Modified: branches/rewrite/scripts/cosmetic_changes.py
===================================================================
--- branches/rewrite/scripts/cosmetic_changes.py        2010-09-19 16:42:57 UTC (rev 8575)
+++ branches/rewrite/scripts/cosmetic_changes.py        2010-09-19 17:10:15 UTC (rev 8576)
@@ -32,7 +32,8 @@
 all of them, but be careful if you do.
 """
 __version__ = '$Id$'
-import pywikibot, isbn
+import pywikibot
+import isbn
 from pywikibot import pagegenerators
 import sys
 import re
@@ -49,6 +50,7 @@
 
 # Summary message when using this module as a stand-alone script
 msg_standalone = {
+    'commons': u'Bot: [[Commons talk:Tools/pywiki file description 
cleanup|desc page fmt]]',
     'als':u'Bötli: chleineri Änderige',
     'ar': u'روبوت: تغييرات تجميلية',
     'be-x-old': u'Робат: касмэтычныя зьмены',
@@ -111,6 +113,7 @@
 # Summary message  that will be appended to the normal message when
 # cosmetic changes are made on the fly
 msg_append = {
+    'commons': u'; [[Commons talk:Tools/pywiki file description cleanup|desc 
page fmt]]',
     'als':u'; chleineri Änderige',
     'ar': u'; تغييرات تجميلية',
     'be-x-old': u'; касмэтычныя зьмены',
@@ -170,33 +173,97 @@
     'zh': u'; 細部更改',
 }
 
+nn_iw_msg = u'<!--interwiki (no, sv, da first; then other languages 
alphabetically by name)-->'
+
+# This is from interwiki.py;
+# move it to family file and implement global instances
+moved_links = {
+    'ca' : (u'ús de la plantilla', u'/ús'),
+    'cs' : (u'dokumentace',   u'/doc'),
+    'de' : (u'dokumentation', u'/Meta'),
+    'en' : ([u'documentation',
+             u'template documentation',
+             u'template doc',
+             u'doc',
+             u'documentation, template'], u'/doc'),
+    'es' : ([u'documentación', u'documentación de plantilla'], u'/doc'),
+    'fr' : (u'/documentation', u'/Documentation'),
+    'hu' : (u'sablondokumentáció', u'/doc'),
+    'id' : (u'template doc',  u'/doc'),
+    'ja' : (u'documentation', u'/doc'),
+    'ka' : (u'თარგის ინფო',   u'/ინფო'),
+    'ko' : (u'documentation', u'/설명문서'),
+    'ms' : (u'documentation', u'/doc'),
+    'pl' : (u'dokumentacja',  u'/opis'),
+    'pt' : ([u'documentação', u'/doc'],  u'/doc'),
+    'ro' : (u'documentaţie',  u'/doc'),
+    'ru' : (u'doc',           u'/doc'),
+    'sv' : (u'dokumentation', u'/dok'),
+    'vi' : (u'documentation', u'/doc'),
+    'zh' : ([u'documentation', u'doc'], u'/doc'),
+}
+
+# Template which should be replaced or removed.
+# Use a list with two entries. The first entry will be replaced by the second.
+# Examples:
+# For removing {{Foo}}, the list must be:
+#           (u'Foo', None),
+#
+# The following also works:
+#           (u'Foo', ''),
+#
+# For replacing {{Foo}} with {{Bar}} the list must be:
+#           (u'Foo', u'Bar'),
+#
+# This also removes all template parameters of {{Foo}}
+# For replacing {{Foo}} with {{Bar}} but keep the template
+# parameters in its original order, please use:
+#           (u'Foo', u'Bar\g<parameters>'),
+
+deprecatedTemplates = {
+    'wikipedia': {
+        'de': [
+            (u'Belege', u'Belege fehlen\g<parameters>'),
+            (u'Quelle', u'Belege fehlen\g<parameters>'),
+            (u'Quellen', u'Belege fehlen\g<parameters>'),
+            (u'Quellen fehlen', u'Belege fehlen\g<parameters>'),
+        ],
+    }
+}
+
 class CosmeticChangesToolkit:
-    def __init__(self, site, debug=False, redirect=False, namespace=None):
+    def __init__(self, site, debug=False, redirect=False, namespace=None, 
pageTitle=None):
         self.site = site
         self.debug = debug
         self.redirect = redirect
         self.namespace = namespace
         self.template = (self.namespace == 10)
         self.talkpage = self.namespace >= 0 and self.namespace % 2 == 1
+        self.title = pageTitle
 
     def change(self, text):
         """
         Given a wiki source code text, return the cleaned up version.
         """
         oldText = text
+        if self.site.sitename()== u'commons:commons' and self.namespace == 6:
+            text = self.commonsfiledesc(text)
         text = self.fixSelfInterwiki(text)
-        text = self.standardizeInterwiki(text)
-        text = self.standardizeCategories(text)
+        text = self.standardizePageFooter(text)
         text = self.cleanUpLinks(text)
         text = self.cleanUpSectionHeaders(text)
         text = self.putSpacesInLists(text)
         text = self.translateAndCapitalizeNamespaces(text)
+        text = self.replaceDeprecatedTemplates(text)
         text = self.resolveHtmlEntities(text)
         text = self.validXhtml(text)
         text = self.removeUselessSpaces(text)
         text = self.removeNonBreakingSpaceBeforePercent(text)
         text = self.fixSyntaxSave(text)
         text = self.fixHtml(text)
+        text = self.fixStyle(text)
+        text = self.fixTypo(text)
+        text = self.fixArabicLetters(text)
         try:
             text = isbn.hyphenateIsbnNumbers(text)
         except isbn.InvalidIsbnException, error:
@@ -210,21 +277,13 @@
         Interwiki links to the site itself are displayed like local links.
         Remove their language code prefix.
         """
-        interwikiR = re.compile(r'\[\[%s\s?:([^\[\]\n]*)\]\]' % self.site.lang)
-        text = interwikiR.sub(r'[[\1]]', text)
-        return text
-
-    def standardizeInterwiki(self, text):
-        """
-        Makes sure that interwiki links are put to the correct position and
-        into the right order.
-        """
         if not self.talkpage and pywikibot.calledModuleName() <> 'interwiki':
-            interwikiLinks = pywikibot.getLanguageLinks(text, insite = 
self.site)
-            text = pywikibot.replaceLanguageLinks(text, interwikiLinks, site = 
self.site, template = self.template)
+            interwikiR = re.compile(r'\[\[%s\s?:([^\[\]\n]*)\]\]' % 
self.site.lang)
+            text = interwikiR.sub(r'[[\1]]', text)
         return text
 
-    def standardizeCategories(self, text):
+
+    def standardizePageFooter(self, text):
         """
         Makes sure that categories are put to the correct position, but
         does not sort them.
@@ -256,7 +315,12 @@
                 continue
             namespaces = list(family.namespace(self.site.lang, nsNumber, all = 
True))
             thisNs = namespaces.pop(0)
-
+            if nsNumber == 6 and family.name == 'wikipedia' and \
+               self.site.lang in ('en', 'fr'):
+                # do not change "Image" on en-wiki and fr-wiki
+                for image in [u'Image', u'image']:
+                    if image in namespaces:
+                        namespaces.remove(image)
             # skip main (article) namespace
             if thisNs and namespaces:
                 text = pywikibot.replaceExcept(text, r'\[\[\s*(' + 
'|'.join(namespaces) + ') *:(?P<nameAndLabel>.*?)\]\]', r'[[' + thisNs + 
':\g<nameAndLabel>]]', exceptions)
@@ -435,10 +499,26 @@
         and French Wikipedia. It might be that it is not wanted on other wikis.
         If there are any complaints, please file a bug report.
         """
-        if not self.redirect:
-            text = pywikibot.replaceExcept(text, 
r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)', '\g<bullet> 
\g<char>', ['comment', 'math', 'nowiki', 'pre'])
+        exceptions = ['comment', 'math', 'nowiki', 'pre', 'source', 'timeline']
+        if not self.redirect and pywikibot.calledModuleName() <> 
'capitalize_redirects':
+            text = pywikibot.replaceExcept(text, 
r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)', '\g<bullet> 
\g<char>', exceptions)
         return text
 
+    def replaceDeprecatedTemplates(self, text):
+        exceptions = ['comment', 'math', 'nowiki', 'pre']
+        if self.site.family.name in deprecatedTemplates and self.site.lang in 
deprecatedTemplates[self.site.family.name]:
+            for template in 
deprecatedTemplates[self.site.family.name][self.site.lang]:
+                old = template[0]
+                new = template[1]
+                if new == None:
+                    new = ''
+                else:
+                    new = '{{'+new+'}}'
+                if not self.site.nocapitalize:
+                    old = '[' + old[0].upper() + old[0].lower() + ']' + old[1:]
+                text = pywikibot.replaceExcept(text, r'\{\{([mM][sS][gG]:)?' + 
old + '(?P<parameters>\|[^}]+|)}}', new, exceptions)
+        return text
+
     #from fixes.py
     def fixSyntaxSave(self, text):
         exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 
'startspace']
@@ -469,11 +549,131 @@
         # horizontal line with attributes; can't be done with wiki syntax
         # so we only make it XHTML compliant
         text = pywikibot.replaceExcept(text, r'(?i)<hr ([^>/]+?)>', r'<hr \1 
/>', exceptions)
+        # a header where only spaces are in the same line
+        for level in range(1, 7):
+            equals = '\\1%s \\2 %s\\3' % ("="*level, "="*level)
+            text = pywikibot.replaceExcept(text,
+                                           r'(?i)([\r\n]) *<h%d> *([^<]+?) 
*</h%d> *([\r\n])'%(level, level),
+                                           r'%s'%equals, exceptions)
+        #remove empty <ref/>-tag
+        text = pywikibot.replaceExcept(text, r'(?i)<ref\s*/>', r'', exceptions)
         # TODO: maybe we can make the bot replace <p> tags with \r\n's.
         return text
 
+    def fixStyle(self, text):
+        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 
'startspace']
+        # convert prettytable to wikitable class
+        if self.site.language in ('de', 'en'):
+           text = pywikibot.replaceExcept(text, 
ur'(class="[^"]*)prettytable([^"]*")', ur'\1wikitable\2', exceptions)
+        return text
+
+    def fixTypo(self, text):
+        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 
'startspace', 'gallery', 'hyperlink', 'interwiki', 'link']
+        # change <number> ccm -> <number> cm³
+        text = pywikibot.replaceExcept(text, ur'(\d)\s*&nbsp;ccm', 
ur'\1&nbsp;cm³', exceptions)
+        text = pywikibot.replaceExcept(text, ur'(\d)\s*ccm', ur'\1&nbsp;cm³', 
exceptions)
+        # Solve wrong Nº sign with °C or °F
+        # additional exception requested on fr-wiki for this stuff
+        pattern = re.compile(u'«.*?»', re.UNICODE)
+        exceptions.append(pattern)
+        text = pywikibot.replaceExcept(text, ur'(\d)\s*&nbsp;[º°]([CF])', 
ur'\1&nbsp;°\2', exceptions)
+        text = pywikibot.replaceExcept(text, ur'(\d)\s*[º°]([CF])', 
ur'\1&nbsp;°\2', exceptions)
+        text = pywikibot.replaceExcept(text, ur'º([CF])', ur'°\1', exceptions)
+        return text
+
+    def fixArabicLetters(self, text):
+        if self.site.lang=='ckb':
+            exceptions = [
+                'gallery',
+                'hyperlink',
+                'interwiki',
+                # but changes letters inside wikilinks
+                #'link',
+                'math',
+                'pre',
+                'template',
+                'timeline',
+                'ref',
+                'source',
+                'startspace',
+                'inputbox',
+            ]
+            # do not change inside file links
+            namespaces = list(self.site.namespace(6, all = True))
+            pattern = re.compile(u'\[\[(' + '|'.join(namespaces) + 
'):.+?\..+?\]\]', re.UNICODE)
+            exceptions.append(pattern)
+            text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
+            text = pywikibot.replaceExcept(text, ur'ه([.،_<\]\s])', ur'ە\1', 
exceptions)
+            text = pywikibot.replaceExcept(text, u'ه‌', u'ە', exceptions)
+            text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
+            text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
+            text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
+            # replace persian digits
+            for i in range(0,10):
+                text = pywikibot.replaceExcept(text, u'۰۱۲۳۴۵۶۷۸۹'[i], 
u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
+            # do not change digits in class, style and table params
+            pattern = re.compile(u'=".*?"', re.UNICODE)
+            exceptions.append(pattern)
+            # do not change digits inside html-tags
+            pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
+            exceptions.append(pattern)
+            for i in range(0,10):
+                text = pywikibot.replaceExcept(text, str(i), u'٠١٢٣٤٥٦٧٨٩'[i], 
exceptions)
+        return text
+
+    # Retrieved from "http://commons.wikimedia.org/wiki/Commons:Tools/pywiki_file_description_cleanup"
+    def commonsfiledesc(self, text):
+        # section headers to {{int:}} versions
+        exceptions = ['comment', 'includeonly', 'math', 'noinclude', 'nowiki',
+                      'pre', 'source', 'ref', 'timeline']
+        text = pywikibot.replaceExcept(text,
+                                       r"([\r\n]|^)\=\= *Summary *\=\=",
+                                       r"\1== {{int:filedesc}} ==",
+                                       exceptions, True)
+        text = pywikibot.replaceExcept(
+            text,
+            r"([\r\n])\=\= *\[\[Commons:Copyright tags\|Licensing\]\]: *\=\=",
+            r"\1== {{int:license}} ==", exceptions, True)
+        text = pywikibot.replaceExcept(
+            text,
+            r"([\r\n])\=\= *(Licensing|License 
information|{{int:license-header}}) *\=\=",
+            r"\1== {{int:license}} ==", exceptions, True)
+ 
+        # frequent field values to {{int:}} versions
+        text = pywikibot.replaceExcept(
+            text,
+            r'([\r\n]\|[Ss]ource *\= *)(?:[Oo]wn work by uploader|[Oo]wn 
work|[Ee]igene [Aa]rbeit) *([\r\n])',
+            r'\1{{own}}\2', exceptions, True)
+        text = pywikibot.replaceExcept(
+            text,
+            r'(\| *Permission *\=) *(?:[Ss]ee below|[Ss]iehe unten) *([\r\n])',
+            r'\1\2', exceptions, True)
+ 
+        # added to transwikied pages
+        text = pywikibot.replaceExcept(text, r'__NOTOC__', '', exceptions, 
True)
+ 
+        # tracker element for js upload form
+        text = pywikibot.replaceExcept(
+            text,
+            r'<!-- *{{ImageUpload\|(?:full|basic)}} *-->',
+            '', exceptions[1:], True)
+        text = pywikibot.replaceExcept(text, 
r'{{ImageUpload\|(?:basic|full)}}',
+                                       '', exceptions, True)
+ 
+        # duplicated section headers
+        text = pywikibot.replaceExcept(
+            text,
+            r'([\r\n]|^)\=\= *{{int:filedesc}} *\=\=(?:[\r\n ]*)\=\= 
*{{int:filedesc}} *\=\=',
+            r'\1== {{int:filedesc}} ==', exceptions, True)
+        text = pywikibot.replaceExcept(
+            text,
+            r'([\r\n]|^)\=\= *{{int:license}} *\=\=(?:[\r\n ]*)\=\= 
*{{int:license}} *\=\=',
+            r'\1== {{int:license}} ==', exceptions, True)
+        return text
+
 class CosmeticChangesBot:
-    def __init__(self, generator, acceptall = False, comment=u'Robot: Cosmetic 
changes'):
+    def __init__(self, generator, acceptall = False,
+                 comment=u'Robot: Cosmetic changes'):
         self.generator = generator
         self.acceptall = acceptall
         self.comment = comment
@@ -483,13 +683,17 @@
         try:
             # Show the title of the page we're working on.
             # Highlight the title in purple.
-            pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % 
page.title())
-            ccToolkit = CosmeticChangesToolkit(page.site, debug = True, 
namespace = page.namespace())
+            pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+                             % page.title())
+            ccToolkit = CosmeticChangesToolkit(page.site, debug=True,
+                                               namespace=page.namespace(),
+                                               pageTitle=page.title())
             changedText = ccToolkit.change(page.get())
             if changedText.strip() != page.get().strip():
                 if not self.acceptall:
-                    choice = pywikibot.inputChoice(u'Do you want to accept 
these changes?',
-                                                   ['Yes', 'No', 'All', 
'Quit'], ['y', 'N', 'a', 'q'], 'N')
+                    choice = pywikibot.inputChoice(
+                        u'Do you want to accept these changes?',
+                        ['Yes', 'No', 'All', 'Quit'], ['y', 'N', 'a', 'q'], 
'N')
                     if choice == 'a':
                         self.acceptall = True
                     elif choice == 'q':
@@ -498,15 +702,19 @@
                 if self.acceptall or choice == 'y':
                     page.put(changedText, comment=self.comment)
             else:
-                pywikibot.output('No changes were necessary in %s' % 
page.title())
+                pywikibot.output('No changes were necessary in %s'
+                                 % page.title())
         except pywikibot.NoPage:
-            pywikibot.output("Page %s does not exist?!" % page.aslink())
+            pywikibot.output("Page %s does not exist?!"
+                             % page.title(asLink=True))
         except pywikibot.IsRedirectPage:
-            pywikibot.output("Page %s is a redirect; skipping." % 
page.aslink())
+            pywikibot.output("Page %s is a redirect; skipping."
+                             % page.title(asLink=True))
         except pywikibot.LockedPage:
-            pywikibot.output("Page %s is locked?!" % page.aslink())
+            pywikibot.output("Page %s is locked?!" % page.title(asLink=True))
         except pywikibot.EditConflict:
-            pywikibot.output("An edit conflict has occured at %s." % 
page.aslink())
+            pywikibot.output("An edit conflict has occured at %s."
+                             % page.title(asLink=True))
 
     def run(self):
         try:
@@ -540,16 +748,6 @@
     if editSummary == '':
         # Load default summary message.
         editSummary = pywikibot.translate(pywikibot.getSite(), msg_standalone)
-
-    # Disabled this check. Although the point is still valid, there
-    # is now a warning and a prompt (see below).
-    #if pywikibot.getSite() == pywikibot.getSite('nl','wikipedia'):
-        #print "Deze bot is op WikipediaNL niet gewenst."
-        #print "Het toevoegen van cosmetic changes bij andere wijzigingen is 
toegestaan,"
-        #print "maar cosmetic_changes als stand-alone bot niet."
-        #print "Zoek alstublieft een nuttig gebruik voor uw bot."
-        #sys.exit()
-
     if pageTitle:
         site = pywikibot.getSite()
         gen = iter([pywikibot.Page(pywikibot.Link(t, site)) for t in 
pageTitle])
@@ -558,11 +756,14 @@
     if not gen:
         pywikibot.showHelp()
     elif not always:
-        answer = pywikibot.inputChoice(warning + '\nDo you really want to 
continue?', ['yes', 'no'], ['y', 'N'], 'N')
+        answer = pywikibot.inputChoice(
+            warning + '\nDo you really want to continue?',
+            ['yes', 'no'], ['y', 'N'], 'N')
 
     if answer == 'y':
         preloadingGen = pagegenerators.PreloadingGenerator(gen)
-        bot = CosmeticChangesBot(preloadingGen, acceptall=always, 
comment=editSummary)
+        bot = CosmeticChangesBot(preloadingGen, acceptall=always,
+                                 comment=editSummary)
         bot.run()
 
 if __name__ == "__main__":



_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn

Reply via email to