Revision: 7350
Author:   siebrand
Date:     2009-10-02 09:34:45 +0000 (Fri, 02 Oct 2009)

Log Message:
-----------
Add fixSyntaxSave and fixHtml cleanups to cosmetic_changes.py, copied from fixes.py

Patch by xqt

Modified Paths:
--------------
    trunk/pywikipedia/cosmetic_changes.py

Modified: trunk/pywikipedia/cosmetic_changes.py
===================================================================
--- trunk/pywikipedia/cosmetic_changes.py       2009-10-02 09:34:16 UTC (rev 7349)
+++ trunk/pywikipedia/cosmetic_changes.py       2009-10-02 09:34:45 UTC (rev 7350)
@@ -188,6 +188,8 @@
         text = self.validXhtml(text)
         text = self.removeUselessSpaces(text)
         text = self.removeNonBreakingSpaceBeforePercent(text)
+        text = self.fixSyntaxSave(text)
+        text = self.fixHtml(text)
         try:
             text = isbn.hyphenateIsbnNumbers(text)
         except isbn.InvalidIsbnException, error:
@@ -424,6 +426,39 @@
                 text = wikipedia.replaceExcept(text, r'\{\{([mM][sS][gG]:)?' + template + '(?P<parameters>\|[^}]+|)}}', '', ['comment', 'math', 'nowiki', 'pre'])
         return text
 
+    #from fixes.py
+    def fixSyntaxSave(self, text):
+        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
+        # external link in double brackets
+        text = wikipedia.replaceExcept(text, r'\[\[(?P<url>https?://[^\]]+?)\]\]', r'[\g<url>]', exceptions)
+        # external link starting with double bracket
+        text = wikipedia.replaceExcept(text, r'\[\[(?P<url>https?://.+?)\]', r'[\g<url>]', exceptions)
+        # external link and description separated by a dash, with
+        # whitespace in front of the dash, so that it is clear that
+        # the dash is not a legitimate part of the URL.
+        text = wikipedia.replaceExcept(text, r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]', exceptions)
+        # dash in external link, where the correct end of the URL can
+        # be detected from the file extension. It is very unlikely that
+        # this will cause mistakes.
+        text = wikipedia.replaceExcept(text, r'\[(?P<url>https?://[^\|\] ]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]', exceptions)
+        return text
+
+    def fixHtml(self, text):
+        # Everything case-insensitive (?i)
+        # Keep in mind that MediaWiki automatically converts <br> to <br />
+        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
+        text = wikipedia.replaceExcept(text, r'(?i)<b>(.*?)</b>', r"'''\1'''" , exceptions)
+        text = wikipedia.replaceExcept(text, r'(?i)<strong>(.*?)</strong>', r"'''\1'''" , exceptions)
+        text = wikipedia.replaceExcept(text, r'(?i)<i>(.*?)</i>', r"''\1''" , exceptions)
+        text = wikipedia.replaceExcept(text, r'(?i)<em>(.*?)</em>', r"''\1''" , exceptions)
+        # horizontal line without attributes in a single line
+        text = wikipedia.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])', r'\1----\2', exceptions)
+        # horizontal line with attributes; can't be done with wiki syntax
+        # so we only make it XHTML compliant
+        text = wikipedia.replaceExcept(text, r'(?i)<hr ([^>/]+?)>', r'<hr \1 />', exceptions)
+        # TODO: maybe we can make the bot replace <p> tags with \r\n's.
+        return text
+
 class CosmeticChangesBot:
     def __init__(self, generator, acceptall = False):
         self.generator = generator



_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn

Reply via email to