Revision: 7350
Author: siebrand
Date: 2009-10-02 09:34:45 +0000 (Fri, 02 Oct 2009)
Log Message:
-----------
additional features copied from fixes.py
Patch by xqt
Modified Paths:
--------------
trunk/pywikipedia/cosmetic_changes.py
Modified: trunk/pywikipedia/cosmetic_changes.py
===================================================================
--- trunk/pywikipedia/cosmetic_changes.py 2009-10-02 09:34:16 UTC (rev 7349)
+++ trunk/pywikipedia/cosmetic_changes.py 2009-10-02 09:34:45 UTC (rev 7350)
@@ -188,6 +188,8 @@
text = self.validXhtml(text)
text = self.removeUselessSpaces(text)
text = self.removeNonBreakingSpaceBeforePercent(text)
+ text = self.fixSyntaxSave(text)
+ text = self.fixHtml(text)
try:
text = isbn.hyphenateIsbnNumbers(text)
except isbn.InvalidIsbnException, error:
@@ -424,6 +426,39 @@
text = wikipedia.replaceExcept(text, r'\{\{([mM][sS][gG]:)?' +
template + '(?P<parameters>\|[^}]+|)}}', '', ['comment', 'math', 'nowiki',
'pre'])
return text
+ #from fixes.py
+ def fixSyntaxSave(self, text):
+ exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
'startspace']
+ # external link in double brackets
+ text = wikipedia.replaceExcept(text,
r'\[\[(?P<url>https?://[^\]]+?)\]\]', r'[\g<url>]', exceptions)
+ # external link starting with double bracket
+ text = wikipedia.replaceExcept(text, r'\[\[(?P<url>https?://.+?)\]',
r'[\g<url>]', exceptions)
+ # external link and description separated by a pipe, with
+ # whitespace in front of the pipe, so that it is clear that
+ # the pipe is not a legitimate part of the URL.
+ text = wikipedia.replaceExcept(text, r'\[(?P<url>https?://[^\|\]
\r\n]+?) +\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]', exceptions)
+ # pipe in external link, where the correct end of the URL can
+ # be detected from the file extension. It is very unlikely that
+ # this will cause mistakes.
+ text = wikipedia.replaceExcept(text, r'\[(?P<url>https?://[^\|\]
]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]',
r'[\g<url> \g<label>]', exceptions)
+ return text
+
+ def fixHtml(self, text):
+ # Everything case-insensitive (?i)
+ # Keep in mind that MediaWiki automatically converts <br> to <br />
+ exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
'startspace']
+ text = wikipedia.replaceExcept(text, r'(?i)<b>(.*?)</b>', r"'''\1'''"
, exceptions)
+ text = wikipedia.replaceExcept(text, r'(?i)<strong>(.*?)</strong>',
r"'''\1'''" , exceptions)
+ text = wikipedia.replaceExcept(text, r'(?i)<i>(.*?)</i>', r"''\1''" ,
exceptions)
+ text = wikipedia.replaceExcept(text, r'(?i)<em>(.*?)</em>', r"''\1''"
, exceptions)
+ # horizontal line without attributes in a single line
+ text = wikipedia.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])',
r'\1----\2', exceptions)
+ # horizontal line with attributes; can't be done with wiki syntax
+ # so we only make it XHTML compliant
+ text = wikipedia.replaceExcept(text, r'(?i)<hr ([^>/]+?)>', r'<hr \1
/>', exceptions)
+ # TODO: maybe we can make the bot replace <p> tags with \r\n's.
+ return text
+
class CosmeticChangesBot:
def __init__(self, generator, acceptall = False):
self.generator = generator
_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn