textlib.py

xqt Thu, 04 Apr 2013 00:29:36 -0700

http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11335


Revision: 11335
Author:   xqt
Date:     2013-04-04 07:29:29 +0000 (Thu, 04 Apr 2013)
Log Message:
-----------
update from trunk r11333

Modified Paths:
--------------
    branches/rewrite/pywikibot/textlib.py

Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py       2013-04-04 06:30:35 UTC (rev 
11334)
+++ branches/rewrite/pywikibot/textlib.py       2013-04-04 07:29:29 UTC (rev 
11335)
@@ -19,6 +19,7 @@
 from HTMLParser import HTMLParser
 import config2 as config
 
+TEMP_REGEX = re.compile('{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
 
 def unescape(s):
     """Replace escaped HTML-special characters by their originals"""
@@ -75,14 +76,6 @@
         # source code readability.
         # TODO: handle nested tables.
         'table':        re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
-        # templates with parameters often have whitespace that is used to
-        # improve wiki source code readability.
-        # 'template':    re.compile(r'(?s){{.*?}}'),
-        # The regex above fails on nested templates. This regex can handle
-        # templates cascaded up to level 2, but no deeper. For arbitrary
-        # depth, we'd need recursion which can't be done in Python's re.
-        # After all, the language of correct parenthesis words is not regular.
-        'template':     re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
         'hyperlink':    compileLinkR(),
         'gallery':      re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
         # this matches internal wikilinks, but also interwiki, categories, and
@@ -107,12 +100,15 @@
             old = re.compile(old)
 
     dontTouchRegexes = []
+    except_templates = False
     for exc in exceptions:
         if isinstance(exc, basestring):
             # assume it's a reference to the exceptionRegexes dictionary
             # defined above.
             if exc in exceptionRegexes:
                 dontTouchRegexes.append(exceptionRegexes[exc])
+            elif exc == 'template':
+                except_templates = True
             else:
                 # nowiki, noinclude, includeonly, timeline, math ond other
                 # extensions
@@ -125,6 +121,35 @@
         else:
             # assume it's a regular expression
             dontTouchRegexes.append(exc)
+
+    # mark templates
+    # don't care about mw variables and parser functions
+    if except_templates:
+        marker1 = findmarker(text)
+        marker2 = findmarker(text, u'##', u'#')
+        Rvalue = re.compile('{{{.+?}}}')
+        Rmarker1 = re.compile('%(mark)s(\d+)%(mark)s' % {'mark': marker1})
+        Rmarker2 = re.compile('%(mark)s(\d+)%(mark)s' % {'mark': marker2})
+        values = {}
+        count = 0
+        for m in Rvalue.finditer(text):
+            count += 1
+            item = m.group()
+            text = text.replace(item, '%s%d%s' % (marker2, count, marker2))
+            values[count] = item
+        inside = {}
+        count = 0
+        while TEMP_REGEX.search(text) is not None:
+            for m in TEMP_REGEX.finditer(text):
+                count += 1
+                item = m.group()
+                text = text.replace(item, '%s%d%s' % (marker1, count, marker1))
+
+                for m2 in Rmarker1.finditer(item):
+                    item = item.replace(m2.group(), inside[int(m2.group(1))])
+                for m2 in Rmarker2.finditer(item):
+                    item = item.replace(m2.group(), values[int(m2.group(1))])
+                inside[count] = item
     index = 0
     markerpos = len(text)
     while True:
@@ -194,6 +219,12 @@
                 index = match.start() + len(replacement)
             markerpos = match.start() + len(replacement)
     text = text[:markerpos] + marker + text[markerpos:]
+
+    if except_templates:  # restore templates from dict
+        for m2 in Rmarker1.finditer(text):
+            text = text.replace(m2.group(), inside[int(m2.group(1))])
+        for m2 in Rmarker2.finditer(text):
+            text = text.replace(m2.group(), values[int(m2.group(1))])
     return text
 
 
@@ -831,7 +862,7 @@
 #----------------------------------
 
 def extract_templates_and_params(text):
-    """Return list of template calls found in text.
+    """Return a list of templates found in text.
 
     Return value is a list of tuples. There is one tuple for each use of a
     template in the page, with the template title as the first entry and a
@@ -840,6 +871,8 @@
     with an integer value corresponding to its position among the unnnamed
     parameters, and if this results multiple parameters with the same name
     only the last value provided will be returned.
+    @param text: The wikitext from which templates are extracted
+    @type text: unicode or string
 
     """
     # remove commented-out stuff etc.
@@ -858,8 +891,6 @@
     marker4 = findmarker(thistxt, u'§§', u'§')
 
     result = []
-    Rtemplate = re.compile(
-        ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
     Rmath = re.compile(ur'<math>[^<]+</math>')
     Rvalue = re.compile(r'{{{.+?}}}')
     Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker))
@@ -886,8 +917,8 @@
 
     inside = {}
     count = 0
-    while Rtemplate.search(thistxt) is not None:
-        for m in Rtemplate.finditer(thistxt):
+    while TEMP_REGEX.search(thistxt) is not None:
+        for m in TEMP_REGEX.finditer(thistxt):
             # Make sure it is not detected again
             count += 1
             text = m.group()
@@ -909,6 +940,35 @@
                 # Doesn't detect templates whose name changes,
                 # or templates whose name contains math tags
                 continue
+
+            # {{#if: }}
+            if name.startswith('#'):
+                continue
+
+## TODO: merged from wikipedia.py - implement the following
+##            if self.site().isInterwikiLink(name):
+##                continue
+##            # {{DEFAULTSORT:...}}
+##            defaultKeys = self.site().versionnumber() > 13 and \
+##                          self.site().getmagicwords('defaultsort')
+##            # It seems some wikis does not have this magic key
+##            if defaultKeys:
+##                found = False
+##                for key in defaultKeys:
+##                    if name.startswith(key):
+##                        found = True
+##                        break
+##                if found: continue
+##
+##            try:
+##                name = Page(self.site(), name).title()
+##            except InvalidTitle:
+##                if name:
+##                    output(
+##                        u"Page %s contains invalid template name {{%s}}."
+##                       % (self.title(), name.strip()))
+##                continue
+
             # Parameters
             paramString = m.group('params')
             params = {}


_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn

[Pywikipedia-svn] SVN: [11335] branches/rewrite/pywikibot/textlib.py

Reply via email to