John Vandenberg has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/190143

Change subject: Improve template regex
......................................................................

Improve template regex

Change-Id: Iab46aaafac3c1367a68c900bcb33217cf67f1126
---
M pywikibot/textlib.py
M tests/textlib_tests.py
2 files changed, 40 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/43/190143/1

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 145ea5f..3f17b1f 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -31,8 +31,17 @@
 from pywikibot.family import Family
 from pywikibot.tools import OrderedDict
 
-TEMP_REGEX = re.compile(
-    '{{(?:msg:)?(?P<name>[^{\|]+?)(?:\|(?P<params>[^{]+?(?:{[^{]+?}[^{]*?)?))?}}')
+TEMPLATE_REGEX = re.compile(r"""
+{{\s*(?:msg:)?
+  (?P<name>[^{\|]+?)\s*
+  (?:\|(?P<params>[^{]+?
+        (?:(\{\{\{\w+\}\}\}|{[^{]+?})[^{]*?)*
+       )
+  )?
+}}
+""", re.VERBOSE)
+TEMP_REGEX = TEMPLATE_REGEX  # deprecated alias; use TEMPLATE_REGEX instead
+
 NON_LATIN_DIGITS = {
     'ckb': u'٠١٢٣٤٥٦٧٨٩',
     'fa': u'۰۱۲۳۴۵۶۷۸۹',
@@ -1014,6 +1023,30 @@
     return result
 
 
+def extract_one_template_and_params_regex(text):
+    """
+    Extract one template with params using a regex.
+
+    @param text: The wikitext, which must start with the template
+    @type text: unicode or string
+    @return: template name and params (unnamed params keyed '1', '2', ...)
+    @rtype: tuple of name and OrderedDict
+    @raise ValueError: text does not start with a template
+    """
+    match = TEMPLATE_REGEX.match(text)
+    if not match:
+        raise ValueError('text does not start with a template')
+    raw_params = match.group('params')
+    params, unnamed = OrderedDict(), 0
+    for value in raw_params.split('|') if raw_params else []:
+        key, sep, value = value.partition('=')
+        if not sep:  # no '=': positional parameter, keyed by 1-based index
+            unnamed += 1
+            key, value = str(unnamed), key
+        params[key] = value
+    return match.group('name'), params
+
+
 def extract_templates_and_params_regex(text):
     """
     Extract templates with params using a regex.
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index 903c34f..2836ffe 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -221,6 +221,7 @@
         self.assertEqual(func('{{a|b={{{1}}}}}'), [('a', OrderedDict((('b', '{{{1}}}'), )))])
         self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'),
                          [('a', OrderedDict((('b', '<noinclude>{{{1}}}</noinclude>'), )))])
+        self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'), [('a', OrderedDict((('b', '{{{1}}}'), ('c', '{{{2}}}'))))])
         self.assertEqual(func('{{subst:a|b=c}}'), [('subst:a', OrderedDict((('b', 'c'), )))])
         self.assertEqual(func('{{safesubst:a|b=c}}'), [('safesubst:a', OrderedDict((('b', 'c'), )))])
         self.assertEqual(func('{{msgnw:a|b=c}}'), [('msgnw:a', OrderedDict((('b', 'c'), )))])
@@ -252,6 +253,10 @@
 
         self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'), [('a', OrderedDict((('b', '<!--{{{1}}}-->'), )))])
 
+    def test_template_regex(self):
+        func = lambda x: [textlib.extract_one_template_and_params_regex(x)]
+        self._extract_templates_params(func)  # same checks as the multi-template test
+
     def test_extract_templates_params_regex(self):
         func = textlib.extract_templates_and_params_regex
         self._extract_templates_params(func)

-- 
To view, visit https://gerrit.wikimedia.org/r/190143
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iab46aaafac3c1367a68c900bcb33217cf67f1126
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to