John Vandenberg has uploaded a new change for review.
https://gerrit.wikimedia.org/r/190143
Change subject: Improve template regex
......................................................................
Improve template regex
Change-Id: Iab46aaafac3c1367a68c900bcb33217cf67f1126
---
M pywikibot/textlib.py
M tests/textlib_tests.py
2 files changed, 40 insertions(+), 2 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/43/190143/1
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 145ea5f..3f17b1f 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -31,8 +31,17 @@
from pywikibot.family import Family
from pywikibot.tools import OrderedDict
-TEMP_REGEX = re.compile(
-    '{{(?:msg:)?(?P<name>[^{\|]+?)(?:\|(?P<params>[^{]+?(?:{[^{]+?}[^{]*?)?))?}}')
+TEMPLATE_REGEX = re.compile(r"""
+{{\s*(?:msg:)?
+ (?P<name>[^{\|]+?)\s*
+ (?:\|(?P<params>[^{]+?
+ (?:(\{\{\{\w+\}\}\}|{[^{]+?})[^{]*?)*
+ )
+ )?
+}}
+""", re.VERBOSE)
+TEMP_REGEX = TEMPLATE_REGEX # deprecated
+
NON_LATIN_DIGITS = {
'ckb': u'٠١٢٣٤٥٦٧٨٩',
'fa': u'۰۱۲۳۴۵۶۷۸۹',
@@ -1014,6 +1023,30 @@
return result
+def extract_one_template_and_params_regex(text):
+    """
+    Extract one template with params using a regex.
+    @param text: wikitext that starts with the template to extract
+    @type text: unicode or string
+    @return: template name and params (unnamed ones keyed '1', '2', ...)
+    @rtype: tuple of name and OrderedDict of params
+    @raise ValueError: text does not start with a template
+    """
+    match = TEMPLATE_REGEX.match(text)
+    if match is None:
+        raise ValueError('text does not start with a template')
+    name, raw_params = match.group('name'), match.group('params')
+    params, unnamed = OrderedDict(), 0
+    # raw_params is None when the template has no params
+    for value in (raw_params.split('|') if raw_params else []):
+        key, sep, rest = value.partition('=')
+        if not sep:
+            unnamed += 1
+            key, rest = str(unnamed), value
+        params[key] = rest
+    return name, params
+
+
def extract_templates_and_params_regex(text):
"""
Extract templates with params using a regex.
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index 903c34f..2836ffe 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -221,6 +221,7 @@
self.assertEqual(func('{{a|b={{{1}}}}}'), [('a', OrderedDict((('b',
'{{{1}}}'), )))])
self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'),
[('a', OrderedDict((('b',
'<noinclude>{{{1}}}</noinclude>'), )))])
+ self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'), [('a',
OrderedDict((('b', '{{{1}}}'), ('c', '{{{2}}}'))))])
self.assertEqual(func('{{subst:a|b=c}}'), [('subst:a',
OrderedDict((('b', 'c'), )))])
self.assertEqual(func('{{safesubst:a|b=c}}'), [('safesubst:a',
OrderedDict((('b', 'c'), )))])
self.assertEqual(func('{{msgnw:a|b=c}}'), [('msgnw:a',
OrderedDict((('b', 'c'), )))])
@@ -252,6 +253,10 @@
self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'), [('a',
OrderedDict((('b', '<!--{{{1}}}-->'), )))])
+ def test_template_regex(self):
+ func = lambda x: [textlib.extract_one_template_and_params_regex(x)]
+ self._extract_templates_params(func)
+
def test_extract_templates_params_regex(self):
func = textlib.extract_templates_and_params_regex
self._extract_templates_params(func)
--
To view, visit https://gerrit.wikimedia.org/r/190143
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Iab46aaafac3c1367a68c900bcb33217cf67f1126
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits