jenkins-bot has submitted this change and it was merged.

Change subject: Introduce new tested template regex
......................................................................


Introduce new tested template regex

Provide another template regex that supports collecting nested
templates, with tests, and use it for replaceExcept.

Add new basic tests for the template regex used by
extract_templates_and_params_regex, along with minor fixes
to the regex.

Deprecate textlib.TEMP_REGEX and page.ip_regexp

Change-Id: Iab46aaafac3c1367a68c900bcb33217cf67f1126
---
M pywikibot/textlib.py
M pywikibot/tools/__init__.py
M pywikibot/tools/chars.py
M pywikibot/tools/ip.py
M tests/ipregex_tests.py
M tests/textlib_tests.py
6 files changed, 381 insertions(+), 114 deletions(-)
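
For a quick sense of the new behaviour, the simple extractor built on the
nested template regex roughly works as follows (a sketch mirroring the tests
added in tests/textlib_tests.py below; OrderedDict comes from pywikibot.tools,
as imported in textlib.py):

    from pywikibot import textlib
    from pywikibot.tools import OrderedDict

    # A nested template stays inside the parameter value of the outer
    # template instead of producing a separate entry of its own.
    assert (textlib.extract_templates_and_params_regex_simple('{{a|b={{c}}}}')
            == [('a', OrderedDict([('b', '{{c}}')]))])

    # Known limitation (also noted in the docstring): a '|' inside a nested
    # template splits the outer arguments incorrectly.
    assert (textlib.extract_templates_and_params_regex_simple('{{a|{{c|d}} }}')
            == [('a', OrderedDict([('1', '{{c'), ('2', 'd}} ')]))])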

Approvals:
  John Vandenberg: Looks good to me, but someone else must approve
  XZise: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 2fae512..6297846 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -28,18 +28,64 @@
 else:
     from HTMLParser import HTMLParser
 
+try:
+    import mwparserfromhell
+except ImportError as e:
+    mwparserfromhell = e
+
 import pywikibot
 
 from pywikibot import config2 as config
 from pywikibot.exceptions import InvalidTitle
 from pywikibot.family import Family
-from pywikibot.tools import OrderedDict
+from pywikibot.tools import OrderedDict, DeprecatedRegex
 
 # cache for replaceExcept to avoid recompile or regexes each call
 _regex_cache = {}
 
-TEMP_REGEX = re.compile(
-    r'{{(?:msg:)?(?P<name>[^{\|]+?)(?:\|(?P<params>[^{]+?(?:{[^{]+?}[^{]*?)?))?}}')
+# This regex is only for use by extract_templates_and_params_regex.
+# It does not support template variables consisting of nested templates,
+# system variables like {{CURRENTYEAR}}, or template variables like {{{1}}}.
+_ETP_REGEX = re.compile(
+    r'{{(?:msg:)?(?P<name>[^{\|]+?)'
+    r'(?:\|(?P<params>[^{]+?(?:{[^{]+?}[^{]*?)?)?)?}}')
+
+# This regex is a more inclusive template matching pattern that allows
+# system variables, but does not match nested templates.
+# It exists for backwards compatibility with the old 'TEMP_REGEX',
+# whose pattern is now _ETP_REGEX.
+TEMP_REGEX = DeprecatedRegex(r"""
+{{\s*(?:msg:)?\s*
+  (?P<name>[^{\|]+?)\s*
+  (?:\|(?P<params>[^{]*
+        (?:(?:{}|{{[A-Z]+(?:\:[^}])?}}|{{{[^}]+}}}) [^{]*)*
+       )?
+  )?
+}}
+""", re.VERBOSE, 'textlib.TEMP_REGEX', 'textlib.NESTED_TEMPLATE_REGEX')
+
+# The regex below collects nested templates, providing simpler
+# identification of templates used at the top level of wikitext.
+# It doesn't match {{{1|...}}}, nor does it match templates with a
+# numerical name, e.g. {{1|..}}. It will correctly match {{{x}} as
+# being {{x}} with the leading '{' left in the wikitext.
+# The prefix 'msg:' is not included in the 'name' group, but all other
+# prefixes are, for backwards compatibility with TEMP_REGEX.
+# Only parser functions using # are excluded.
+NESTED_TEMPLATE_REGEX = re.compile(r"""
+{{\s*(?:msg:)?\s*
+  (?P<name>[^{\|#0-9][^{\|#0-9]*?)\s*
+  (?:\|(?P<params>[^{]*
+          (({{{[^}]+}}}
+           |{{[^}|]+\|?[^}]*}}
+           |{}
+           ) [^{]*
+          )*
+       )?
+  )?
+}}
+""", re.VERBOSE)
+
 
 NON_LATIN_DIGITS = {
     'ckb': u'٠١٢٣٤٥٦٧٨٩',
@@ -95,6 +141,7 @@
         'source':       re.compile(r'(?is)<source .*?</source>'),
         # inline references
         'ref':          re.compile(r'(?ism)<ref[ >].*?</ref>'),
+        'template':     NESTED_TEMPLATE_REGEX,
         # lines that start with a space are shown in a monospace font and
         # have whitespace preserved.
         'startspace':   re.compile(r'(?m)^ (.*?)$'),
@@ -151,9 +198,6 @@
                     result.append(_regex_cache[(exc, site)])
                 else:
                     result.append(_regex_cache[exc])
-            elif exc == 'template':
-                # template is not supported by this method.
-                pass
             else:
                 # nowiki, noinclude, includeonly, timeline, math and other
                 # extensions
@@ -208,50 +252,6 @@
 
     dontTouchRegexes = _get_regexes(exceptions, site)
 
-    except_templates = 'template' in exceptions
-
-    # mark templates
-    # don't care about mw variables and parser functions
-    if except_templates:
-        marker1 = findmarker(text)
-        marker2 = findmarker(text, u'##', u'#')
-        Rvalue = re.compile('{{{.+?}}}')
-        Rmarker1 = re.compile(r'%(mark)s(\d+)%(mark)s' % {'mark': marker1})
-        Rmarker2 = re.compile(r'%(mark)s(\d+)%(mark)s' % {'mark': marker2})
-        # hide the flat template marker
-        dontTouchRegexes.append(Rmarker1)
-        origin = text
-        values = {}
-        count = 0
-        for m in Rvalue.finditer(text):
-            count += 1
-            # If we have digits between brackets, restoring from dict may fail.
-            # So we need to change the index. We have to search in the origin.
-            while u'}}}%d{{{' % count in origin:
-                count += 1
-            item = m.group()
-            text = text.replace(item, '%s%d%s' % (marker2, count, marker2))
-            values[count] = item
-        inside = {}
-        seen = set()
-        count = 0
-        while TEMP_REGEX.search(text) is not None:
-            for m in TEMP_REGEX.finditer(text):
-                item = m.group()
-                if item in seen:
-                    continue  # speed up
-                seen.add(item)
-                count += 1
-                while u'}}%d{{' % count in origin:
-                    count += 1
-                text = text.replace(item, '%s%d%s' % (marker1, count, marker1))
-
-                # Make sure stored templates don't contain markers
-                for m2 in Rmarker1.finditer(item):
-                    item = item.replace(m2.group(), inside[int(m2.group(1))])
-                for m2 in Rmarker2.finditer(item):
-                    item = item.replace(m2.group(), values[int(m2.group(1))])
-                inside[count] = item
     index = 0
     markerpos = len(text)
     while True:
@@ -330,12 +330,6 @@
                 index += 1
             markerpos = match.start() + len(replacement)
     text = text[:markerpos] + marker + text[markerpos:]
-
-    if except_templates:  # restore templates from dict
-        for m2 in Rmarker1.finditer(text):
-            text = text.replace(m2.group(), inside[int(m2.group(1))])
-        for m2 in Rmarker2.finditer(text):
-            text = text.replace(m2.group(), values[int(m2.group(1))])
     return text
 
 
@@ -1217,7 +1211,7 @@
 # Functions dealing with templates
 # --------------------------------
 
-def extract_templates_and_params(text):
+def extract_templates_and_params(text, remove_disabled_parts=None):
     """Return a list of templates found in text.
 
     Return value is a list of tuples. There is one tuple for each use of a
@@ -1242,20 +1236,28 @@
 
     @param text: The wikitext from which templates are extracted
     @type text: unicode or string
+    @param remove_disabled_parts: Remove disabled wikitext such as comments
+        and pre.  If None (default), this is enabled when mwparserfromhell
+        is not available or is disabled in the config, and disabled if
+        mwparserfromhell is present and enabled in the config.
+    @type remove_disabled_parts: bool or None
     @return: list of template name and params
     @rtype: list of tuple
     """
-    use_mwparserfromhell = config.use_mwparserfromhell
+    use_mwparserfromhell = (config.use_mwparserfromhell and
+                            not isinstance(mwparserfromhell, Exception))
+
     if use_mwparserfromhell:
-        try:
-            import mwparserfromhell  # noqa
-        except ImportError:
-            use_mwparserfromhell = False
+        if remove_disabled_parts is None:
+            remove_disabled_parts = False
+
+    if remove_disabled_parts:
+        text = removeDisabledParts(text)
 
     if use_mwparserfromhell:
         return extract_templates_and_params_mwpfh(text)
     else:
-        return extract_templates_and_params_regex(text)
+        return extract_templates_and_params_regex(text, False)
 
 
 def extract_templates_and_params_mwpfh(text):
@@ -1274,7 +1276,6 @@
     @return: list of template name and params
     @rtype: list of tuple
     """
-    import mwparserfromhell
     code = mwparserfromhell.parse(text)
     result = []
     for template in code.filter_templates(recursive=True):
@@ -1285,9 +1286,9 @@
     return result
 
 
-def extract_templates_and_params_regex(text):
+def extract_templates_and_params_regex(text, remove_disabled_parts=True):
     """
-    Extract templates with params using a regex.
+    Extract templates with params using a regex with additional processing.
 
     This function should not be called directly.
 
@@ -1301,7 +1302,10 @@
     @rtype: list of tuple
     """
     # remove commented-out stuff etc.
-    thistxt = removeDisabledParts(text)
+    if remove_disabled_parts:
+        thistxt = removeDisabledParts(text)
+    else:
+        thistxt = text
 
     # marker for inside templates or parameters
     marker1 = findmarker(thistxt)
@@ -1347,8 +1351,8 @@
     inside = {}
     seen = set()
     count = 0
-    while TEMP_REGEX.search(thistxt) is not None:
-        for m in TEMP_REGEX.finditer(thistxt):
+    while _ETP_REGEX.search(thistxt) is not None:
+        for m in _ETP_REGEX.finditer(thistxt):
             # Make sure it is not detected again
             item = m.group()
             if item in seen:
@@ -1445,8 +1449,54 @@
                                                       values[int(m2.group(1))])
                     params[param_name.strip()] = param_val.strip()
 
+            # Special case for {{a|}} which has an undetected parameter
+            if not params and '|' in m.group(0):
+                params = OrderedDict({'1': ''})
+
             # Add it to the result
             result.append((name, params))
+
+    return result
+
+
+def extract_templates_and_params_regex_simple(text):
+    """
+    Extract top-level templates with params using only a simple regex.
+
+    This function uses only a single regex, and returns
+    an entry for each template called at the top-level of the wikitext.
+    Nested templates are included in the argument values of the top-level
+    template.
+
+    This method will incorrectly split arguments when an
+    argument value contains a '|', such as {{template|a={{b|c}} }}.
+
+    @param text: The wikitext from which templates are extracted
+    @type text: unicode or string
+    @return: list of template name and params
+    @rtype: list of tuple of name and OrderedDict
+    """
+    result = []
+
+    for match in NESTED_TEMPLATE_REGEX.finditer(text):
+        name, params = match.group(1), match.group(2)
+
+        # Special case for {{a}}
+        if params is None:
+            params = []
+        else:
+            params = params.split('|')
+
+        numbered_param_identifiers = iter(range(1, len(params) + 1))
+
+        params = OrderedDict(
+            arg.split('=', 1)
+            if '=' in arg
+            else (str(next(numbered_param_identifiers)), arg)
+            for arg in params)
+
+        result.append((name, params))
+
     return result
 
 
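The practical difference between the two regexes added above shows up in the
test_regexes assertions below; a rough sketch based on those new tests:

    from pywikibot import textlib

    text = '{{a|b={{c}} }}'
    # The old-style regex cannot match the outer template here (.match is
    # None), while the nested regex matches it, inner template included;
    # this is what replaceExcept's 'template' exception now relies on.
    assert textlib._ETP_REGEX.match(text) is None
    assert textlib.NESTED_TEMPLATE_REGEX.match(text) is not None

    # Parser functions and numerically named templates are not matched.
    assert textlib.NESTED_TEMPLATE_REGEX.search('{{#if:foo}}') is None
    assert textlib.NESTED_TEMPLATE_REGEX.search('{{1}}') is None
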
diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py
index 8d9c4ff..1433820 100644
--- a/pywikibot/tools/__init__.py
+++ b/pywikibot/tools/__init__.py
@@ -229,18 +229,31 @@
 
 class LazyRegex(object):
 
-    """Regex object that compiles the regex on usage."""
+    """
+    Regex object that obtains and compiles the regex on usage.
 
-    def __init__(self):
-        """Constructor."""
-        self._raw = None
-        self._flags = None
-        self._compiled = None
+    Instances behave like the object created using L{re.compile}.
+    """
+
+    def __init__(self, pattern, flags=0):
+        """
+        Constructor.
+
+        @param pattern: L{re} regex pattern
+        @type pattern: str or callable
+        @param flags: L{re.compile} flags
+        @type flags: int
+        """
+        self.raw = pattern
+        self.flags = flags
         super(LazyRegex, self).__init__()
 
     @property
     def raw(self):
         """Get raw property."""
+        if callable(self._raw):
+            self._raw = self._raw()
+
         return self._raw
 
     @raw.setter
@@ -264,7 +277,7 @@
         """Compile the regex and delegate all attribute to the regex."""
         if self._raw:
             if not self._compiled:
-                self._compiled = re.compile(self._raw, self._flags)
+                self._compiled = re.compile(self.raw, self.flags)
 
             if hasattr(self._compiled, attr):
                 return getattr(self._compiled, attr)
@@ -275,6 +288,34 @@
             raise AttributeError('%s.raw not set' % self.__class__.__name__)
 
 
+class DeprecatedRegex(LazyRegex):
+
+    """Regex object that issues a deprecation notice."""
+
+    def __init__(self, pattern, flags=0, name=None, instead=None):
+        """
+        Constructor.
+
+        If name is None, the regex pattern will be used as part of
+        the deprecation warning.
+
+        @param name: name of the object that is deprecated
+        @type name: str or None
+        @param instead: if provided, will be used to specify the replacement
+            of the deprecated name
+        @type instead: str
+        """
+        super(DeprecatedRegex, self).__init__(pattern, flags)
+        self._name = name or self.raw
+        self._instead = instead
+
+    def __getattr__(self, attr):
+        """Issue deprecation warning."""
+        issue_deprecation_warning(
+            self._name, self._instead, 2)
+        return super(DeprecatedRegex, self).__getattr__(attr)
+
+
 def first_lower(string):
     """
     Return a string with the first character uncapitalized.
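
With the constructor change above, LazyRegex can take a callable that builds
the pattern only when the regex object is first used, as chars.py now does
below. A minimal sketch (the pattern here is just an illustrative
placeholder):

    from pywikibot.tools import LazyRegex

    lazy = LazyRegex(lambda: r'[0-9]+')
    # Nothing is evaluated or compiled yet; the first delegated attribute
    # access (.search here) calls the lambda and compiles the result.
    assert lazy.search('abc 42').group() == '42'
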
diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py
index 9a29e24..f0e88b1 100644
--- a/pywikibot/tools/chars.py
+++ b/pywikibot/tools/chars.py
@@ -56,10 +56,9 @@
 # At the moment we've only added the characters from the Cf category
 _invisible_chars = frozenset(_category_cf)
 
-# TODO: Is that complex and a lazy regex justified?
-invisible_regex = LazyRegex()
-invisible_regex.raw = '[' + ''.join(_invisible_chars) + ']'
-invisible_regex.flags = 0
+invisible_regex = LazyRegex(
+    lambda: '[' + ''.join(_invisible_chars) + ']'
+)
 
 
 def contains_invisible(text):
diff --git a/pywikibot/tools/ip.py b/pywikibot/tools/ip.py
index 93983e7..41a5f41 100644
--- a/pywikibot/tools/ip.py
+++ b/pywikibot/tools/ip.py
@@ -16,7 +16,7 @@
 
 from warnings import warn
 
-from pywikibot.tools import LazyRegex
+from pywikibot.tools import DeprecatedRegex
 
 ipaddress_e = ipaddr_e = None
 
@@ -78,15 +78,15 @@
     ip_address = ip_address_fake
 
 # deprecated IP detector
-ip_regexp = LazyRegex()
-ip_regexp.flags = re.IGNORECASE
-ip_regexp.raw = (
+ip_regexp = DeprecatedRegex(
     r'^(?:(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}'
     r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|'
     r'(((?=(?=(.*?(::)))\3(?!.+\4)))\4?|[\dA-F]{1,4}:)'
     r'([\dA-F]{1,4}(\4|:\b)|\2){5}'
     r'(([\dA-F]{1,4}(\4|:\b|$)|\2){2}|'
-    r'(((2[0-4]|1\d|[1-9])?\d|25[0-5])\.?\b){4}))\Z')
+    r'(((2[0-4]|1\d|[1-9])?\d|25[0-5])\.?\b){4}))\Z',
+    re.IGNORECASE,
+    'page.ip_regexp', 'tools.ip.is_IP')
 
 
 def is_IP(IP):
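
The net effect of switching ip_regexp to DeprecatedRegex is that the old
page.ip_regexp keeps working but warns on use, as the updated test below
asserts. A rough sketch:

    import warnings

    from pywikibot.tools import ip

    with warnings.catch_warnings(record=True) as log:
        warnings.simplefilter('always')
        # Still matches, but each delegated attribute access now emits a
        # deprecation warning pointing at tools.ip.is_IP.
        assert ip.ip_regexp.match('127.0.0.1') is not None
    assert any('page.ip_regexp is deprecated' in str(w.message) for w in log)
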
diff --git a/tests/ipregex_tests.py b/tests/ipregex_tests.py
index 6e4459e..6cc65d2 100644
--- a/tests/ipregex_tests.py
+++ b/tests/ipregex_tests.py
@@ -11,7 +11,7 @@
 
 from pywikibot.tools import ip
 
-from tests.aspects import unittest, TestCase
+from tests.aspects import unittest, TestCase, DeprecationTestCase
 from tests.utils import expected_failure_if
 
 
@@ -628,7 +628,7 @@
         self.ipv6test(False, "1111:2222:3333:4444:5555:6666:000.000.000.000")
 
 
-class IPRegexTestCase(TestIPBase):
+class IPRegexTestCase(TestIPBase, DeprecationTestCase):
 
     """Test IP regex."""
 
@@ -640,6 +640,8 @@
         self._run_tests()
         self._test_T76286_failures()
         self.assertEqual(self.fail, 0)
+        self.assertDeprecation(
+            'page.ip_regexp is deprecated, use tools.ip.is_IP instead.')
 
 
 class IPAddressModuleTestCase(TestIPBase):
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index e49b6ee..6dbc7d0 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8  -*-
 """Test textlib module."""
 #
-# (C) Pywikibot team, 2007-2014
+# (C) Pywikibot team, 2007-2015
 #
 # Distributed under the terms of the MIT license.
 #
@@ -10,8 +10,14 @@
 __version__ = '$Id$'
 
 import codecs
+import functools
 import os
 import re
+
+try:
+    import mwparserfromhell
+except ImportError as e:
+    mwparserfromhell = e
 
 import pywikibot
 import pywikibot.textlib as textlib
@@ -253,11 +259,13 @@
 
     net = False
 
-    def _extract_templates_params(self, func):
+    def _common_results(self, func):
+        """Common cases."""
         self.assertEqual(func('{{a}}'), [('a', OrderedDict())])
         self.assertEqual(func('{{ a}}'), [('a', OrderedDict())])
         self.assertEqual(func('{{a }}'), [('a', OrderedDict())])
         self.assertEqual(func('{{ a }}'), [('a', OrderedDict())])
+
         self.assertEqual(func('{{a|b=c}}'), [('a', OrderedDict((('b', 'c'), )))])
         self.assertEqual(func('{{a|b|c=d}}'), [('a', OrderedDict((('1', 'b'), ('c', 'd'))))])
         self.assertEqual(func('{{a|b=c|f=g|d=e|1=}}'),
@@ -266,8 +274,11 @@
         self.assertEqual(func('{{a|c=d|1=2}}'), [('a', OrderedDict((('c', 'd'), ('1', '2'))))])
         self.assertEqual(func('{{a|5=d|a=b}}'), [('a', OrderedDict((('5', 'd'), ('a', 'b'))))])
         self.assertEqual(func('{{a|=2}}'), [('a', OrderedDict((('', '2'), )))])
+
+        self.assertEqual(func('{{a|}}'), [('a', OrderedDict((('1', ''), )))])
         self.assertEqual(func('{{a|=|}}'), [('a', OrderedDict((('', ''), ('1', ''))))])
         self.assertEqual(func('{{a||}}'), [('a', OrderedDict((('1', ''), ('2', ''))))])
+
         self.assertEqual(func('{{a|b={{{1}}}}}'), [('a', OrderedDict((('b', '{{{1}}}'), )))])
         self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'),
                          [('a', OrderedDict((('b', '<noinclude>{{{1}}}</noinclude>'), )))])
@@ -279,50 +290,198 @@
         self.assertEqual(func('{{:a|b=c}}'), [(':a', OrderedDict((('b', 'c'), )))])
         self.assertEqual(func('{{subst::a|b=c}}'), [('subst::a', OrderedDict((('b', 'c'), )))])
 
-    def test_extract_templates_params_mwpfh(self):
-        try:
-            import mwparserfromhell  # noqa
-        except ImportError:
-            raise unittest.SkipTest('mwparserfromhell not available')
+        self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'),
+                         [('a', OrderedDict((('b', '{{{1}}}'), ('c', '{{{2}}}'))))])
+        self.assertEqual(func('{{a|b=c}}{{d|e=f}}'),
+                         [('a', OrderedDict((('b', 'c'), ))),
+                          ('d', OrderedDict((('e', 'f'), )))])
 
-        func = textlib.extract_templates_and_params_mwpfh
-        self._extract_templates_params(func)
+        self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'),
+                         [('a', OrderedDict((('b', '<!--{{{1}}}-->'), )))])
 
-        self.assertEqual(func('{{a|}}'), [('a', OrderedDict((('1', ''), )))])
+        # initial '{' and '}' should be ignored as outer wikitext
+        self.assertEqual(func('{{{a|b}}X}'),
+                         [('a', OrderedDict((('1', 'b'), )))])
 
+    def _etp_regex_differs(self, func):
+        """Common cases not handled the same by _ETP_REGEX."""
         self.assertEqual(func('{{a| b=c}}'), [('a', OrderedDict(((' b', 'c'), )))])
         self.assertEqual(func('{{a|b =c}}'), [('a', OrderedDict((('b ', 'c'), )))])
         self.assertEqual(func('{{a|b= c}}'), [('a', OrderedDict((('b', ' c'), )))])
         self.assertEqual(func('{{a|b=c }}'), [('a', OrderedDict((('b', 'c '), )))])
 
-        self.assertEqual(func('{{a| b={{c}}}}'), [('a', OrderedDict(((' b', '{{c}}'), ))), ('c', OrderedDict())])
-        self.assertEqual(func('{{a|b={{c}}}}'), [('a', OrderedDict((('b', '{{c}}'), ))), ('c', OrderedDict())])
-        self.assertEqual(func('{{a|b= {{c}}}}'), [('a', OrderedDict((('b', ' {{c}}'), ))), ('c', OrderedDict())])
-        self.assertEqual(func('{{a|b={{c}} }}'), [('a', OrderedDict((('b', '{{c}} '), ))), ('c', OrderedDict())])
+        # inner {} should be treated as part of the value
+        self.assertEqual(func('{{a|b={} }}'), [('a', OrderedDict((('b', '{} '), )))])
 
-        self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'), [('a', OrderedDict((('b', '<!--{{{1}}}-->'), )))])
+    def _order_differs(self, func):
+        """Common cases where the order of templates differs."""
+        self.assertCountEqual(func('{{a|b={{c}}}}'),
+                              [('a', OrderedDict((('b', '{{c}}'), ))),
+                               ('c', OrderedDict())])
+
+        self.assertCountEqual(func('{{a|{{c|d}}}}'),
+                              [('c', OrderedDict((('1', 'd'), ))),
+                               ('a', OrderedDict([('1', '{{c|d}}')]))])
+
+        # inner '}' after {{b|c}} should be treated as wikitext
+        self.assertCountEqual(func('{{a|{{b|c}}}|d}}'),
+                              [('a', OrderedDict([('1', '{{b|c}}}'),
+                                                  ('2', u'd')])),
+                               ('b', OrderedDict([('1', 'c')]))])
+
+    def test_extract_templates_params_mwpfh(self):
+        """Test using mwparserfromhell."""
+        if isinstance(mwparserfromhell, ImportError):
+            raise unittest.SkipTest('mwparserfromhell not available')
+
+        func = textlib.extract_templates_and_params_mwpfh
+        self._common_results(func)
+        self._order_differs(func)
+        self._etp_regex_differs(func)
 
     def test_extract_templates_params_regex(self):
-        func = textlib.extract_templates_and_params_regex
-        self._extract_templates_params(func)
+        """Test using many complex regexes."""
+        func = functools.partial(textlib.extract_templates_and_params_regex,
+                                 remove_disabled_parts=False)
+        self._common_results(func)
+        self._order_differs(func)
 
-        self.assertEqual(func('{{a|}}'), [])  # FIXME: this is a bug
+        self.assertEqual(func('{{a|b={} }}'), [])  # FIXME: {} is normal text
 
         self.assertEqual(func('{{a| b=c}}'), [('a', OrderedDict((('b', 'c'), )))])
         self.assertEqual(func('{{a|b =c}}'), [('a', OrderedDict((('b', 'c'), )))])
         self.assertEqual(func('{{a|b= c}}'), [('a', OrderedDict((('b', 'c'), )))])
         self.assertEqual(func('{{a|b=c }}'), [('a', OrderedDict((('b', 'c'), )))])
 
-        self.assertEqual(func('{{a| b={{c}}}}'), [('c', OrderedDict()), ('a', OrderedDict((('b', '{{c}}'), )))])
-        self.assertEqual(func('{{a|b={{c}}}}'), [('c', OrderedDict()), ('a', OrderedDict((('b', '{{c}}'), )))])
-        self.assertEqual(func('{{a|b= {{c}}}}'), [('c', OrderedDict()), ('a', OrderedDict((('b', '{{c}}'), )))])
-        self.assertEqual(func('{{a|b={{c}} }}'), [('c', OrderedDict()), ('a', OrderedDict((('b', '{{c}}'), )))])
-
-        self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'), [('a', OrderedDict((('b', ''), )))])
+        func = textlib.extract_templates_and_params_regex
+        self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'),
+                         [('a', OrderedDict((('b', ''), )))])
 
     def test_extract_templates_params(self):
-        self._extract_templates_params(
+        """Test that the normal entry point works."""
+        self._common_results(
             textlib.extract_templates_and_params)
+
+    def test_template_simple_regex(self):
+        """Test using simple regex."""
+        func = textlib.extract_templates_and_params_regex_simple
+        self._common_results(func)
+        self._etp_regex_differs(func)
+
+        # The simple regex preserves whitespace the same way mwpfh does,
+        # but does not add separate entries for nested templates.
+        self.assertEqual(func('{{a| b={{c}}}}'),
+                         [('a', OrderedDict(((' b', '{{c}}'), )))])
+        self.assertEqual(func('{{a|b={{c}}}}'),
+                         [('a', OrderedDict((('b', '{{c}}'), )))])
+        self.assertEqual(func('{{a|b= {{c}}}}'),
+                         [('a', OrderedDict((('b', ' {{c}}'), )))])
+        self.assertEqual(func('{{a|b={{c}} }}'),
+                         [('a', OrderedDict((('b', '{{c}} '), )))])
+
+        # These three are from _order_differs, and while the first works
+        self.assertEqual(func('{{a|{{c}} }}'),
+                         [('a', OrderedDict((('1', '{{c}} '), )))])
+
+        # an inner '|' causes extract_templates_and_params_regex_simple to
+        # split arguments incorrectly in the next two cases.
+        self.assertEqual(func('{{a|{{c|d}} }}'),
+                         [('a', OrderedDict([('1', '{{c'),
+                                             ('2', 'd}} ')]))])
+
+        self.assertEqual(func('{{a|{{b|c}}}|d}}'),
+                         [(u'a', OrderedDict([('1', u'{{b'),
+                                              ('2', u'c}}}'),
+                                              ('3', u'd')]))])
+
+    def test_regexes(self):
+        """_ETP_REGEX, NESTED_TEMPLATE_REGEX and TEMP_REGEX tests."""
+        func = textlib._ETP_REGEX.search
+
+        self.assertIsNotNone(func('{{{1}}}'))
+        self.assertIsNotNone(func('{{a|b={{{1}}} }}'))
+        self.assertIsNotNone(func('{{a|b={{c}} }}'))
+        self.assertIsNotNone(func('{{a|b={{c}} }}'))
+        self.assertIsNotNone(func('{{a|b={{c|d=1}} }}'))
+
+        self.assertIsNotNone(func('{{a|{{c}} }}'))
+        self.assertIsNotNone(func('{{a|{{c|d}} }}'))
+
+        func = textlib._ETP_REGEX.match
+
+        self.assertIsNone(func('{{{1}}}'))
+
+        self.assertIsNotNone(func('{{#if:foo}}'))
+        self.assertIsNotNone(func('{{foo:}}'))
+
+        self.assertIsNotNone(func('{{CURRENTYEAR}}'))
+        self.assertIsNotNone(func('{{1}}'))
+
+        self.assertIsNone(func('{{a|b={{CURRENTYEAR}} }}'))
+        self.assertIsNone(func('{{a|b={{{1}}} }}'))
+        self.assertIsNone(func('{{a|b={{c}} }}'))
+        self.assertIsNone(func('{{a|b={{c|d=1}} }}'))
+        self.assertIsNone(func('{{a|b={} }}'))
+        self.assertIsNone(func('{{:a|b={{c|d=1}} }}'))
+
+        self.assertIsNone(func('{{a|{{c}} }}'))
+        self.assertIsNone(func('{{a|{{c|d}} }}'))
+
+        func = textlib.TEMP_REGEX.search
+
+        self.assertIsNotNone(func('{{{1}}}'))
+        self.assertIsNotNone(func('{{a|b={{c}} }}'))
+        self.assertIsNotNone(func('{{a|b={{c|d=1}} }}'))
+        self.assertIsNotNone(func('{{a|{{c}} }}'))
+        self.assertIsNotNone(func('{{a|{{c|d}} }}'))
+
+        func = textlib.TEMP_REGEX.match
+
+        self.assertIsNotNone(func('{{#if:foo}}'))
+        self.assertIsNotNone(func('{{foo:}}'))
+
+        self.assertIsNotNone(func('{{CURRENTYEAR}}'))
+        self.assertIsNotNone(func('{{1}}'))
+
+        self.assertIsNotNone(func('{{a|b={{CURRENTYEAR}} }}'))
+        self.assertIsNotNone(func('{{a|b={{{1}}} }}'))
+
+        self.assertIsNone(func('{{a|b={{c}} }}'))
+        self.assertIsNone(func('{{a|b={{c|d=1}} }}'))
+        self.assertIsNotNone(func('{{a|b={} }}'))
+        self.assertIsNone(func('{{:a|b={{c|d=1}} }}'))
+
+        self.assertIsNone(func('{{a|{{c}} }}'))
+        self.assertIsNone(func('{{a|{{c|d}} }}'))
+
+        func = textlib.NESTED_TEMPLATE_REGEX.search
+
+        # Numerically named templates are rejected
+        self.assertIsNone(func('{{1}}'))
+
+        self.assertIsNone(func('{{#if:foo}}'))
+        self.assertIsNone(func('{{{1}}}'))
+        self.assertIsNone(func('{{{1|}}}'))
+        self.assertIsNone(func('{{{15|a}}}'))
+        self.assertIsNone(func('{{{1|{{{2|a}}} }}}'))
+
+        self.assertIsNone(func('{{{1|{{2|a}} }}}'))
+
+        func = textlib.NESTED_TEMPLATE_REGEX.match
+
+        self.assertIsNotNone(func('{{CURRENTYEAR}}'))
+        self.assertIsNotNone(func('{{foo:bar}}'))
+        self.assertIsNone(func('{{1}}'))
+
+        self.assertIsNotNone(func('{{a|b={{CURRENTYEAR}} }}'))
+        self.assertIsNotNone(func('{{a|b={{{1}}} }}'))
+        self.assertIsNotNone(func('{{a|b={{c}} }}'))
+        self.assertIsNotNone(func('{{a|b={{c|d=1}} }}'))
+        self.assertIsNotNone(func('{{a|b={} }}'))
+        self.assertIsNotNone(func('{{:a|b={{c|d=1}} }}'))
+
+        self.assertIsNotNone(func('{{a|{{c}} }}'))
+        self.assertIsNotNone(func('{{a|{{c|d}} }}'))
 
 
 class TestReplaceLinks(TestCase):
@@ -748,10 +907,26 @@
                          '[[ey:y]]')  # "ex" is not a valid interwiki code
 
     def test_replace_template(self):
-        template_sample = r'{{templatename | url= | accessdate={{Fecha|1993}} |atitle=The [[real title]] }}'
+        template_sample = (r'a {{templatename '
+                           r'    | accessdate={{Fecha|1993}} '
+                           r'    |atitle=The [[real title]] }}')
         self.assertEqual(textlib.replaceExcept(template_sample, 'a', 'X',
                                                ['template'], site=self.site),
-                         template_sample)
+                         'X' + template_sample[1:])
+
+        template_sample = (r'a {{templatename '
+                           r'    | 1={{a}}2{{a}} '
+                           r'    | 2={{a}}1{{a}} }}')
+        self.assertEqual(textlib.replaceExcept(template_sample, 'a', 'X',
+                                               ['template'], site=self.site),
+                         'X' + template_sample[1:])
+
+        template_sample = (r'a {{templatename '
+                           r'    | 1={{{a}}}2{{{a}}} '
+                           r'    | 2={{{a}}}1{{{a}}} }}')
+        self.assertEqual(textlib.replaceExcept(template_sample, 'a', 'X',
+                                               ['template'], site=self.site),
+                         'X' + template_sample[1:])
 
     def test_replace_source_reference(self):
         """Test replacing in text which contains back references."""

-- 
To view, visit https://gerrit.wikimedia.org/r/190143
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Iab46aaafac3c1367a68c900bcb33217cf67f1126
Gerrit-PatchSet: 24
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jay...@gmail.com>
Gerrit-Reviewer: Eranroz <eranro...@gmail.com>
Gerrit-Reviewer: John Vandenberg <jay...@gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgr...@gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhall...@arctus.nl>
Gerrit-Reviewer: Mpaa <mpaa.w...@gmail.com>
Gerrit-Reviewer: Ricordisamoa <ricordisa...@openmailbox.org>
Gerrit-Reviewer: XZise <commodorefabia...@gmx.de>
Gerrit-Reviewer: jenkins-bot <>
