jenkins-bot has submitted this change. ( 
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/675351 )

Change subject: [IMPR] Enable extract_templates_and_params with wikitextparser 
package
......................................................................

[IMPR] Enable extract_templates_and_params with wikitextparser package

- rename extract_templates_and_params_mwpfh to
  _extract_templates_and_params_parser and check for the currently used
  wikitext parser. All different package behaviours ar handled there.
  Test for ImportError instead of generic Exception.
- deprecate extract_templates_and_params_mwpfh but force to take
  mwparserfromhell as wikitextparser
- the fallback path is: wikitextparser, mwparserfromhell, _ETP_REGEX
- update documentation
- add OrderedDict and nullcontext to backports.py
- update tests accordingly

Change-Id: Ibf4472b309672e7189042875c460b3665ec810ed
---
M pywikibot/backports.py
M pywikibot/textlib.py
M scripts/i18n
M tests/textlib_tests.py
4 files changed, 211 insertions(+), 101 deletions(-)

Approvals:
  JJMC89: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/pywikibot/backports.py b/pywikibot/backports.py
index 487447a..32e2666 100644
--- a/pywikibot/backports.py
+++ b/pywikibot/backports.py
@@ -14,6 +14,25 @@
     cache = _lru_cache(None)


+# context
+if PYTHON_VERSION < (3, 7):
+
+    class nullcontext:  # noqa: N801
+
+        """Dummy context manager for Python 3.5/3.6 that does nothing."""
+
+        def __init__(self, result=None):  # noqa: D107
+            self.result = result
+
+        def __enter__(self):
+            return self.result
+
+        def __exit__(self, *args):
+            pass
+else:
+    from contextlib import nullcontext
+
+
 # typing
 if PYTHON_VERSION < (3, 5, 2):
     from typing import Dict as DefaultDict
@@ -22,6 +41,15 @@
 else:
     from collections import defaultdict as DefaultDict  # noqa: N812

+
+if PYTHON_VERSION < (3, 7, 2):
+    from typing import Dict as OrderedDict
+elif PYTHON_VERSION < (3, 9):
+    from typing import OrderedDict
+else:
+    from collections import OrderedDict
+
+
 if PYTHON_VERSION >= (3, 9):
     from collections.abc import Iterable, Sequence
     Dict = dict
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 204bd68..75ae916 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -22,15 +22,22 @@
 import pywikibot

 from pywikibot.backports import List, Tuple
+from pywikibot.backports import OrderedDict as OrderedDictType
 from pywikibot.exceptions import InvalidTitle, SiteDefinitionError
 from pywikibot.family import Family
-from pywikibot.tools import deprecate_arg, issue_deprecation_warning
+from pywikibot.tools import (
+    deprecated, deprecate_arg, issue_deprecation_warning,
+)
 
 try:
-    import mwparserfromhell
-except ImportError as e:
-    mwparserfromhell = e
+    import wikitextparser
+except ImportError:
+    try:
+        import mwparserfromhell as wikitextparser
+    except ImportError as e:
+        wikitextparser = e

+ETPType = List[Tuple[str, OrderedDictType[str, str]]]

 # cache for replaceExcept to avoid recompile or regexes each call
 _regex_cache = {}
@@ -1566,8 +1573,9 @@
 # Functions dealing with templates
 # --------------------------------

-def extract_templates_and_params(text: str, remove_disabled_parts=None,
-                                 strip: Optional[bool] = None):
+def extract_templates_and_params(text: str,
+                                 remove_disabled_parts: Optional[bool] = None,
+                                 strip: Optional[bool] = None) -> ETPType:
     """Return a list of templates found in text.

     Return value is a list of tuples. There is one tuple for each use of a
@@ -1596,16 +1604,15 @@

     @param text: The wikitext from which templates are extracted
     @param remove_disabled_parts: Remove disabled wikitext such as comments
-        and pre. If None (default), this is enabled when mwparserfromhell
-        is not available and disabled if mwparserfromhell is present.
-    @type remove_disabled_parts: bool or None
+        and pre. If None (default), this is enabled when neither
+        mwparserfromhell not wikitextparser package is available and
+        disabled otherwise.
     @param strip: if enabled, strip arguments and values of templates.
-        If None (default), this is enabled when mwparserfromhell
-        is not available and disabled if mwparserfromhell is present.
+        If None (default), this is enabled when neither mwparserfromhell
+        nor wikitextparser package is available and disabled otherwise.
     @return: list of template name and params
-    @rtype: list of tuple
     """
-    use_regex = isinstance(mwparserfromhell, Exception)
+    use_regex = isinstance(wikitextparser, ImportError)

     if remove_disabled_parts is None:
         remove_disabled_parts = use_regex
@@ -1617,41 +1624,58 @@

     if use_regex:
         return extract_templates_and_params_regex(text, False, strip)
-    return extract_templates_and_params_mwpfh(text, strip)
+    return _extract_templates_and_params_parser(text, strip)


-def extract_templates_and_params_mwpfh(text: str, strip=False):
+def _extract_templates_and_params_parser(text: str,
+                                         strip: bool = False) -> ETPType:
     """
     Extract templates with params using mwparserfromhell.

     This function should not be called directly.

-    Use extract_templates_and_params, which will select this
-    mwparserfromhell implementation if based on whether the
-    mwparserfromhell package is installed.
+    Use extract_templates_and_params, which will select this parser
+    implementation if the mwparserfromhell or wikitextparser package is
+    installed.

     @param text: The wikitext from which templates are extracted
+    @param strip: if enabled, strip arguments and values of templates
     @return: list of template name and params
-    @rtype: list of tuple
     """
-    code = mwparserfromhell.parse(text)
-    result = []
+    def explicit(param):
+        try:
+            attr = param.showkey
+        except AttributeError:
+            attr = not param.positional
+        return attr
 
-    for template in code.ifilter_templates(
+    parser_name = wikitextparser.__name__
+    pywikibot.log('Using {!r} wikitext parser'.format(parser_name))
+
+    result = []
+    parsed = wikitextparser.parse(text)
+    if parser_name == 'wikitextparser':
+        templates = parsed.templates
+        arguments = 'arguments'
+    else:
+        templates = parsed.ifilter_templates(
             matches=lambda x: not x.name.lstrip().startswith('#'),
-            recursive=True):
+            recursive=True)
+        arguments = 'params'
+
+    for template in templates:
         params = OrderedDict()
-        for param in template.params:
+        for param in getattr(template, arguments):
+            value = str(param.value)  # mwpfh needs upcast to str
+
             if strip:
-                implicit_parameter = not param.showkey
                 key = param.name.strip()
-                if not implicit_parameter:
+                if explicit(param):
                     value = param.value.strip()
                 else:
                     value = str(param.value)
             else:
                 key = str(param.name)
-                value = str(param.value)

             params[key] = value

@@ -1659,9 +1683,22 @@
     return result


+@deprecated('extract_templates_and_params', since='20210329',
+            future_warning=True)
+def extract_templates_and_params_mwpfh(text: str,
+                                       strip: bool = False) -> ETPType:
+    """Extract templates with params using mwparserfromhell."""
+    global wikitextparser
+    saved_parser = wikitextparser
+    import mwparserfromhell as wikitextparser
+    result = _extract_templates_and_params_parser(text, strip)
+    wikitextparser = saved_parser
+    return result
+
+
 def extract_templates_and_params_regex(text: str,
                                        remove_disabled_parts: bool = True,
-                                       strip: bool = True):
+                                       strip: bool = True) -> ETPType:
     """
     Extract templates with params using a regex with additional processing.

@@ -1672,8 +1709,8 @@
     is not used.

     @param text: The wikitext from which templates are extracted
+    @param strip: if enabled, strip arguments and values of templates
     @return: list of template name and params
-    @rtype: list of tuple
     """
     # remove commented-out stuff etc.
     if remove_disabled_parts:
diff --git a/scripts/i18n b/scripts/i18n
index 362e42e..4591fe6 160000
--- a/scripts/i18n
+++ b/scripts/i18n
@@ -1 +1 @@
-Subproject commit 362e42e36ae45005e8019ae4107eac2adb736a55
+Subproject commit 4591fe605f89e29e14f730f1416542acfea135bd
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index 139cb99..8b537b3 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -16,8 +16,10 @@
 import pywikibot
 import pywikibot.textlib as textlib

+from pywikibot.backports import nullcontext
 from pywikibot.site._interwikimap import _IWEntry
 from pywikibot.textlib import _MultiTemplateMatchBuilder, extract_sections
+from pywikibot.tools import suppress_warnings
 from pywikibot import UnknownSite

 from tests.aspects import (
@@ -300,6 +302,10 @@
                 'Invalid category title extracted: nasty{{{!}}')
 

+WARNING_MSG = (r'.*extract_templates_and_params_mwpfh .*'
+               r'is deprecated for .*; use extract_templates_and_params')
+
+
 class TestTemplateParams(TestCase):

     """Test to verify that template params extraction works."""
@@ -308,67 +314,61 @@

     def _common_results(self, func):
         """Common cases."""
-        self.assertEqual(func('{{a}}'), [('a', OrderedDict())])
-        self.assertEqual(func('{{ a}}'), [('a', OrderedDict())])
-        self.assertEqual(func('{{a }}'), [('a', OrderedDict())])
-        self.assertEqual(func('{{ a }}'), [('a', OrderedDict())])
+        with suppress_warnings(WARNING_MSG, category=FutureWarning):
+            self.assertEqual(func('{{a}}'), [('a', OrderedDict())])

-        self.assertEqual(func('{{a|b=c}}'),
-                         [('a', OrderedDict((('b', 'c'), )))])
-        self.assertEqual(func('{{a|b|c=d}}'),
-                         [('a', OrderedDict((('1', 'b'), ('c', 'd'))))])
-        self.assertEqual(func('{{a|b=c|f=g|d=e|1=}}'),
-                         [('a', OrderedDict((('b', 'c'), ('f', 'g'),
-                                             ('d', 'e'), ('1', ''))))])
-        self.assertEqual(func('{{a|1=2|c=d}}'),
-                         [('a', OrderedDict((('1', '2'), ('c', 'd'))))])
-        self.assertEqual(func('{{a|c=d|1=2}}'),
-                         [('a', OrderedDict((('c', 'd'), ('1', '2'))))])
-        self.assertEqual(func('{{a|5=d|a=b}}'),
-                         [('a', OrderedDict((('5', 'd'), ('a', 'b'))))])
-        self.assertEqual(func('{{a|=2}}'),
-                         [('a', OrderedDict((('', '2'), )))])
+        with suppress_warnings(WARNING_MSG, category=FutureWarning):
+            self.assertEqual(func('{{ a}}'), [('a', OrderedDict())])
+            self.assertEqual(func('{{a }}'), [('a', OrderedDict())])
+            self.assertEqual(func('{{ a }}'), [('a', OrderedDict())])
+            self.assertEqual(func('{{a|b=c}}'),
+                             [('a', OrderedDict((('b', 'c'), )))])
+            self.assertEqual(func('{{a|b|c=d}}'),
+                             [('a', OrderedDict((('1', 'b'), ('c', 'd'))))])
+            self.assertEqual(func('{{a|b=c|f=g|d=e|1=}}'),
+                             [('a', OrderedDict((('b', 'c'), ('f', 'g'),
+                                                 ('d', 'e'), ('1', ''))))])
+            self.assertEqual(func('{{a|1=2|c=d}}'),
+                             [('a', OrderedDict((('1', '2'), ('c', 'd'))))])
+            self.assertEqual(func('{{a|c=d|1=2}}'),
+                             [('a', OrderedDict((('c', 'd'), ('1', '2'))))])
+            self.assertEqual(func('{{a|5=d|a=b}}'),
+                             [('a', OrderedDict((('5', 'd'), ('a', 'b'))))])
+            self.assertEqual(func('{{a|=2}}'),
+                             [('a', OrderedDict((('', '2'), )))])
+            self.assertEqual(func('{{a|}}'),
+                             [('a', OrderedDict((('1', ''), )))])
+            self.assertEqual(func('{{a|=|}}'),
+                             [('a', OrderedDict((('', ''), ('1', ''))))])
+            self.assertEqual(func('{{a||}}'),
+                             [('a', OrderedDict((('1', ''), ('2', ''))))])
+            self.assertEqual(func('{{a|b={{{1}}}}}'),
+                             [('a', OrderedDict((('b', '{{{1}}}'), )))])
+            self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'),
+                             [('a',
+                               OrderedDict((('b',
+                                             '<noinclude>{{{1}}}</noinclude>'),
+                                            )))])
+            self.assertEqual(func('{{Template:a|b=c}}'),
+                             [('Template:a', OrderedDict((('b', 'c'), )))])
+            self.assertEqual(func('{{template:a|b=c}}'),
+                             [('template:a', OrderedDict((('b', 'c'), )))])
+            self.assertEqual(func('{{:a|b=c}}'),
+                             [(':a', OrderedDict((('b', 'c'), )))])
+            self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'),
+                             [('a', OrderedDict((('b', '{{{1}}}'),
+                                                 ('c', '{{{2}}}'))))])
+            self.assertEqual(func('{{a|b=c}}{{d|e=f}}'),
+                             [('a', OrderedDict((('b', 'c'), ))),
+                              ('d', OrderedDict((('e', 'f'), )))])

-        self.assertEqual(func('{{a|}}'), [('a', OrderedDict((('1', ''), )))])
-        self.assertEqual(func('{{a|=|}}'),
-                         [('a', OrderedDict((('', ''), ('1', ''))))])
-        self.assertEqual(func('{{a||}}'),
-                         [('a', OrderedDict((('1', ''), ('2', ''))))])
+            # initial '{' and '}' should be ignored as outer wikitext
+            self.assertEqual(func('{{{a|b}}X}'),
+                             [('a', OrderedDict((('1', 'b'), )))])

-        self.assertEqual(func('{{a|b={{{1}}}}}'),
-                         [('a', OrderedDict((('b', '{{{1}}}'), )))])
-        self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'),
-                         [('a', OrderedDict(
-                             (('b', '<noinclude>{{{1}}}</noinclude>'), )))])
-        self.assertEqual(func('{{subst:a|b=c}}'),
-                         [('subst:a', OrderedDict((('b', 'c'), )))])
-        self.assertEqual(func('{{safesubst:a|b=c}}'),
-                         [('safesubst:a', OrderedDict((('b', 'c'), )))])
-        self.assertEqual(func('{{msgnw:a|b=c}}'),
-                         [('msgnw:a', OrderedDict((('b', 'c'), )))])
-        self.assertEqual(func('{{Template:a|b=c}}'),
-                         [('Template:a', OrderedDict((('b', 'c'), )))])
-        self.assertEqual(func('{{template:a|b=c}}'),
-                         [('template:a', OrderedDict((('b', 'c'), )))])
-        self.assertEqual(func('{{:a|b=c}}'),
-                         [(':a', OrderedDict((('b', 'c'), )))])
-        self.assertEqual(func('{{subst::a|b=c}}'),
-                         [('subst::a', OrderedDict((('b', 'c'), )))])
-
-        self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'),
-                         [('a', OrderedDict((('b', '{{{1}}}'),
-                                             ('c', '{{{2}}}'))))])
-        self.assertEqual(func('{{a|b=c}}{{d|e=f}}'),
-                         [('a', OrderedDict((('b', 'c'), ))),
-                          ('d', OrderedDict((('e', 'f'), )))])
-
-        # initial '{' and '}' should be ignored as outer wikitext
-        self.assertEqual(func('{{{a|b}}X}'),
-                         [('a', OrderedDict((('1', 'b'), )))])
-
-        # sf.net bug 1575: unclosed template
-        self.assertEqual(func('{{a'), [])
-        self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())])
+            # sf.net bug 1575: unclosed template
+            self.assertEqual(func('{{a'), [])
+            self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())])

     def _unstripped(self, func):
         """Common cases of unstripped results."""
@@ -455,14 +455,69 @@
                                                   ('2', 'd')])),
                                ('b', OrderedDict([('1', 'c')]))])

+    def _mwpfh_passes(self, func, failing=False):
+        """Common cases failing with wikitextparser but passes with mwpfh.
+
+        Probably the behaviour of regex or mwpfh is wrong.
+        """
+        patterns = [
+            '{{subst:a|b=c}}',
+            '{{safesubst:a|b=c}}',
+            '{{msgnw:a|b=c}}',
+            '{{subst::a|b=c}}'
+        ]
+        context = self.assertRaises(AssertionError) \
+            if failing else nullcontext()
+
+        for template in patterns:
+            with self.subTest(template=template, failing=failing):
+                name = template.strip('{}').split('|')[0]
+                with context:
+                    self.assertEqual(func(template),
+                                     [(name, OrderedDict((('b', 'c'), )))])
+
     @require_modules('mwparserfromhell')
     def test_extract_templates_params_mwpfh(self):
         """Test using mwparserfromhell."""
         func = textlib.extract_templates_and_params_mwpfh
+        with suppress_warnings(WARNING_MSG, category=FutureWarning):
+            self._common_results(func)
+            self._order_differs(func)
+            self._unstripped(func)
+            self._etp_regex_differs(func)
+            self._mwpfh_passes(func)
+
+            self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
+                                  [('c', OrderedDict((('1', '{{d}}'), ))),
+                                   ('a', OrderedDict([('1', '{{c|{{d}}}}')])),
+                                   ('d', OrderedDict())
+                                   ])
+
+            self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'),
+                                  [('c', OrderedDict((('1', '{{d|}}'), ))),
+                                   ('a', OrderedDict([('1', '{{c|{{d|}}}}')])),
+                                   ('d', OrderedDict([('1', '')]))
+                                   ])
+
+    @require_modules('mwparserfromhell')
+    def test_extract_templates_params_parser_stripped(self):
+        """Test using mwparserfromhell with stripping."""
+        func = functools.partial(textlib._extract_templates_and_params_parser,
+                                 strip=True)
+
+        self._common_results(func)
+        self._order_differs(func)
+        self._stripped(func)
+
+    @require_modules('wikitextparser')
+    def test_extract_templates_params_parser(self):
+        """Test using wikitextparser."""
+        func = textlib.extract_templates_and_params
         self._common_results(func)
         self._order_differs(func)
         self._unstripped(func)
         self._etp_regex_differs(func)
+        self._mwpfh_passes(func, failing=True)

         self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
                               [('c', OrderedDict((('1', '{{d}}'), ))),
@@ -476,16 +531,6 @@
                                ('d', OrderedDict([('1', '')]))
                                ])

-    @require_modules('mwparserfromhell')
-    def test_extract_templates_params_mwpfh_stripped(self):
-        """Test using mwparserfromhell with stripping."""
-        func = functools.partial(textlib.extract_templates_and_params_mwpfh,
-                                 strip=True)
-
-        self._common_results(func)
-        self._order_differs(func)
-        self._stripped(func)
-
     def test_extract_templates_params_regex(self):
         """Test using many complex regexes."""
         func = functools.partial(textlib.extract_templates_and_params_regex,
@@ -660,7 +705,7 @@

     net = False

-    @PatchingTestCase.patched(textlib, 'extract_templates_and_params_mwpfh')
+    @PatchingTestCase.patched(textlib, '_extract_templates_and_params_parser')
     def extract_mwpfh(self, text, *args, **kwargs):
         """Patched call to extract_templates_and_params_mwpfh."""
         self._text = text
@@ -676,7 +721,7 @@

     def test_removing_disabled_parts_regex(self):
         """Test removing disabled parts when using the regex variant."""
-        self.patch(textlib, 'mwparserfromhell', Exception())
+        self.patch(textlib, 'wikitextparser', ImportError())
         textlib.extract_templates_and_params('{{a<!-- -->}}', True)
         self.assertEqual(self._text, '{{a}}')
         self.assertFalse(self._mwpfh)
@@ -702,7 +747,7 @@

     def test_strip_regex(self):
         """Test stripping values when using the regex variant."""
-        self.patch(textlib, 'mwparserfromhell', Exception())
+        self.patch(textlib, 'wikitextparser', ImportError())
         textlib.extract_templates_and_params('{{a| foo }}', False, True)
         self.assertEqual(self._args, (False, True))
         self.assertFalse(self._mwpfh)

--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/675351
To unsubscribe, or for help writing mail filters, visit 
https://gerrit.wikimedia.org/r/settings

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ibf4472b309672e7189042875c460b3665ec810ed
Gerrit-Change-Number: 675351
Gerrit-PatchSet: 15
Gerrit-Owner: Xqt <[email protected]>
Gerrit-Reviewer: Dalba <[email protected]>
Gerrit-Reviewer: JJMC89 <[email protected]>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
_______________________________________________
Pywikibot-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits

Reply via email to