jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/938357 )

Change subject: [FIX] Make header extraction more robust
......................................................................

[FIX] Make header extraction more robust

Bug: T341787
Change-Id: I7799317194cd76b25ae56d2ccc4f06434b1b4987
---
M pywikibot/textlib.py
M tests/textlib_tests.py
2 files changed, 47 insertions(+), 6 deletions(-)

Approvals:
  Meno25: Looks good to me, but someone else must approve
  Xqt: Looks good to me, approved
  jenkins-bot: Verified




diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index a1fee36..39f8d50 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -270,7 +270,7 @@
         # section headers
         'header': re.compile(
             r'(?:(?<=\n)|\A)(?:<!--[\s\S]*?-->)*'
-            r'=(?:[^\n]|<!--[\s\S]*?-->)+='
+            r'(=(?:[^\n]|<!--[\s\S]*?-->)+=)'
             r' *(?:<!--[\s\S]*?--> *)*(?=\n|\Z)'),
         # external links
         'hyperlink': compileLinkR(),
@@ -933,7 +933,7 @@
 # -------------------------------

 #: Head pattern
-HEAD_PATTERN = re.compile('{0}[^=]+{0}'.format('(={1,6})'))
+HEAD_PATTERN = re.compile(r'(={1,6}).+\1', re.DOTALL)
 TITLE_PATTERN = re.compile("'{3}([^']+)'{3}")

 _Heading = namedtuple('_Heading', ('text', 'start', 'end'))
@@ -957,7 +957,7 @@
         .. versionadded:: 8.2
         """
         m = HEAD_PATTERN.match(self.title)
-        return min(map(len, m.groups()))
+        return len(m[1])

     @property
     def heading(self) -> str:
@@ -998,9 +998,9 @@
     headings = []
     heading_regex = get_regexes('header')[0]
     for match in heading_regex.finditer(text):
-        start, end = match.span()
+        start, end = match.span(1)
         if not isDisabled(text, start) and not isDisabled(text, end):
-            headings.append(_Heading(match.group(), start, end))
+            headings.append(_Heading(match[1], start, end))
     return headings


diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index 6eeff21..b0a642b 100755
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -1626,6 +1626,24 @@
             [('====title====', '\n'), ('==title 2==', '\ncontent')],
         )

+    def test_with_comments(self):
+        """Test section headers surrounded by comments."""
+        text = ('text\n\n'
+                '<!--\n multiline comment\n-->== title ==\n'
+                'content\n\n'
+                '<!-- comment --> == not title ==\n'
+                'foo\n\n'
+                '== title 2 == <!-- trailing comment -->\n'
+                'content 2')
+        result = extract_sections(text, self.site)
+        self._extract_sections_tests(
+            result,
+            'text\n\n<!--\n multiline comment\n-->',
+            [('== title ==',
+              '\ncontent\n\n<!-- comment --> == not title ==\nfoo\n\n'),
+             ('== title 2 ==', ' <!-- trailing comment -->\ncontent 2')]
+        )
+
     def test_long_comment(self):
         r"""Test for text having a long expanse of white space.

@@ -1640,8 +1658,21 @@
         result = extract_sections(text, self.site)
         self._extract_sections_tests(result, text, [], '')

+    def test_empty_header(self):
+        """Test empty section headers."""
+        text = ('text\n\n'
+                '== ==\n'
+                '=====\n'
+                '=== ===\n')
+        result = extract_sections(text, self.site)
+        self._extract_sections_tests(
+            result,
+            'text\n\n',
+            [('== ==', '\n'), ('=====', '\n'), ('=== ===', '\n')]
+        )
+
     def test_unbalanced_headers(self):
-        """Test unbalances section headers."""
+        """Test unbalanced section headers."""
         text = ('text\n\n'
                 '====title===\n'
                 '==title 2===\n'

--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/938357
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I7799317194cd76b25ae56d2ccc4f06434b1b4987
Gerrit-Change-Number: 938357
Gerrit-PatchSet: 6
Gerrit-Owner: Matěj Suchánek <[email protected]>
Gerrit-Reviewer: Meno25 <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
_______________________________________________
Pywikibot-commits mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to