[MediaWiki-commits] [Gerrit] Move internal link regex to textlib - change (pywikibot/core)

John Vandenberg (Code Review) Tue, 21 Apr 2015 22:42:10 -0700

John Vandenberg has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/205818


Change subject: Move internal link regex to textlib
......................................................................

Move internal link regex to textlib

Change-Id: If72ae65a708b31d54ffd9e08408d5692fd3e0982
---
M pywikibot/textlib.py
M scripts/cosmetic_changes.py
M scripts/disambredir.py
M scripts/fixing_redirects.py
M scripts/selflink.py
M scripts/solve_disambiguation.py
M scripts/unlink.py
7 files changed, 35 insertions(+), 58 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/18/205818/1

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 4c2da87..efb5c9b 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -36,6 +36,25 @@
 TEMP_REGEX = re.compile(
     
r'{{(?:msg:)?(?P<name>[^{\|]+?)(?:\|(?P<params>[^{]+?(?:{[^{]+?}[^{]*?)?))?}}')
 
+# The regular expression which finds links. Results consist of five
+# groups:
+# 1. <newline> depends on whether the link starts with a new line.
+# 2. <title> is the target page title, everything before | or ].
+# 3. <section> is the page section. It'll include the # to make life
+#    easier for us.
+# 4. <label> is the alternative link title between | and ].
+# 5. <linktrail> is the link trail after ]]: the letters which are part of
+#    the word.
+# Note: the definition of 'letter' varies from language to language.
+# Note: site.linktrail must be substituted into this regex, and flags=re.X
+#       must be used, as the pattern is written in verbose (whitespace) mode.
+INTERNAL_LINK_REGEX = r"""
+    (?P<newline>[\n]*)
+    \[\[  (?P<title>     [^\[\]\|#]*)
+          (?P<section> \#[^\]\|]*)?
+       (\|(?P<label>     [^\]]*))?
+    \]\]  (?P<linktrail>%s)
+"""
+
 NON_LATIN_DIGITS = {
     'ckb': u'٠١٢٣٤٥٦٧٨٩',
     'fa': u'۰۱۲۳۴۵۶۷۸۹',
diff --git a/scripts/cosmetic_changes.py b/scripts/cosmetic_changes.py
index bb99a30..d0dab20 100755
--- a/scripts/cosmetic_changes.py
+++ b/scripts/cosmetic_changes.py
@@ -441,7 +441,10 @@
         # helper function which works on one link and either returns it
         # unmodified, or returns a replacement.
         def handleOneLink(match):
-            titleWithSection = match.group('titleWithSection')
+            titleWithSection = match.group('title')
+            if match.group('section'):
+                titleWithSection += match.group('section')
+
             label = match.group('label')
             trailingChars = match.group('linktrail')
             newline = match.group('newline')
@@ -546,16 +549,7 @@
             return match.group()
 
         trailR = re.compile(self.site.linktrail())
-    # The regular expression which finds links. Results consist of four groups:
-    # group <newline> depends whether the links starts with a new line.
-    # group <titleWithSection> is the page title and section, that is,
-    # everything before | or ]. It'll include the # to make life easier for us.
-    # group <label> is the alternative link title between | and ].
-    # group <linktrail> is the link trail after ]] which are part of the word.
-    # note that the definition of 'letter' varies from language to language.
-        linkR = re.compile(
-            
r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>'
 +
-            self.site.linktrail() + ')')
+        linkR = re.compile(textlib.INTERNAL_LINK % self.site.linktrail(), re.X)
 
         text = textlib.replaceExcept(text, linkR, handleOneLink,
                                      ['comment', 'math', 'nowiki', 'pre',
diff --git a/scripts/disambredir.py b/scripts/disambredir.py
index fe5bd49..5eae272 100755
--- a/scripts/disambredir.py
+++ b/scripts/disambredir.py
@@ -20,8 +20,10 @@
 __version__ = '$Id$'
 #
 import re
+
 import pywikibot
-from pywikibot import i18n, pagegenerators
+
+from pywikibot import i18n, pagegenerators, textlib
 from pywikibot.tools import first_lower, first_upper as firstcap
 
 msg = {
@@ -45,9 +47,7 @@
     # make a backup of the original text so we can show the changes later
     mysite = pywikibot.Site()
     linktrail = mysite.linktrail()
-    linkR = re.compile(
-        
r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?(\|(?P<label>[^\]]*))?\]\](?P<linktrail>%s)'
-        % linktrail)
+    linkR = re.compile(textlib.INTERNAL_LINK_REGEX % linktrail, re.X)
     curpos = 0
     # This loop will run until we have finished the current page
     while True:
diff --git a/scripts/fixing_redirects.py b/scripts/fixing_redirects.py
index ebfb555..1624a62 100755
--- a/scripts/fixing_redirects.py
+++ b/scripts/fixing_redirects.py
@@ -25,9 +25,10 @@
 #
 import re
 import sys
+
 import pywikibot
-from pywikibot import pagegenerators
-from pywikibot import i18n
+
+from pywikibot import i18n, pagegenerators, textlib
 from pywikibot.tools import first_lower, first_upper as firstcap
 
 # This is required for the text that is shown when you run this script
@@ -65,8 +66,7 @@
     linktrail = mysite.linktrail()
 
     # make a backup of the original text so we can show the changes later
-    linkR = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?'
-                       r'(\|(?P<label>[^\]]*))?\]\](?P<linktrail>' + linktrail 
+ ')')
+    linkR = re.compile(textlib.INTERNAL_LINK_REGEX % linktrail, re.X)
     curpos = 0
     # This loop will run until we have finished the current page
     while True:
diff --git a/scripts/selflink.py b/scripts/selflink.py
index bcd284c..6ff897c 100755
--- a/scripts/selflink.py
+++ b/scripts/selflink.py
@@ -39,22 +39,7 @@
         super(SelflinkBot, self).__init__(**kwargs)
         self.generator = generator
         linktrail = pywikibot.Site().linktrail()
-        # The regular expression which finds links. Results consist of four
-        # groups:
-        # group title is the target page title, everything before | or ].
-        # group section is the page section. It'll include the # to make life
-        # easier for us.
-        # group label is the alternative link title, that's everything between
-        # | and ].
-        # group linktrail is the link trail, that's letters after ]] which are
-        # part of the word.
-        # note that the definition of 'letter' varies from language to
-        # language.
-        self.linkR = re.compile(
-            r'\[\[(?P<title>[^\]\|#]*)'
-            r'(?P<section>#[^\]\|]*)?'
-            r'(\|(?P<label>[^\]]*))?\]\]'
-            r'(?P<linktrail>' + linktrail + ')')
+        self.linkR = re.compile(textlib.INTERNAL_LINK_REGEX % linktrail, re.X)
 
     def handleNextLink(self, page, match, context=100):
         """Process the next link on a page, offering the user choices.
diff --git a/scripts/solve_disambiguation.py b/scripts/solve_disambiguation.py
index b91f0b3..f41bc05 100755
--- a/scripts/solve_disambiguation.py
+++ b/scripts/solve_disambiguation.py
@@ -535,12 +535,7 @@
         # group linktrail is the link trail, that's letters after ]] which
         # are part of the word.
         # note that the definition of 'letter' varies from language to language.
-        self.linkR = re.compile(r'''
-            \[\[  (?P<title>     [^\[\]\|#]*)
-                  (?P<section> \#[^\]\|]*)?
-               (\|(?P<label>     [^\]]*))?  \]\]
-            (?P<linktrail>%s)''' % linktrail,
-                                flags=re.X)
+        self.linkR = re.compile(textlib.INTERNAL_LINK_REGEX % linktrail, re.X)
 
     def treat(self, refPage, disambPage):
         """
diff --git a/scripts/unlink.py b/scripts/unlink.py
index 103a9a0..8832a91 100755
--- a/scripts/unlink.py
+++ b/scripts/unlink.py
@@ -53,23 +53,7 @@
 
         self.generator = pageToUnlink.getReferences(
             namespaces=self.getOption('namespaces'), content=True)
-        # The regular expression which finds links. Results consist of four
-        # groups:
-        #
-        # group title is the target page title, that is, everything
-        # before | or ].
-        #
-        # group section is the page section.
-        # It'll include the # to make life easier for us.
-        #
-        # group label is the alternative link title, that's everything
-        # between | and ].
-        #
-        # group linktrail is the link trail, that's letters after ]] which are
-        # part of the word.
-        # note that the definition of 'letter' varies from language to language.
-        self.linkR = 
re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?(\|(?P<label>[^\]]*))?\]\](?P<linktrail>%s)'
-                                % linktrail)
+        self.linkR = re.compile(textlib.INTERNAL_LINK_REGEX % linktrail, re.X)
         self.comment = i18n.twtranslate(self.pageToUnlink.site, 
'unlink-unlinking',
                                         self.pageToUnlink.title())
 

-- 
To view, visit https://gerrit.wikimedia.org/r/205818
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: If72ae65a708b31d54ffd9e08408d5692fd3e0982
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Move internal link regex to textlib - change (pywikibot/core)

Reply via email to