John Vandenberg has uploaded a new change for review.
https://gerrit.wikimedia.org/r/205818
Change subject: Move internal link regex to textlib
......................................................................
Move internal link regex to textlib
Change-Id: If72ae65a708b31d54ffd9e08408d5692fd3e0982
---
M pywikibot/textlib.py
M scripts/cosmetic_changes.py
M scripts/disambredir.py
M scripts/fixing_redirects.py
M scripts/selflink.py
M scripts/solve_disambiguation.py
M scripts/unlink.py
7 files changed, 35 insertions(+), 58 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core
refs/changes/18/205818/1
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 4c2da87..efb5c9b 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -36,6 +36,25 @@
TEMP_REGEX = re.compile(
r'{{(?:msg:)?(?P<name>[^{\|]+?)(?:\|(?P<params>[^{]+?(?:{[^{]+?}[^{]*?)?))?}}')
+# The regular expression which finds links. Results consist of five
+# groups:
+# 1. <newline> holds any newlines directly preceding the link (may be empty).
+# 2. <title> is the target page title, everything before | or ].
+# 3. <section> is the page section. It'll include the # to make life
+# easier for us.
+# 4. <label> is the alternative link title between | and ].
+# 5. <linktrail> is the link trail: letters after ]] that are part of the word.
+# Note: the definition of 'letter' varies from language to language.
+# Note: site.linktrail() must be substituted into this regex, and flags=re.X
+# must be used, as the pattern is written in verbose (whitespace) mode.
+INTERNAL_LINK_REGEX = r"""
+ (?P<newline>[\n]*)
+ \[\[ (?P<title> [^\[\]\|#]*)
+ (?P<section> \#[^\]\|]*)?
+ (\|(?P<label> [^\]]*))?
+ \]\] (?P<linktrail>%s)
+"""
+
NON_LATIN_DIGITS = {
'ckb': u'٠١٢٣٤٥٦٧٨٩',
'fa': u'۰۱۲۳۴۵۶۷۸۹',
diff --git a/scripts/cosmetic_changes.py b/scripts/cosmetic_changes.py
index bb99a30..d0dab20 100755
--- a/scripts/cosmetic_changes.py
+++ b/scripts/cosmetic_changes.py
@@ -441,7 +441,10 @@
# helper function which works on one link and either returns it
# unmodified, or returns a replacement.
def handleOneLink(match):
- titleWithSection = match.group('titleWithSection')
+ titleWithSection = match.group('title')
+ if match.group('section'):
+ titleWithSection += match.group('section')
+
label = match.group('label')
trailingChars = match.group('linktrail')
newline = match.group('newline')
@@ -546,16 +549,7 @@
return match.group()
trailR = re.compile(self.site.linktrail())
- # The regular expression which finds links. Results consist of four groups:
- # group <newline> depends whether the links starts with a new line.
- # group <titleWithSection> is the page title and section, that is,
- # everything before | or ]. It'll include the # to make life easier for us.
- # group <label> is the alternative link title between | and ].
- # group <linktrail> is the link trail after ]] which are part of the word.
- # note that the definition of 'letter' varies from language to language.
- linkR = re.compile(
-
r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>'
+
- self.site.linktrail() + ')')
+ linkR = re.compile(textlib.INTERNAL_LINK % self.site.linktrail(), re.X)
text = textlib.replaceExcept(text, linkR, handleOneLink,
['comment', 'math', 'nowiki', 'pre',
diff --git a/scripts/disambredir.py b/scripts/disambredir.py
index fe5bd49..5eae272 100755
--- a/scripts/disambredir.py
+++ b/scripts/disambredir.py
@@ -20,8 +20,10 @@
__version__ = '$Id$'
#
import re
+
import pywikibot
-from pywikibot import i18n, pagegenerators
+
+from pywikibot import i18n, pagegenerators, textlib
from pywikibot.tools import first_lower, first_upper as firstcap
msg = {
@@ -45,9 +47,7 @@
# make a backup of the original text so we can show the changes later
mysite = pywikibot.Site()
linktrail = mysite.linktrail()
- linkR = re.compile(
-
r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?(\|(?P<label>[^\]]*))?\]\](?P<linktrail>%s)'
- % linktrail)
+ linkR = re.compile(textlib.INTERNAL_LINK_REGEX % linktrail, re.X)
curpos = 0
# This loop will run until we have finished the current page
while True:
diff --git a/scripts/fixing_redirects.py b/scripts/fixing_redirects.py
index ebfb555..1624a62 100755
--- a/scripts/fixing_redirects.py
+++ b/scripts/fixing_redirects.py
@@ -25,9 +25,10 @@
#
import re
import sys
+
import pywikibot
-from pywikibot import pagegenerators
-from pywikibot import i18n
+
+from pywikibot import i18n, pagegenerators, textlib
from pywikibot.tools import first_lower, first_upper as firstcap
# This is required for the text that is shown when you run this script
@@ -65,8 +66,7 @@
linktrail = mysite.linktrail()
# make a backup of the original text so we can show the changes later
- linkR = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?'
- r'(\|(?P<label>[^\]]*))?\]\](?P<linktrail>' + linktrail
+ ')')
+ linkR = re.compile(textlib.INTERNAL_LINK_REGEX % linktrail, re.X)
curpos = 0
# This loop will run until we have finished the current page
while True:
diff --git a/scripts/selflink.py b/scripts/selflink.py
index bcd284c..6ff897c 100755
--- a/scripts/selflink.py
+++ b/scripts/selflink.py
@@ -39,22 +39,7 @@
super(SelflinkBot, self).__init__(**kwargs)
self.generator = generator
linktrail = pywikibot.Site().linktrail()
- # The regular expression which finds links. Results consist of four
- # groups:
- # group title is the target page title, everything before | or ].
- # group section is the page section. It'll include the # to make life
- # easier for us.
- # group label is the alternative link title, that's everything between
- # | and ].
- # group linktrail is the link trail, that's letters after ]] which are
- # part of the word.
- # note that the definition of 'letter' varies from language to
- # language.
- self.linkR = re.compile(
- r'\[\[(?P<title>[^\]\|#]*)'
- r'(?P<section>#[^\]\|]*)?'
- r'(\|(?P<label>[^\]]*))?\]\]'
- r'(?P<linktrail>' + linktrail + ')')
+ self.linkR = re.compile(textlib.INTERNAL_LINK_REGEX % linktrail, re.X)
def handleNextLink(self, page, match, context=100):
"""Process the next link on a page, offering the user choices.
diff --git a/scripts/solve_disambiguation.py b/scripts/solve_disambiguation.py
index b91f0b3..f41bc05 100755
--- a/scripts/solve_disambiguation.py
+++ b/scripts/solve_disambiguation.py
@@ -535,12 +535,7 @@
# group linktrail is the link trail, that's letters after ]] which
# are part of the word.
# note that the definition of 'letter' varies from language to language.
- self.linkR = re.compile(r'''
- \[\[ (?P<title> [^\[\]\|#]*)
- (?P<section> \#[^\]\|]*)?
- (\|(?P<label> [^\]]*))? \]\]
- (?P<linktrail>%s)''' % linktrail,
- flags=re.X)
+ self.linkR = re.compile(textlib.INTERNAL_LINK_REGEX % linktrail, re.X)
def treat(self, refPage, disambPage):
"""
diff --git a/scripts/unlink.py b/scripts/unlink.py
index 103a9a0..8832a91 100755
--- a/scripts/unlink.py
+++ b/scripts/unlink.py
@@ -53,23 +53,7 @@
self.generator = pageToUnlink.getReferences(
namespaces=self.getOption('namespaces'), content=True)
- # The regular expression which finds links. Results consist of four
- # groups:
- #
- # group title is the target page title, that is, everything
- # before | or ].
- #
- # group section is the page section.
- # It'll include the # to make life easier for us.
- #
- # group label is the alternative link title, that's everything
- # between | and ].
- #
- # group linktrail is the link trail, that's letters after ]] which are
- # part of the word.
- # note that the definition of 'letter' varies from language to language.
- self.linkR =
re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?(\|(?P<label>[^\]]*))?\]\](?P<linktrail>%s)'
- % linktrail)
+ self.linkR = re.compile(textlib.INTERNAL_LINK_REGEX % linktrail, re.X)
self.comment = i18n.twtranslate(self.pageToUnlink.site,
'unlink-unlinking',
self.pageToUnlink.title())
--
To view, visit https://gerrit.wikimedia.org/r/205818
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: If72ae65a708b31d54ffd9e08408d5692fd3e0982
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits