John Vandenberg has uploaded a new change for review.
https://gerrit.wikimedia.org/r/150872
Change subject: Introduce static method Link.normalize(title)
......................................................................
Introduce static method Link.normalize(title)
Moves normalisation and basic validation logic from
Link.__init__ and Link.parse_test into a new static method,
which can be applied to a plain title string when validation
of site namespaces and interwikis is not desirable.
Fixes bug 61832
Change-Id: I7021e3d7e40d72fd74709f396e967c9803248c52
---
M pywikibot/page.py
M pywikibot/site.py
M scripts/cosmetic_changes.py
M tests/wikibase_tests.py
4 files changed, 92 insertions(+), 43 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/72/150872/1
diff --git a/pywikibot/page.py b/pywikibot/page.py
index 7a0d1bd..b17cded 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -1234,12 +1234,15 @@
# element into a list in the format used by old scripts
result = []
for template in templates:
- link = pywikibot.Link(template[0], self.site,
- defaultNamespace=10)
try:
+ link = pywikibot.Link(template[0], self.site,
+ defaultNamespace=10)
if link.canonical_title() not in titles:
continue
- except pywikibot.Error:
+ except pywikibot.InvalidTitle:
+ # TODO: this exception handling should not be necessary,
+ # however extract_templates_and_params on en:wp mainpage
+ # returns titles like '#if:{{Main Page banner}}'.
# this is a parser function or magic word, not template name
continue
args = template[1]
@@ -3505,10 +3508,14 @@
following a '|' character inside the link
"""
-
+ # The hash/number symbol and vertical pipe symbol are valid in a Link,
+ # but are not valid in a Title.
illegal_titles_pattern = re.compile(
+ r'''[\x23\x7c]'''
+ )
+ illegal_link_pattern = re.compile(
# Matching titles will be held as illegal.
- r'''[\x00-\x1f\x23\x3c\x3e\x5b\x5d\x7b\x7c\x7d\x7f]'''
+ r'''[\x00-\x1f\x3c\x3e\x5b\x5d\x7b\x7d\x7f]'''
# URL percent encoding sequences interfere with the ability
# to round-trip titles -- you can't link to them consistently.
u'|%[0-9A-Fa-f]{2}'
@@ -3531,27 +3538,40 @@
contain one (defaults to 0)
@type defaultNamespace: int
+ @exception InvalidTitle: The title is not valid.
"""
assert source is None or isinstance(source, pywikibot.site.BaseSite), \
"source parameter should be a Site object"
+
+ if not text:
+ raise pywikibot.InvalidTitle('No title')
self._text = text
self._source = source or pywikibot.Site()
self._defaultns = defaultNamespace
- # preprocess text (these changes aren't site-dependent)
- # First remove anchor, which is stored unchanged, if there is one
- if u"|" in self._text:
- self._text, self._anchor = self._text.split(u"|", 1)
- else:
- self._anchor = None
-
# Clean up the name, it can come from anywhere.
- # Convert HTML entities to unicode
- t = html2unicode(self._text)
# Convert URL-encoded characters to unicode
- t = url2unicode(t, site=self._source)
+ # FIXME: to be moved into normalize after
+ # I9ca2a933d227afa79de8ce402304592682785d17
+ t = url2unicode(self._text, site=self._source)
+
+ self._text = Link.normalize(t)
+
+ @staticmethod
+ def normalize(title):
+ """
+ Normalise a title, with basic non-site specific validation.
+
+ @param title: title to normalise
+ @type title: unicode
+ @return: unicode
+
+ @exception InvalidTitle: The title is not valid.
+ """
+ # Convert HTML entities to unicode
+ t = html2unicode(title)
# Normalize unicode string to a NFC (composed) format to allow
# proper string comparisons. According to
@@ -3563,8 +3583,8 @@
# This code was adapted from Title.php : secureAndSplit()
#
if u'\ufffd' in t:
- raise pywikibot.Error(
- "Title contains illegal char (\\uFFFD 'REPLACEMENT CHARACTER')")
+ raise pywikibot.InvalidTitle(
+ "Title contains illegal char \\uFFFD (REPLACEMENT CHARACTER)")
# Replace underscores by spaces
t = t.replace(u"_", u" ")
@@ -3572,10 +3592,41 @@
while u" " in t:
t = t.replace(u" ", u" ")
# Strip spaces at both ends
+ # TODO: Stripping trailing spaces breaks linktrails
+ # and may cause the same issue with leading spaces.
t = t.strip()
# Remove left-to-right and right-to-left markers.
t = t.replace(u"\u200e", u"").replace(u"\u200f", u"")
- self._text = t
+
+ # Reject illegal characters.
+ m = Link.illegal_link_pattern.search(t)
+ if m:
+ raise pywikibot.InvalidTitle(
+ u"%s contains illegal char(s) %s"
+ % (repr(t), repr(m.group(0))))
+
+ # Pages with "/./" or "/../" appearing in the URLs will
+ # often be unreachable due to the way web browsers deal
+ # * with 'relative' URLs. Forbid them explicitly.
+
+ if u'.' in t and (
+ t == u'.' or t == u'..'
+ or t.startswith(u"./")
+ or t.startswith(u"../")
+ or u"/./" in t
+ or u"/../" in t
+ or t.endswith(u"/.")
+ or t.endswith(u"/..")
+ ):
+ raise pywikibot.InvalidTitle(
+ "(contains . / combinations): '%s'"
+ % self._text)
+
+ # Magic tilde sequences? Nu-uh!
+ if u"~~~" in t:
+ raise pywikibot.InvalidTitle("(contains ~~~): '%s'" % self._text)
+
+ return t
def __repr__(self):
"""Return a more complete string representation."""
@@ -3628,6 +3679,14 @@
"""
self._site = self._source
self._namespace = self._defaultns
+
+ # preprocess text (these changes aren't site-dependent)
+ # First remove anchor, which is stored unchanged, if there is one
+ if u"|" in self._text:
+ self._text, self._anchor = self._text.split(u"|", 1)
+ else:
+ self._anchor = None
+
t = self._text
# This code was adapted from Title.php : secureAndSplit()
@@ -3698,27 +3757,6 @@
if m:
raise pywikibot.InvalidTitle(
u"%s contains illegal char(s) %s" % (repr(t),
repr(m.group(0))))
-
- # Pages with "/./" or "/../" appearing in the URLs will
- # often be unreachable due to the way web browsers deal
- # * with 'relative' URLs. Forbid them explicitly.
-
- if u'.' in t and (
- t == u'.' or t == u'..'
- or t.startswith(u"./")
- or t.startswith(u"../")
- or u"/./" in t
- or u"/../" in t
- or t.endswith(u"/.")
- or t.endswith(u"/..")
- ):
- raise pywikibot.InvalidTitle(
- "(contains . / combinations): '%s'"
- % self._text)
-
- # Magic tilde sequences? Nu-uh!
- if u"~~~" in t:
- raise pywikibot.InvalidTitle("(contains ~~~): '%s'" % self._text)
if self._namespace != -1 and len(t) > 255:
raise pywikibot.InvalidTitle("(over 255 bytes): '%s'" % t)
diff --git a/pywikibot/site.py b/pywikibot/site.py
index c26e8cd..b00c16c 100644
--- a/pywikibot/site.py
+++ b/pywikibot/site.py
@@ -171,7 +171,12 @@
user = user[0].upper() + user[1:]
if sysop:
sysop = sysop[0].upper() + sysop[1:]
+ if user:
+ user = pywikibot.Link.normalize(user)
+ if sysop:
+ sysop = pywikibot.Link.normalize(sysop)
self._username = [user, sysop]
+
self.use_hard_category_redirects = (
self.code in self.family.use_hard_category_redirects)
@@ -424,6 +429,14 @@
if title1 == title2:
return True
+ try:
+ title1 = pywikibot.Link.normalize(title1)
+ title2 = pywikibot.Link.normalize(title2)
+ except InvalidTitle:
+ return False
+ if title1 == title2:
+ return True
+
# determine whether titles contain namespace prefixes
if ":" in title1:
ns1, name1 = title1.split(":", 1)
diff --git a/scripts/cosmetic_changes.py b/scripts/cosmetic_changes.py
index 34a2d4f..a305c31 100755
--- a/scripts/cosmetic_changes.py
+++ b/scripts/cosmetic_changes.py
@@ -405,6 +405,8 @@
except pywikibot.InvalidTitle:
return match.group()
if namespace == 0:
+ # TODO: This logic could be merged with Link.normalize
+
# Replace underlines by spaces, also multiple underlines
titleWithSection = re.sub('_+', ' ', titleWithSection)
# Remove double spaces
diff --git a/tests/wikibase_tests.py b/tests/wikibase_tests.py
index aca60b3..a76457e 100644
--- a/tests/wikibase_tests.py
+++ b/tests/wikibase_tests.py
@@ -289,10 +289,6 @@
self.assertEquals(hasattr(item, '_content'), True)
self.assertEquals(item.exists(), True)
- def test_fromPage_invalid_title(self):
- page = pywikibot.Page(pywikibot.page.Link("[]", site))
- self.assertRaises(pywikibot.InvalidTitle, pywikibot.ItemPage.fromPage,
page)
-
def _test_fromPage_noitem(self, link):
"""Helper function to test a page without an associated item.
--
To view, visit https://gerrit.wikimedia.org/r/150872
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I7021e3d7e40d72fd74709f396e967c9803248c52
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits