http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11195
Revision: 11195
Author: russblau
Date: 2013-03-08 20:06:40 +0000 (Fri, 08 Mar 2013)
Log Message:
-----------
Fix Bug #3606570: Namespace alias error with preloading and page.exists(); this
turned out to be a pervasive problem caused by an unannounced (IIRC) breaking
change in the API. Formerly, the API always returned a page title that
contained the site's canonical namespace prefix, for pages outside namespace 0;
now, it may return a title using a namespace alias instead, as in the case of
the gender-specific namespaces identified in the bug report. Because it is
conceivable that other cases of non-canonical namespaces may exist either now
or in the future, this is a more general solution that checks all possible
namespace aliases before rejecting a page title.
Modified Paths:
--------------
branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2013-03-08 08:12:45 UTC (rev 11194)
+++ branches/rewrite/pywikibot/site.py 2013-03-08 20:06:40 UTC (rev 11195)
@@ -392,6 +392,48 @@
% locals(),
re.IGNORECASE | re.UNICODE | re.DOTALL)
+ def sametitle(self, title1, title2):
+ """Return True iff title1 and title2 identify the same wiki page."""
+ # title1 and title2 may be unequal but still identify the same page,
+ # if they use different aliases for the same namespace
+ def valid_namespace(text, number):
+            """Return True iff text is a valid alias for namespace with given
+            number."""
+ for alias in self.namespace(number, all=True):
+ if text.lower() == alias.lower():
+ return True
+ return False
+ if title1 == title2:
+ return True
+ # determine whether titles contain namespace prefixes
+ if ":" in title1:
+ ns1, name1 = title1.split(":", 1)
+ else:
+ ns1, name1 = 0, title1
+ if ":" in title2:
+ ns2, name2 = title2.split(":", 1)
+ else:
+ ns2, name2 = 0, title2
+ for space in self.namespaces(): # iterate over all valid namespaces
+ if type(ns1) is not int and valid_namespace(ns1, space):
+ ns1 = space
+ if type(ns2) is not int and valid_namespace(ns2, space):
+ ns2 = space
+ if type(ns1) is not int:
+ # no valid namespace prefix found, so the string followed by ":"
+ # must be part of the title
+ name1 = ns1 + ":" + name1
+ ns1 = 0
+ if type(ns2) is not int:
+ name2 = ns2 + ":" + name2
+ ns2 = 0
+ if ns1 != ns2:
+ # pages in different namespaces
+ return False
+ if self.case() == "first-letter":
+ name1 = name1[:1].upper() + name1[1:]
+ name2 = name2[:1].upper() + name2[1:]
+ return name1 == name2
+
# namespace shortcuts for backwards-compatibility
def special_namespace(self):
@@ -1132,19 +1174,10 @@
titles=title.encode(self.encoding()),
inprop="protection")
for pageitem in query:
- if pageitem['title'] != title:
- if pageitem['title'] in query.normalized \
- and query.normalized[pageitem['title']] == title:
- # page title was normalized by api
- # this should never happen because the Link() constructor
- # normalizes the title
- pywikibot.log(
- u"loadpageinfo: Page title '%s' was normalized to '%s'"
- % (title, pageitem['title']))
- else:
- pywikibot.warning(
- u"loadpageinfo: Query on %s returned data on '%s'"
- % (page, pageitem['title']))
+ if not self.sametitle(pageitem['title'], title):
+ pywikibot.warning(
+ u"loadpageinfo: Query on %s returned data on '%s'"
+ % (page, pageitem['title']))
continue
api.update_page(page, pageitem)
@@ -1165,7 +1198,7 @@
"metadata", "archivename"],
**args)
for pageitem in query:
- if pageitem['title'] != title:
+ if not self.sametitle(pageitem['title'], title):
raise Error(
u"loadimageinfo: Query on %s returned data on '%s'"
% (page, pageitem['title']))
@@ -1236,7 +1269,7 @@
raise pywikibot.CircularRedirect(redirmap[title])
pagedata = result['query']['pages'].values()[0]
# there should be only one value in 'pages', and it is the target
- if pagedata['title'] == target_title:
+ if self.sametitle(pagedata['title'], target_title):
target = pywikibot.Page(self, pagedata['title'], pagedata['ns'])
api.update_page(target, pagedata)
page._redirtarget = target
@@ -1288,10 +1321,21 @@
pywikibot.debug(u"Preloading %s" % pagedata, _logger)
try:
if pagedata['title'] not in cache:
- pywikibot.warning(
- u"preloadpages: Query returned unexpected title '%s'"
- % pagedata['title'])
- continue
+# API always returns a "normalized" title which is
+# usually the same as the canonical form returned by
+# page.title(), but sometimes not (e.g.,
+# gender-specific localizations of "User" namespace).
+# This checks to see if there is a normalized title in
+# the response that corresponds to the canonical form
+# used in the query.
+                        if pagedata['title'] in rvgen.normalized \
+                           and rvgen.normalized[pagedata['title']] in cache:
+                            cache[pagedata['title']] = cache[rvgen.normalized[pagedata['title']]]
+ else:
+ pywikibot.warning(
+                            u"preloadpages: Query returned unexpected title '%s'"
+ % pagedata['title'])
+ continue
except KeyError:
pywikibot.debug(u"No 'title' in %s" % pagedata, _logger)
pywikibot.debug(u"pageids=%s" % pageids, _logger)
@@ -1314,7 +1358,7 @@
intoken=tokentype,
site=self)
for item in query:
- if item['title'] != page.title(withSection=False):
+                if not self.sametitle(item['title'],
+                                      page.title(withSection=False)):
raise Error(
u"token: Query on page %s returned data on page [[%s]]"
% (page.title(withSection=False, asLink=True),
@@ -1705,21 +1749,11 @@
rvgen.continuekey = "revisions"
for pagedata in rvgen:
if page is not None:
- if pagedata['title'] != page.title(withSection=False):
- ok = False
- namespace = page.namespace()
- # gender settings ?
- if namespace in [2, 3]:
- ns, title = pagedata['title'].split(':', 1)
- if ns in page.site.namespace(namespace, all=True) and \
- title == page.title(withSection=False,
- withNamespace=False):
-
- ok = True
- if not ok:
- raise Error(
- u"loadrevisions: Query on %s returned data on '%s'"
- % (page, pagedata['title']))
+ if not self.sametitle(pagedata['title'],
+ page.title(withSection=False)):
+ raise Error(
+ u"loadrevisions: Query on %s returned data on '%s'"
+ % (page, pagedata['title']))
if "missing" in pagedata:
raise NoPage(page)
else:
@@ -1739,7 +1773,7 @@
titles=lltitle.encode(self.encoding()),
step=step, total=total)
for pageitem in llquery:
- if pageitem['title'] != lltitle:
+ if not self.sametitle(pageitem['title'], lltitle):
raise Error(
u"getlanglinks: Query on %s returned data on '%s'"
% (page, pageitem['title']))
@@ -1757,7 +1791,7 @@
titles=eltitle.encode(self.encoding()),
step=step, total=total)
for pageitem in elquery:
- if pageitem['title'] != eltitle:
+ if not self.sametitle(pageitem['title'], eltitle):
raise RuntimeError(
"getlanglinks: Query on %s returned data on '%s'"
% (page, pageitem['title']))
@@ -1773,7 +1807,7 @@
type_arg="categoryinfo",
titles=cititle.encode(self.encoding()))
for pageitem in ciquery:
- if pageitem['title'] != cititle:
+ if not self.sametitle(pageitem['title'], cititle):
raise Error(
u"categoryinfo: Query on %s returned data on '%s'"
% (category, pageitem['title']))
@@ -2591,7 +2625,7 @@
req['captchaword'] = input(captcha["question"])
continue
elif "url" in captcha:
- webbrowser.open(url)
+ webbrowser.open(captcha["url"])
req['captchaword'] = cap_answerwikipedia.input(
"Please view CAPTCHA in your browser, then type answer here:")
continue
@@ -3352,3 +3386,5 @@
f = open(fn)
self._cookies[index] = '; '.join([x.strip() for x in
f.readlines()])
f.close()
+
+
_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn