Revision: 7358
Author: alexsh
Date: 2009-10-02 20:34:20 +0000 (Fri, 02 Oct 2009)
Log Message:
-----------
RedirectGenerator().get_redirect_pageids_via_api(): change API query type to
JSON (tested ok)
Modified Paths:
--------------
trunk/pywikipedia/redirect.py
Modified: trunk/pywikipedia/redirect.py
===================================================================
--- trunk/pywikipedia/redirect.py 2009-10-02 20:12:56 UTC (rev 7357)
+++ trunk/pywikipedia/redirect.py 2009-10-02 20:34:20 UTC (rev 7358)
@@ -61,7 +61,7 @@
#
#
from __future__ import generators
-import wikipedia, config
+import wikipedia, config, query
import xmlreader
import re, sys
@@ -259,8 +259,7 @@
else:
return redict
- def get_redirect_pageids_via_api(self, number = u'max', namespaces = [],
- start = None, until = None ):
+ def get_redirect_pageids_via_api(self, number = u'max', namespaces = [],
start = None, until = None ):
"""
Generator which will yield page IDs of Pages that are redirects.
Get number of page ids in one go.
@@ -268,45 +267,37 @@
In each namespace, start alphabetically from a pagetitle start, wich
need not exist.
"""
        # wikipedia.output(u'====> get_redirect_pageids_via_api(number=%s, ns=%d, start=%s, until=%s)' % (number, len(namespaces), start, until))
- import urllib
if namespaces == []:
namespaces = [ 0 ]
- apiQ0 = self.site.api_address()
- apiQ0 += 'action=query'
- apiQ0 += '&list=allpages'
- apiQ0 += '&apfilterredir=redirects'
- apiQ0 += '&aplimit=%s' % number
- apiQ0 += '&format=xml'
- apPageTitleRe = re.compile(' pageid="(.*?)" .*? title="(.*?)"')
- apPageIdRe = re.compile(' pageid="(.*?)"')
- apfromRe = re.compile(' apfrom="(.*?)"')
+ params = {
+ 'action':'query',
+ 'list':'allpages',
+ 'apfilterredir':'redirects',
+ 'aplimit':number,
+ 'apdir':'ascending',
+ #'':'',
+ }
+
for ns in namespaces:
# print (ns)
- apiQns = apiQ0 + '&apnamespace=%s' % ns
+ params['apnamespace'] = ns
# print (apiQns)
- while apiQns:
- apiQ = apiQns
+ while True:
if start:
-            apiQ += '&apfrom=%s' % urllib.quote(start.encode(site.encoding()))
+ params['apfrom'] = start
# print (apiQ)
- result = site.getUrl(apiQ)
+ data = query.GetData(params, self.site)
# wikipedia.output(u'===RESULT===\n%s\n' % result)
- if until:
- for (pageid, pagetitle) in apPageTitleRe.findall(result):
-                # wikipedia.output(u'===PAGEID=%s: %s' % (pageid, pagetitle)) ## TODO: make this a -verbose mode output, independant of -until
- if pagetitle > until:
- apiQns = None
- break
- yield pageid
+ for x in data['query']['allpages']:
+ if until and x['title'] == until:
+ break
+ yield x['pageid']
+
+ if 'query-continue' in data:
+                params['apfrom'] = data['query-continue']['allpages']['apfrom']
else:
- for pageid in apPageIdRe.findall(result):
- # wikipedia.output(u'===PAGEID=%s' % pageid)
- yield pageid
- m = apfromRe.search(result)
- if m:
- start = m.group(1)
- else:
break
+
def _next_redirects_via_api_commandline(self, apiQi, number = 'max',
namespaces = [],
start = None, until = None ):
@@ -318,8 +309,7 @@
namespaces = [ 0 ]
maxurllen = 1018 # accomodate "GET " + apiQ + CR + LF in 1024 bytes.
apiQ = ''
-        for pageid in self.get_redirect_pageids_via_api(number = number, namespaces = namespaces,
-                                           start = start, until = until ):
+        for pageid in self.get_redirect_pageids_via_api(number, namespaces, start, until):
if apiQ:
tmp = ( '%s|%s' % ( apiQ, pageid ) )
else:
_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn