Revision: 7358
Author:   alexsh
Date:     2009-10-02 20:34:20 +0000 (Fri, 02 Oct 2009)

Log Message:
-----------
RedirectGenerator().get_redirect_pageids_via_api(): change API query format to 
JSON (tested OK)

Modified Paths:
--------------
    trunk/pywikipedia/redirect.py

Modified: trunk/pywikipedia/redirect.py
===================================================================
--- trunk/pywikipedia/redirect.py       2009-10-02 20:12:56 UTC (rev 7357)
+++ trunk/pywikipedia/redirect.py       2009-10-02 20:34:20 UTC (rev 7358)
@@ -61,7 +61,7 @@
 #
 #
 from __future__ import generators
-import wikipedia, config
+import wikipedia, config, query
 import xmlreader
 import re, sys
 
@@ -259,8 +259,7 @@
         else:
             return redict
 
-    def get_redirect_pageids_via_api(self, number = u'max', namespaces = [],
-                             start = None, until = None ):
+    def get_redirect_pageids_via_api(self, number = u'max', namespaces = [], 
start = None, until = None ):
         """
         Generator which will yield page IDs of Pages that are redirects.
         Get number of page ids in one go.
@@ -268,45 +267,37 @@
         In each namespace, start alphabetically from a pagetitle start, wich 
need not exist.
         """
         # wikipedia.output(u'====> get_redirect_pageids_via_api(number=%s, 
#ns=%d, start=%s, until=%s)' % (number, len(namespaces), start, until))
-        import urllib
         if namespaces == []:
             namespaces = [ 0 ]
-        apiQ0 = self.site.api_address()
-        apiQ0 += 'action=query'
-        apiQ0 += '&list=allpages'
-        apiQ0 += '&apfilterredir=redirects'
-        apiQ0 += '&aplimit=%s' % number
-        apiQ0 += '&format=xml'
-        apPageTitleRe = re.compile(' pageid="(.*?)" .*? title="(.*?)"')
-        apPageIdRe = re.compile(' pageid="(.*?)"')
-        apfromRe = re.compile(' apfrom="(.*?)"')
+        params = {
+            'action':'query',
+            'list':'allpages',
+            'apfilterredir':'redirects',
+            'aplimit':number,
+            'apdir':'ascending',
+            #'':'',
+        }
+        
         for ns in namespaces:
             # print (ns)
-            apiQns = apiQ0 + '&apnamespace=%s' % ns
+            params['apnamespace'] = ns
             # print (apiQns)
-            while apiQns:
-                apiQ = apiQns
+            while True:
                 if start:
-                    apiQ += '&apfrom=%s' % 
urllib.quote(start.encode(site.encoding()))
+                    params['apfrom'] = start
                 # print (apiQ)
-                result = site.getUrl(apiQ)
+                data = query.GetData(params, self.site)
                 # wikipedia.output(u'===RESULT===\n%s\n' % result)
-                if until:
-                    for (pageid, pagetitle) in apPageTitleRe.findall(result):
-                        # wikipedia.output(u'===PAGEID=%s: %s' % (pageid, 
pagetitle)) ## TODO: make this a -verbose mode output, independant of -until
-                        if pagetitle > until:
-                           apiQns = None
-                           break
-                        yield pageid
+                for x in data['query']['allpages']:
+                    if until and x['title'] == until:
+                        break
+                    yield x['pageid']
+                    
+                if 'query-continue' in data:
+                    params['apfrom'] = 
data['query-continue']['allpages']['apfrom']
                 else:
-                    for pageid in apPageIdRe.findall(result):
-                        # wikipedia.output(u'===PAGEID=%s' % pageid)
-                        yield pageid
-                m = apfromRe.search(result)
-                if m:
-                    start = m.group(1)
-                else:
                     break
+                
 
     def _next_redirects_via_api_commandline(self, apiQi, number = 'max', 
namespaces = [],
                             start = None, until = None ):
@@ -318,8 +309,7 @@
             namespaces = [ 0 ]
         maxurllen = 1018    # accomodate "GET " + apiQ + CR + LF in 1024 bytes.
         apiQ = ''
-        for pageid in self.get_redirect_pageids_via_api(number = number, 
namespaces = namespaces,
-                             start = start, until = until ):
+        for pageid in self.get_redirect_pageids_via_api(number, namespaces, 
start, until):
             if apiQ:
                 tmp = ( '%s|%s' % ( apiQ, pageid ) )
             else:



_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn

Reply via email to