Revision: 7720
Author:   alexsh
Date:     2009-11-30 18:48:56 +0000 (Mon, 30 Nov 2009)

Log Message:
-----------
Page().getReferences(): add an API mode that uses query:backlinks and query:embeddedin.
Move the previous (non-API) implementation to getReferencesOld().

Modified Paths:
--------------
    trunk/pywikipedia/wikipedia.py

Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py      2009-11-30 14:51:16 UTC (rev 7719)
+++ trunk/pywikipedia/wikipedia.py      2009-11-30 18:48:56 UTC (rev 7720)
@@ -1252,14 +1252,12 @@
             self._isDisambig = len(disambigInPage) > 0
         return self._isDisambig
 
-    def getReferences(self,
-            follow_redirects=True, withTemplateInclusion=True,
-            onlyTemplateInclusion=False, redirectsOnly=False):
-        """Yield all pages that link to the page.
+    def getReferences(self, follow_redirects=True, withTemplateInclusion=True,
+            onlyTemplateInclusion=False, redirectsOnly=False, internal = False):
+        """Yield all pages that link to the page by API
 
         If you need a full list of referring pages, use this:
             pages = [page for page in s.getReferences()]
-
         Parameters:
         * follow_redirects      - if True, also returns pages that link to a
                                   redirect pointing to the page.
@@ -1270,6 +1268,98 @@
         * redirectsOnly         - if True, only returns redirects to self.
 
         """
+        try:
+            if config.use_api and self.site().versionnumber() > 9:
+                d = self.site().apipath()
+                del d
+            else:
+                raise NotImplementedError
+        except NotImplementedError:
+            for s in self.getReferencesOld(follow_redirects, withTemplateInclusion, onlyTemplateInclusion, redirectsOnly):
+                yield s
+            return
+            
+        params = {
+            'action': 'query',
+            'list': [],
+        }
+        if not onlyTemplateInclusion:
+            params['list'].append('backlinks')
+            params['bltitle'] = self.title()
+            params['bllimit'] = config.special_page_limit
+            params['blfilterredir'] = 'all'
+            if follow_redirects:
+                params['blredirect'] = 1
+            if redirectsOnly:
+                params['blfilterredir'] = 'redirects'
+            if not self.site().isAllowed('apihighlimits') and config.special_page_limit > 500:
+                params['bllimit'] = 500
+        
+        if withTemplateInclusion or onlyTemplateInclusion:
+            params['list'].append('embeddedin')
+            params['eititle'] = self.title()
+            params['eilimit'] = config.special_page_limit
+            params['eifilterredir'] = 'all'
+            if follow_redirects:
+                params['eiredirect'] = 1
+            if redirectsOnly:
+                params['eifilterredir'] = 'redirects'
+            if not self.site().isAllowed('apihighlimits') and config.special_page_limit > 500:
+                params['eilimit'] = 5000
+        
+        allDone = False
+        
+        while not allDone:
+            if not internal:
+                output(u'Getting references to %s via API...' % self.aslink())
+            
+            datas = query.GetData(params, self.site())
+            data = datas['query'].values()
+            if len(data) == 2:
+                data = data[0] + data[1]
+            else:
+                data = data[0]
+            
+            refPages = set()
+            for blp in data:
+                pg = Page(self.site(), blp['title'], defaultNamespace = blp['ns'])
+                if pg in refPages:
+                    continue
+                
+                yield pg
+                refPages.add(pg)
+                if follow_redirects and 'redirect' in blp and 'redirlinks' in blp:
+                    for p in blp['redirlinks']:
+                        plk = Page(self.site(), p['title'], defaultNamespace = p['ns'])
+                        if plk in refPages:
+                            continue
+                        
+                        yield plk
+                        refPages.add(plk)
+                        if follow_redirects and 'redirect' in p:
+                            for zms in plk.getReferences(follow_redirects, withTemplateInclusion, 
+                                              onlyTemplateInclusion, redirectsOnly, internal=True):
+                                yield zms
+                        else:
+                            continue
+                else:
+                    continue
+            
+            if 'query-continue' in datas:
+                if 'backlinks' in datas['query-continue']:
+                    params['blcontinue'] = datas['query-continue']['backlinks']['blcontinue']
+                
+                if 'embeddedin' in datas['query-continue']:
+                    params['eicontinue'] = datas['query-continue']['embeddedin']['eicontinue']
+            else:
+                allDone = True
+            
+    
+    def getReferencesOld(self,
+            follow_redirects=True, withTemplateInclusion=True,
+            onlyTemplateInclusion=False, redirectsOnly=False):
+        """Yield all pages that link to the page.
+        """
         # Temporary bug-fix while researching more robust solution:
         if config.special_page_limit > 999:
             config.special_page_limit = 999
@@ -5613,7 +5703,15 @@
         if retry is None:
             retry = config.retry_on_fail
 
-        headers = {'User-agent': useragent,}
+        headers = {
+            'User-agent': useragent,
+            #'Accept-Language': config.mylang,
+            #'Accept-Charset': config.textfile_encoding,
+            #'Keep-Alive': '115',
+            #'Connection': 'keep-alive',
+            #'Cache-Control': 'max-age=0',
+            #'': '',
+        }
         
         if not no_hostname and self.cookies(sysop = sysop):
             headers['Cookie'] = self.cookies(sysop = sysop)



_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn

Reply via email to