Revision: 7697
Author:   alexsh
Date:     2009-11-26 09:33:45 +0000 (Thu, 26 Nov 2009)

Log Message:
-----------
separate getAll into batched retrievals for huge page lists (only takes
effect when the page count passed to getAll exceeds config.special_page_limit / 4)

Modified Paths:
--------------
    trunk/pywikipedia/wikipedia.py

Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py      2009-11-26 09:29:40 UTC (rev 7696)
+++ trunk/pywikipedia/wikipedia.py      2009-11-26 09:33:45 UTC (rev 7697)
@@ -3848,7 +3848,25 @@
     # TODO: why isn't this a Site method?
     pages = list(pages)  # if pages is an iterator, we need to make it a list
     output(u'Getting %d pages from %s...' % (len(pages), site))
-    _GetAll(site, pages, throttle, force).run()
+    limit = config.special_page_limit / 4 # default is 500/4; smaller batches should be gentler on the server.
+    
+    if len(pages) > limit:
+        # split the page list into batches for bulk retrieval
+        
+        for pagg in range(0, len(pages), limit):
+            if pagg == range(0, len(pages), limit)[-1]: # last batch
+                k = pages[pagg:]
+                output(u'Getting pages %d - %d of %d...' % (pagg + 1, len(pages), len(pages)))
+                _GetAll(site, k, throttle, force).run()
+                pages[pagg:] = k
+            else:
+                k = pages[pagg:pagg + limit]
+                output(u'Getting pages %d - %d of %d...' % (pagg + 1, pagg + limit, len(pages)))
+                _GetAll(site, k, throttle, force).run()
+                pages[pagg:pagg + limit] = k
+            get_throttle(requestsize = len(pages) / 10) # one batch retrieval takes about 7.7 sec.
+    else:
+        _GetAll(site, pages, throttle, force).run()
 
 
 # Library functions
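
For reference, the change above is the standard chunked-iteration pattern:
slice the page list into fixed-size batches and retrieve each batch
separately. Below is a minimal, self-contained sketch of that pattern, not
the actual pywikipedia code; fetch_batch is a hypothetical stand-in for
_GetAll(site, batch, throttle, force).run(), and the batch size is an
illustrative value mirroring config.special_page_limit / 4.

    # Minimal sketch of the batching pattern from r7697 (illustrative only,
    # not the actual pywikipedia code). Runs under Python 2 and 3.

    def fetch_in_batches(pages, limit, fetch_batch):
        # Process `pages` in slices of at most `limit` items. `fetch_batch`
        # is a hypothetical callback standing in for
        # _GetAll(site, batch, throttle, force).run().
        for start in range(0, len(pages), limit):
            batch = pages[start:start + limit]
            print('Getting pages %d - %d of %d...'
                  % (start + 1, start + len(batch), len(pages)))
            fetch_batch(batch)

    if __name__ == '__main__':
        # Toy usage: 11 "pages" in batches of 4 -> slices of 4, 4, 3.
        fetch_in_batches(list(range(11)), 4, lambda batch: None)

Note that the slice pages[start:start + limit] already yields a short final
batch on its own; the explicit last-batch branch in the diff exists mainly
so the progress message reports the true end index, which start + len(batch)
handles here in one path.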


