Revision: 8324
Author:   russblau
Date:     2010-06-24 19:09:47 +0000 (Thu, 24 Jun 2010)

Log Message:
-----------
Add "content" option for PageGenerator and all methods that use it, to allow 
preloading of page content without a separate ".preloadpages()" call.

Modified Paths:
--------------
    branches/rewrite/pywikibot/data/api.py
    branches/rewrite/pywikibot/page.py
    branches/rewrite/pywikibot/pagegenerators.py
    branches/rewrite/pywikibot/site.py

Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py      2010-06-24 19:00:35 UTC (rev 
8323)
+++ branches/rewrite/pywikibot/data/api.py      2010-06-24 19:09:47 UTC (rev 
8324)
@@ -409,12 +409,13 @@
             if name not in _modules:
                 self.get_module()
                 break
+        self.request = Request(**kwargs)
         self.prefix = None
         self.update_limit() # sets self.prefix
-        if self.query_limit is not None and "generator" in kwargs:
+        if self.api_limit is not None and "generator" in kwargs:
             self.prefix = "g" + self.prefix
-        self.request = Request(**kwargs)
         self.limit = None
+        self.query_limit = self.api_limit
         if "generator" in kwargs:
             self.resultkey = "pages"        # name of the "query" subelement 
key
         else:                               # to look for when iterating
@@ -448,8 +449,10 @@
         limit = int(value)
         # don't update if limit is greater than maximum allowed by API
         self.update_limit()
-        if self.query_limit is None or limit < self.query_limit:
-            self.query_limit = int(limit)
+        if self.api_limit is None:
+            self.query_limit = limit
+        else:
+            self.query_limit = min(self.api_limit, limit)
 
     def set_maximum_items(self, value):
         """Set the maximum number of items to be retrieved from the wiki.
@@ -466,23 +469,23 @@
         self.limit = int(value)
 
     def update_limit(self):
-        """Set query_limit for self.module based on api response"""
+        """Set query limit for self.module based on api response"""
 
-        self.query_limit = None
+        self.api_limit = None
         for mod in self.module.split('|'):
             for param in _modules[mod].get("parameters", []):
                 if param["name"] == "limit":
                     if (self.site.logged_in()
                             and "apihighlimits" in
                                 self.site.getuserinfo()["rights"]):
-                        self.query_limit = int(param["highmax"])
+                        self.api_limit = int(param["highmax"])
                     else:
-                        self.query_limit = int(param["max"])
+                        self.api_limit = int(param["max"])
                     if self.prefix is None:
                         self.prefix = _modules[mod]["prefix"]
                     pywikibot.debug(u"%s: Set query_limit to %i."
                                       % (self.__class__.__name__,
-                                         self.query_limit),
+                                         self.api_limit),
                                     _logger)
                     return
 
@@ -517,6 +520,13 @@
                     new_limit = min(self.query_limit, self.limit - count)
                 else:
                     new_limit = None
+                if "rvprop" in self.request \
+                        and "content" in self.request["rvprop"]:
+                    # queries that retrieve page content have lower limits
+                    # Note: although API allows up to 500 pages for content
+                    #   queries, these sometimes result in server-side errors
+                    #   so use 250 as a safer limit
+                    new_limit = min(new_limit, self.api_limit // 10, 250)
                 if new_limit is not None:
                     self.request[self.prefix+"limit"] = str(new_limit)
             try:
@@ -596,30 +606,39 @@
     this class iterate Page objects.
 
     """
-    def __init__(self, generator, **kwargs):
+    def __init__(self, generator, g_content=False, **kwargs):
         """
         Required and optional parameters are as for C{Request}, except that
         action=query is assumed and generator is required.
 
         @param generator: the "generator=" type from api.php
         @type generator: str
+        @param g_content: if True, retrieve the contents of the current
+            version of each Page (default False)
 
         """
-        QueryGenerator.__init__(self, generator=generator, **kwargs)
         # get some basic information about every page generated
-        if 'prop' in self.request:
-            self.request['prop'] += "|info|imageinfo|categoryinfo"
+        if 'prop' in kwargs:
+            kwargs['prop'] += "|info|imageinfo|categoryinfo"
         else:
-            self.request['prop'] = 'info|imageinfo|categoryinfo'
-        if "inprop" in self.request:
-            if "protection" not in self.request["inprop"]:
-                self.request["inprop"] += "|protection"
+            kwargs['prop'] = 'info|imageinfo|categoryinfo'
+        if g_content:
+            # retrieve the current revision
+            kwargs['prop'] += "|revisions"
+            if "rvprop" in kwargs:
+                kwargs["rvprop"] += "|ids|timestamp|flags|comment|user|content"
+            else:
+                kwargs["rvprop"] = "ids|timestamp|flags|comment|user|content"
+        if "inprop" in kwargs:
+            if "protection" not in kwargs["inprop"]:
+                kwargs["inprop"] += "|protection"
         else:
-            self.request['inprop'] = 'protection'
-        if "iiprop" in self.request:
-            self.request["iiprop"] += 
'timestamp|user|comment|url|size|sha1|metadata'
+            kwargs['inprop'] = 'protection'
+        if "iiprop" in kwargs:
+            kwargs["iiprop"] += '|timestamp|user|comment|url|size|sha1|metadata'
         else:
-            self.request['iiprop'] = 
'timestamp|user|comment|url|size|sha1|metadata'
+            kwargs['iiprop'] = 'timestamp|user|comment|url|size|sha1|metadata'
+        QueryGenerator.__init__(self, generator=generator, **kwargs)
         self.resultkey = "pages" # element to look for in result
 
     def result(self, pagedata):

Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py  2010-06-24 19:00:35 UTC (rev 8323)
+++ branches/rewrite/pywikibot/page.py  2010-06-24 19:09:47 UTC (rev 8324)
@@ -74,72 +74,20 @@
         """
         if isinstance(source, pywikibot.site.BaseSite):
             self._link = Link(title, source=source, defaultNamespace=ns)
-##            self._site = source
-##            if ns not in source.namespaces():
-##                raise pywikibot.Error(
-##                      "Invalid namespace '%i' for site %s."
-##                      % (ns, source.sitename()))
-##            self._ns = ns
-##            if ns and not title.startswith(source.namespace(ns)+u":"):
-##                title = source.namespace(ns) + u":" + title
-##            elif not ns and u":" in title:
-##                pos = title.index(u':')
-##                nsindex = source.ns_index(title[ :pos])
-##                if nsindex:
-##                    self._ns = nsindex
-##                    # normalize namespace, in case an alias was used
-##                    title = source.namespace(nsindex) + title[pos: ]
-##            if u"#" in title:
-##                title, self._section = title.split(u"#", 1)
-##            else:
-##                self._section = None
-##            if not title:
-##                raise pywikibot.Error(
-##                      "Page object cannot be created from Site without 
title.")
-##            self._title = title
         elif isinstance(source, Page):
             # copy all of source's attributes to this object
             self.__dict__ = source.__dict__
             if title:
                 # overwrite title
                 self._link = Link(title, source=source.site, 
defaultNamespace=ns)
-##                if ":" in title:
-##                    prefix = title[ :title.index(":")]
-##                    self._ns = self._site.ns_index(prefix)
-##                    if self._ns is None:
-##                        self._ns = 0
-##                    else:
-##                        title = title[title.index(":")+1 : ].strip(" _")
-##                        self._title = "%s:%s" % (
-##                                         self.site.namespace(self._ns),
-##                                         self._title)
-##                else:
-##                    self._ns = 0
-##                if "#" in title:
-##                    self._section = title[title.index("#") + 1 : ].strip(" 
_")
-##                    title = title[ : title.index("#")].strip(" _")
-##                self._title = title
         elif isinstance(source, Link):
             self._link = source
-##            self._site = source.site
-##            self._section = source.section
-##            self._ns = source.namespace
-##            self._title = source.title
-##            # reassemble the canonical title from components
-##            if self._ns:
-##                self._title = "%s:%s" % (self.site.namespace(self._ns),
-##                                         self._title)
         else:
             raise pywikibot.Error(
                   "Invalid argument type '%s' in Page constructor: %s"
                   % (type(source), source))
-##        if self._section is not None:
-##            self._title = self._title + "#" + self._section
         self._revisions = {}
 
-##        # Always capitalize the first letter
-##        self._title = self._title[:1].upper() + self._title[1:]
-
     @property
     def site(self):
         """Return the Site object for the wiki on which this Page resides."""
@@ -560,7 +508,7 @@
 
     def getReferences(self, follow_redirects=True, withTemplateInclusion=True,
                       onlyTemplateInclusion=False, redirectsOnly=False,
-                      namespaces=None, step=None, total=None):
+                      namespaces=None, step=None, total=None, content=False):
         """Return an iterator all pages that refer to or embed the page.
 
         If you need a full list of referring pages, use
@@ -576,6 +524,8 @@
         @param namespaces: only iterate pages in these namespaces
         @param step: limit each API call to this number of pages
         @param total: iterate no more than this number of pages in total
+        @param content: if True, retrieve the content of the current version
+            of each referring page (default False)
 
         """
         # N.B.: this method intentionally overlaps with backlinks() and
@@ -590,10 +540,10 @@
                                withTemplateInclusion=withTemplateInclusion,
                                onlyTemplateInclusion=onlyTemplateInclusion,
                                namespaces=namespaces, step=step,
-                               total=total)
+                               total=total, content=content)
 
     def backlinks(self, followRedirects=True, filterRedirects=None,
-                  namespaces=None, step=None, total=None):
+                  namespaces=None, step=None, total=None, content=False):
         """Return an iterator for pages that link to this page.
 
         @param followRedirects: if True, also iterate pages that link to a
@@ -603,16 +553,18 @@
         @param namespaces: only iterate pages in these namespaces
         @param step: limit each API call to this number of pages
         @param total: iterate no more than this number of pages in total
+        @param content: if True, retrieve the content of the current version
+            of each referring page (default False)
 
         """
         return self.site.pagebacklinks(self,
                                          followRedirects=followRedirects,
                                          filterRedirects=filterRedirects,
                                          namespaces=namespaces, step=step,
-                                         total=total)
+                                         total=total, content=content)
 
     def embeddedin(self, filter_redirects=None, namespaces=None, step=None,
-                   total=None):
+                   total=None, content=False):
         """Return an iterator for pages that embed this page as a template.
 
         @param filterRedirects: if True, only iterate redirects; if False,
@@ -620,12 +572,15 @@
         @param namespaces: only iterate pages in these namespaces
         @param step: limit each API call to this number of pages
         @param total: iterate no more than this number of pages in total
+        @param content: if True, retrieve the content of the current version
+            of each embedding page (default False)
 
         """
         return self.site.page_embeddedin(self,
                                            filterRedirects=filter_redirects,
                                            namespaces=namespaces,
-                                           step=step, total=total)
+                                           step=step, total=total,
+                                           content=content)
 
     def canBeEdited(self):
         """Return bool indicating whether this page can be edited.
@@ -790,7 +745,8 @@
         """
         return self.site.watchpage(self, unwatch)
 
-    def linkedPages(self, namespaces=None, step=None, total=None):
+    def linkedPages(self, namespaces=None, step=None, total=None,
+                    content=False):
         """Iterate Pages that this Page links to.
 
         Only returns pages from "normal" internal links. Image and category
@@ -801,11 +757,13 @@
         @param namespaces: only iterate links in these namespaces
         @param step: limit each API call to this number of pages
         @param total: iterate no more than this number of pages in total
+        @param content: if True, retrieve the content of the current version
+            of each linked page (default False)
         @return: a generator that yields Page objects.
 
         """
         return self.site.pagelinks(self, namespaces=namespaces, step=step,
-                                     total=total)
+                                   total=total, content=content)
 
     def interwiki(self, expand=True):
         """Iterate interwiki links in the page text, excluding language links.
@@ -866,21 +824,24 @@
         # iterated upon.
         return self.site.pagelanglinks(self, step=step, total=total)
 
-    def templates(self):
+    def templates(self, content=False):
         """Return a list of Page objects for templates used on this Page.
 
         Template parameters are ignored.  This method only returns embedded
         templates, not template pages that happen to be referenced through
         a normal link.
 
+        @param content: if True, retrieve the content of the current version
+            of each template (default False)
+
         """
         # Data might have been preloaded
         if not hasattr(self, '_templates'):
-            self._templates = list(self.itertemplates())
+            self._templates = list(self.itertemplates(content=content))
 
         return self._templates
 
-    def itertemplates(self, step=None, total=None):
+    def itertemplates(self, step=None, total=None, content=False):
         """Iterate Page objects for templates used on this Page.
 
         Template parameters are ignored.  This method only returns embedded
@@ -889,23 +850,29 @@
 
         @param step: limit each API call to this number of pages
         @param total: iterate no more than this number of pages in total
+        @param content: if True, retrieve the content of the current version
+            of each template (default False)
 
         """
         if hasattr(self, '_templates'):
             return iter(self._templates)
-        return self.site.pagetemplates(self, step=step, total=total)
+        return self.site.pagetemplates(self, step=step, total=total,
+                                       content=content)
 
     @deprecate_arg("followRedirects", None)
     @deprecate_arg("loose", None)
-    def imagelinks(self, step=None, total=None):
+    def imagelinks(self, step=None, total=None, content=False):
         """Iterate ImagePage objects for images displayed on this Page.
 
         @param step: limit each API call to this number of pages
         @param total: iterate no more than this number of pages in total
+        @param content: if True, retrieve the content of the current version
+            of each image description page (default False)
         @return: a generator that yields ImagePage objects.
 
         """
-        return self.site.pageimages(self, step=step, total=total)
+        return self.site.pageimages(self, step=step, total=total,
+                                    content=content)
 
     def templatesWithParams(self):
         """Iterate templates used on this Page.
@@ -948,17 +915,20 @@
 
     @deprecate_arg("nofollow_redirects", None)
     @deprecate_arg("get_redirect", None)
-    def categories(self, withSortKey=False, step=None, total=None):
+    def categories(self, withSortKey=False, step=None, total=None,
+                   content=False):
         """Iterate categories that the article is in.
 
         @param withSortKey: if True, include the sort key in each Category.
         @param step: limit each API call to this number of pages
         @param total: iterate no more than this number of pages in total
+        @param content: if True, retrieve the content of the current version
+            of each category description page (default False)
         @return: a generator that yields Category objects.
 
         """
         return self.site.pagecategories(self, withSortKey=withSortKey,
-                                          step=step, total=total)
+                                        step=step, total=total, 
content=content)
 
     def extlinks(self, step=None, total=None):
         """Iterate all external URLs (not interwiki links) from this page.
@@ -992,8 +962,7 @@
         Return value is a list of tuples, where each tuple represents one
         edit and is built of revision id, edit date/time, user name, and
         edit summary. Starts with the most current revision, unless
-        reverseOrder is True. Defaults to getting the first revCount edits,
-        unless getAll is True.
+        reverseOrder is True.
 
         @param step: limit each API call to this number of revisions
         @param total: iterate no more than this number of revisions in total
@@ -1490,14 +1459,17 @@
                          % (datetime, username, resolution, size, comment))
         return u'{| border="1"\n! date/time || username || resolution || size 
|| edit summary\n|----\n' + u'\n|----\n'.join(lines) + '\n|}'
 
-    def usingPages(self, step=None, total=None):
+    def usingPages(self, step=None, total=None, content=False):
         """Yield Pages on which the image is displayed.
 
         @param step: limit each API call to this number of pages
         @param total: iterate no more than this number of pages in total
+        @param content: if True, load the current content of each iterated page
+            (default False)
 
         """
-        return self.site.imageusage(self, step=step, total=total)
+        return self.site.imageusage(self,
+                         step=step, total=total, content=content)
 
 
 class Category(Page):
@@ -1537,7 +1509,8 @@
 
     @deprecate_arg("startFrom", None)
     @deprecate_arg("cacheResults", None)
-    def subcategories(self, recurse=False, step=None, total=None):
+    def subcategories(self, recurse=False, step=None, total=None,
+                      content=False):
         """Iterate all subcategories of the current category.
 
         @param recurse: if not False or 0, also iterate subcategories of
@@ -1548,14 +1521,17 @@
         @param step: limit each API call to this number of categories
         @param total: iterate no more than this number of
             subcategories in total (at all levels)
+        @param content: if True, retrieve the content of the current version
+            of each category description page (default False)
 
         """
         if not isinstance(recurse, bool) and recurse:
             recurse = recurse - 1
         if not hasattr(self, "_subcats"):
             self._subcats = []
-            for member in self.site.categorymembers(self, namespaces=[14],
-                                                      step=step, total=total):
+            for member in self.site.categorymembers(self,
+                                    namespaces=[14], step=step, total=total,
+                                    content=content):
                 subcat = Category(self.site, member.title())
                 self._subcats.append(subcat)
                 yield subcat
@@ -1564,8 +1540,9 @@
                     if not total:
                         return
                 if recurse:
-                    for item in subcat.subcategories(recurse,
-                                                     step=step, total=total):
+                    for item in subcat.subcategories(
+                                       recurse, step=step, total=total,
+                                       content=content):
                         yield item
                         if total is not None:
                             total -= 1
@@ -1579,8 +1556,9 @@
                     if not total:
                         return
                 if recurse:
-                    for item in subcat.subcategories(recurse,
-                                                     step=step, total=total):
+                    for item in subcat.subcategories(
+                                       recurse, step=step, total=total,
+                                       content=content):
                         yield item
                         if total is not None:
                             total -= 1
@@ -1588,7 +1566,7 @@
                                 return
 
     @deprecate_arg("startFrom", None)
-    def articles(self, recurse=False, step=None, total=None):
+    def articles(self, recurse=False, step=None, total=None, content=False):
         """
         Yields all articles in the current category.
 
@@ -1600,13 +1578,16 @@
         @param step: limit each API call to this number of pages
         @param total: iterate no more than this number of pages in
             total (at all levels)
+        @param content: if True, retrieve the content of the current version
+            of each page (default False)
 
         """
         namespaces = [x for x in self.site.namespaces()
                       if x>=0 and x!=14]
         for member in self.site.categorymembers(self,
-                                                  namespaces=namespaces,
-                                                  step=step, total=total):
+                                                namespaces=namespaces,
+                                                step=step, total=total,
+                                                content=content):
             yield member
             if total is not None:
                 total -= 1
@@ -1616,18 +1597,22 @@
             if not isinstance(recurse, bool) and recurse:
                 recurse = recurse - 1
             for subcat in self.subcategories(step=step):
-                for article in subcat.articles(recurse, step=step, 
total=total):
+                for article in subcat.articles(
+                                      recurse, step=step, total=total,
+                                      content=content):
                     yield article
                     if total is not None:
                         total -= 1
                         if not total:
                             return
 
-    def members(self, recurse=False, namespaces=None, step=None, total=None):
+    def members(self, recurse=False, namespaces=None, step=None, total=None,
+                content=False):
         """Yield all category contents (subcats, pages, and files)."""
 
-        for member in self.site.categorymembers(self, namespaces,
-                                                  step=step, total=total):
+        for member in self.site.categorymembers(self,
+                                namespaces, step=step, total=total,
+                                content=content):
             yield member
             if total is not None:
                 total -= 1
@@ -1637,8 +1622,9 @@
             if not isinstance(recurse, bool) and recurse:
                 recurse = recurse - 1
             for subcat in self.subcategories(step=step):
-                for article in subcat.members(recurse, namespaces, step=step,
-                                              total=total):
+                for article in subcat.members(
+                                      recurse, namespaces, step=step,
+                                      total=total, content=content):
                     yield article
                     if total is not None:
                         total -= 1

Modified: branches/rewrite/pywikibot/pagegenerators.py
===================================================================
--- branches/rewrite/pywikibot/pagegenerators.py        2010-06-24 19:00:35 UTC 
(rev 8323)
+++ branches/rewrite/pywikibot/pagegenerators.py        2010-06-24 19:09:47 UTC 
(rev 8324)
@@ -199,7 +199,7 @@
             gensList = CombinedPageGenerator(self.gens)
         return DuplicateFilterPageGenerator(gensList)
 
-    def getCategoryGen(self, arg, length, recurse = False):
+    def getCategoryGen(self, arg, length, recurse=False, content=False):
         if len(arg) == length:
             categoryname = pywikibot.input(u'Please enter the category name:')
         else:
@@ -215,9 +215,10 @@
                                                 defaultNamespace=14))
         # Link constructor automatically prepends localized namespace
         # if not included in user's input
-        return CategorizedPageGenerator(cat, start=startfrom, recurse=recurse)
+        return CategorizedPageGenerator(cat,
+               start=startfrom, recurse=recurse, content=content)
 
-    def setSubCategoriesGen(self, arg, length, recurse=False):
+    def setSubCategoriesGen(self, arg, length, recurse=False, content=False):
         if len(arg) == length:
             categoryname = pywikibot.input(u'Please enter the category name:')
         else:
@@ -232,7 +233,8 @@
 
         cat = pywikibot.Category(pywikibot.Link(categoryname,
                                                 defaultNamespace=14))
-        return SubCategoriesPageGenerator(cat, start=startfrom, 
recurse=recurse)
+        return SubCategoriesPageGenerator(cat,
+               start=startfrom, recurse=recurse, content=content)
 
     def handleArg(self, arg):
         """Parse one argument at a time.
@@ -447,7 +449,7 @@
 
 
 def AllpagesPageGenerator(start='!', namespace=0, includeredirects=True,
-                          site=None, step=None, total=None):
+                          site=None, step=None, total=None, content=False):
     """
     Iterate Page objects for all titles in a single namespace.
 
@@ -456,6 +458,7 @@
 
     @param step: Maximum number of pages to retrieve per API query
     @param total: Maxmum number of pages to retrieve in total
+    @param content: If True, load current version of each page (default False)
 
     """
     if site is None:
@@ -468,11 +471,12 @@
     else:
         filterredir = False
     return site.allpages(start=start, namespace=namespace,
-                         filterredir=filterredir, step=step, total=total)
+                         filterredir=filterredir, step=step, total=total,
+                         content=content)
 
 
 def PrefixingPageGenerator(prefix, namespace=None, includeredirects=True,
-                           site=None, step=None, total=None):
+                           site=None, step=None, total=None, content=False):
     if site is None:
         site = pywikibot.Site()
     prefixlink = pywikibot.Link(prefix, site)
@@ -487,8 +491,10 @@
     else:
         filterredir = False
     return site.allpages(prefix=title, namespace=namespace,
-                         filterredir=filterredir, step=step, total=total)
+                         filterredir=filterredir, step=step, total=total,
+                         content=content)
 
+
 @deprecate_arg("number", "total")
 @deprecate_arg("namespace", "namespaces")
 @deprecate_arg("repeat", None)
@@ -507,6 +513,7 @@
                                    step=step, total=total):
         yield pywikibot.Page(pywikibot.Link(item["title"], site))
 
+
 def RecentChangesPageGenerator(start=None, end=None, reverse=False,
                                namespaces=None, pagelist=None,
                                changetype=None, showMinor=None,
@@ -547,14 +554,15 @@
                                    step=step, total=total):
         yield pywikibot.Page(pywikibot.Link(item["title"], site))
 
-def FileLinksGenerator(referredImagePage, step=None, total=None):
-    return referredImagePage.usingPages(step=step, total=total)
 
+def FileLinksGenerator(referredImagePage, step=None, total=None, 
content=False):
+    return referredImagePage.usingPages(step=step, total=total, 
content=content)
 
-def ImagesPageGenerator(pageWithImages, step=None, total=None):
-    return pageWithImages.imagelinks(step=step, total=total)
 
+def ImagesPageGenerator(pageWithImages, step=None, total=None, content=False):
+    return pageWithImages.imagelinks(step=step, total=total, content=content)
 
+
 def InterwikiPageGenerator(page):
     """Iterator over all interwiki (non-language) links on a page."""
     for link in page.interwiki():
@@ -570,54 +578,66 @@
 def ReferringPageGenerator(referredPage, followRedirects=False,
                            withTemplateInclusion=True,
                            onlyTemplateInclusion=False,
-                           step=None, total=None):
+                           step=None, total=None, content=False):
     '''Yields all pages referring to a specific page.'''
     return referredPage.getReferences(
                 follow_redirects=followRedirects,
                 withTemplateInclusion=withTemplateInclusion,
                 onlyTemplateInclusion=onlyTemplateInclusion,
-                step=step, total=total)
+                step=step, total=total, content=content)
 
 
 def CategorizedPageGenerator(category, recurse=False, start=None,
-                             step=None, total=None):
-    '''
-    Yields all pages in a specific category.
+                             step=None, total=None, content=False):
+    """Yield all pages in a specific category.
 
     If recurse is True, pages in subcategories are included as well; if
     recurse is an int, only subcategories to that depth will be included
     (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
     not go any further).
+
     If start is a string value, only pages whose sortkey comes after start
     alphabetically are included.
-    '''
+
+    If content is True (default is False), the current page text of each
+    retrieved page will be downloaded.
+
+    """
     # TODO: page generator could be modified to use cmstartsortkey ...
-    for a in category.articles(recurse=recurse, step=step, total=total):
+    for a in category.articles(
+                      recurse=recurse, step=step, total=total, content=content):
         if start is None or a.title(withNamespace=False) >= start:
             yield a
 
+
 def SubCategoriesPageGenerator(category, recurse=False, start=None,
-                               step=None, total=None):
-    '''
-    Yields all subcategories in a specific category.
+                               step=None, total=None, content=False):
+    """Yield all subcategories in a specific category.
 
     If recurse is True, pages in subcategories are included as well; if
     recurse is an int, only subcategories to that depth will be included
     (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
     not go any further).
+
     If start is a string value, only categories whose sortkey comes after
     start alphabetically are included.
-    '''
+
+    If content is True (default is False), the current page text of each
+    category description page will be downloaded.
+
+    """
     # TODO: page generator could be modified to use cmstartsortkey ...
-    for s in category.subcategories(recurse=recurse, step=step, total=total):
+    for s in category.subcategories(
+                      recurse=recurse, step=step, total=total, content=content):
         if start is None or s.title(withNamespace=False) >= start:
             yield s
 
-def LinkedPageGenerator(linkingPage, step=None, total=None):
-    """Yields all pages linked from a specific page."""
-    return linkingPage.linkedPages(step=step, total=total)
 
+def LinkedPageGenerator(linkingPage, step=None, total=None, content=False):
+    """Yield all pages linked from a specific page."""
+    return linkingPage.linkedPages(step=step, total=total, content=content)
 
+
 def TextfilePageGenerator(filename=None, site=None):
     """Iterate pages from a list in a text file.
 
@@ -643,6 +663,7 @@
         yield pywikibot.Page(pywikibot.Link(linkmatch.groups("title"), site))
     f.close()
 
+
 def PagesFromTitlesGenerator(iterable, site=None):
     """Generate pages from the titles (unicode strings) yielded by iterable."""
     if site is None:
@@ -744,11 +765,12 @@
 
 
 def PageWithTalkPageGenerator(generator):
+    """Yield pages and associated talk pages from another generator.
+
+    Only yields talk pages if the original generator yields a non-talk page,
+    and does not check if the talk page in fact exists.
+
     """
-    Wraps around another generator. Yields the same pages, but for non-talk
-    pages, it also includes associated talk pages.
-    This generator does not check if the talk page in fact exists.
-    """
     for page in generator:
         yield page
         if not page.isTalkPage():

Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py  2010-06-24 19:00:35 UTC (rev 8323)
+++ branches/rewrite/pywikibot/site.py  2010-06-24 19:09:47 UTC (rev 8324)
@@ -143,8 +143,12 @@
 
     @property
     def code(self):
-        """The identifying code for this Site."""
+        """The identifying code for this Site.
 
+        By convention, this is usually an ISO language code, but it does
+        not have to be.
+
+        """
         return self.__code
 
     @property
@@ -194,7 +198,7 @@
                                  % (self.__class__.__name__, attr)  )
 
     def sitename(self):
-        """Return string representing this Site's name and language."""
+        """Return string representing this Site's name and code."""
 
         return self.family.name+':'+self.code
 
@@ -270,10 +274,12 @@
 
     def pagenamecodes(self, default=True):
         """Return list of localized PAGENAME tags for the site."""
+
         return [u"PAGENAME"]
 
     def pagename2codes(self, default=True):
         """Return list of localized PAGENAMEE tags for the site."""
+
         return [u"PAGENAMEE"]
 
     def lock_page(self, page, block=True):
@@ -330,14 +336,14 @@
         """
         return pywikibot.Link(title, self).astext(othersite)
 
-    def isInterwikiLink(self, s):
-        """Return True if s is in the form of an interwiki link.
+    def isInterwikiLink(self, text):
+        """Return True if text is in the form of an interwiki link.
 
-        If a link object constructed using "s" as the link text parses as
+        If a link object constructed using "text" as the link text parses as
         belonging to a different site, this method returns True.
 
         """
-        linkfam, linkcode = pywikibot.Link(s, self).parse_site()
+        linkfam, linkcode = pywikibot.Link(text, self).parse_site()
         return (linkfam != self.family.name or linkcode != self.code)
 
     def redirectRegex(self, pattern=None):
@@ -413,6 +419,7 @@
 
     def nice_get_address(self, title):
         """Return shorter URL path to retrieve page titled 'title'."""
+
         return self.family.nice_get_address(self.lang, title)
 
     # deprecated methods for backwards-compatibility
@@ -420,6 +427,7 @@
     @deprecated("family attribute")
     def fam(self):
         """Return Family object for this Site."""
+
         return self.family
 
     @deprecated("urllib.urlencode()")
@@ -630,14 +638,6 @@
         self._loginstatus = -3
         return
 
-    # ANYTHING BELOW THIS POINT IS NOT YET IMPLEMENTED IN __init__()
-        # Calculating valid languages took quite long, so we calculate it once
-        # in initialization instead of each time it is used.
-        self._validlanguages = []
-        for language in self.languages():
-            if not language[:1].upper() + language[1:] in self.namespaces():
-                self._validlanguages.append(language)
-
     def _generator(self, gen_class, type_arg=None, namespaces=None,
                    step=None, total=None, **args):
         """Convenience method that returns an API generator.
@@ -1048,16 +1048,15 @@
         @param history: if true, return the image's version history
 
         """
-        title = page.title(withSection=False)
+        args = {"title": page.title(withSection=False)}
+        if history:
+            args["iilimit"] = "max"
         query = self._generator(api.PropertyGenerator,
                                 type_arg="imageinfo",
-                                titles=title.encode(self.encoding()),
                                 iiprop=["timestamp", "user", "comment",
                                         "url", "size", "sha1", "mime",
-                                        "metadata", "archivename"]
-                               )
-        if history:
-            query.request["iilimit"] = "max"
+                                        "metadata", "archivename"],
+                                **args)
         for pageitem in query:
             if pageitem['title'] != title:
                 raise Error(
@@ -1220,7 +1219,7 @@
     # following group of methods map more-or-less directly to API queries
 
     def pagebacklinks(self, page, followRedirects=False, filterRedirects=None,
-                      namespaces=None, step=None, total=None):
+                      namespaces=None, step=None, total=None, content=False):
         """Iterate all pages that link to the given page.
 
         @param page: The Page to get links to.
@@ -1233,15 +1232,18 @@
             in this list.
         @param step: Limit on number of pages to retrieve per API query.
         @param total: Maximum number of pages to retrieve in total.
+        @param content: if True, load the current content of each iterated page
+            (default False)
 
         """
         bltitle = page.title(withSection=False).encode(self.encoding())
+        blargs = {"gbltitle": bltitle}
+        if filterRedirects is not None:
+            blargs["gblfilterredir"] = filterRedirects and "redirects" \
+                                                        or "nonredirects"
         blgen = self._generator(api.PageGenerator, type_arg="backlinks",
-                                gbltitle=bltitle, namespaces=namespaces,
-                                step=step, total=total)
-        if filterRedirects is not None:
-            blgen.request["gblfilterredir"] = filterRedirects and "redirects"\
-                                                              or "nonredirects"
+                                namespaces=namespaces, step=step, total=total,
+                                g_content=content, **blargs)
         if followRedirects:
             # bug: see http://bugzilla.wikimedia.org/show_bug.cgi?id=7304
             # links identified by MediaWiki as redirects may not really be,
@@ -1265,13 +1267,14 @@
                     genlist[redir.title()] = self.pagebacklinks(
                                                 redir, followRedirects=True,
                                                 filterRedirects=filterRedirects,
-                                                namespaces=namespaces)
+                                                namespaces=namespaces,
+                                                content=content)
             import itertools
             return itertools.chain(*genlist.values())
         return blgen
 
     def page_embeddedin(self, page, filterRedirects=None, namespaces=None,
-                        step=None, total=None):
+                        step=None, total=None, content=False):
         """Iterate all pages that embedded the given page as a template.
 
         @param page: The Page to get inclusions for.
@@ -1280,93 +1283,121 @@
             None, return both (no filtering).
         @param namespaces: If present, only return links from the namespaces
             in this list.
+        @param content: if True, load the current content of each iterated page
+            (default False)
 
         """
-        eititle = page.title(withSection=False).encode(self.encoding())
+        eiargs = {"geititle":
+                page.title(withSection=False).encode(self.encoding())}
+        if filterRedirects is not None:
+            eiargs["geifilterredir"] = filterRedirects and "redirects"\
+                                                        or "nonredirects"
         eigen = self._generator(api.PageGenerator, type_arg="embeddedin",
-                                geititle=eititle, namespaces=namespaces,
-                                step=step, total=total)
-        if filterRedirects is not None:
-            eigen.request["geifilterredir"] = filterRedirects and "redirects"\
-                                                              or "nonredirects"
+                                namespaces=namespaces, step=step, total=total,
+                                g_content=content, **eiargs)
         return eigen
 
     def pagereferences(self, page, followRedirects=False, filterRedirects=None,
                        withTemplateInclusion=True, onlyTemplateInclusion=False,
-                       namespaces=None, step=None, total=None):
+                       namespaces=None, step=None, total=None, content=False):
         """Convenience method combining pagebacklinks and page_embeddedin."""
 
         if onlyTemplateInclusion:
             return self.page_embeddedin(page, namespaces=namespaces,
                                         filterRedirects=filterRedirects,
-                                        step=step, total=total)
+                                        step=step, total=total, content=content)
         if not withTemplateInclusion:
             return self.pagebacklinks(page, followRedirects=followRedirects,
-                                      filterRedirects=filterRedirects,
-                                      namespaces=namespaces,
-                                      step=step, total=total)
+                                      filterRedirects=filterRedirects,
+                                      namespaces=namespaces,
+                                      step=step, total=total, content=content)
         import itertools
         return itertools.islice(
                     itertools.chain(
                         self.pagebacklinks(
                             page, followRedirects, filterRedirects,
-                            namespaces=namespaces, step=step),
+                            namespaces=namespaces, step=step, content=content),
                         self.page_embeddedin(
                             page, filterRedirects, namespaces=namespaces,
-                            step=step)
+                            step=step, content=content)
                         ),
                     total)
 
     def pagelinks(self, page, namespaces=None, follow_redirects=False,
-                  step=None, total=None):
+                  step=None, total=None, content=False):
         """Iterate internal wikilinks contained (or transcluded) on page.
 
         @param namespaces: Only iterate pages in these namespaces (default: all)
         @type namespaces: list of ints
         @param follow_redirects: if True, yields the target of any redirects,
             rather than the redirect page
+        @param content: if True, load the current content of each iterated page
+            (default False)
 
         """
-        plgen = self._generator(api.PageGenerator, type_arg="links",
-                                namespaces=namespaces, step=step, total=total)
+        plargs = {}
         if hasattr(page, "_pageid"):
-            plgen.request['pageids'] = str(page._pageid)
+            plargs['pageids'] = str(page._pageid)
         else:
             pltitle = page.title(withSection=False).encode(self.encoding())
-            plgen.request['titles'] = pltitle
+            plargs['titles'] = pltitle
         if follow_redirects:
-            plgen.request['redirects'] = ''
+            plargs['redirects'] = ''
+        plgen = self._generator(api.PageGenerator, type_arg="links",
+                                namespaces=namespaces, step=step, total=total,
+                                g_content=content, **plargs)
         return plgen
 
     @deprecate_arg("withSortKey", None) # Sortkey doesn't work with generator
-    def pagecategories(self, page, step=None, total=None):
-        """Iterate categories to which page belongs."""
+    def pagecategories(self, page, step=None, total=None, content=False):
+        """Iterate categories to which page belongs.
 
-        clgen = self._generator(api.CategoryPageGenerator,
-                                type_arg="categories", step=step, total=total)
+        @param content: if True, load the current content of each iterated page
+            (default False); note that this means the contents of the
+            category description page, not the pages contained in the category
+
+        """
+        clargs = {}
         if hasattr(page, "_pageid"):
-            clgen.request['pageids'] = str(page._pageid)
+            clargs['pageids'] = str(page._pageid)
         else:
-            cltitle = page.title(withSection=False).encode(self.encoding())
-            clgen.request['titles'] = cltitle
+            clargs['titles'] = page.title(withSection=False
+                                         ).encode(self.encoding())
+        clgen = self._generator(api.CategoryPageGenerator,
+                                type_arg="categories", step=step, total=total,
+                                g_content=content, **clargs)
         return clgen
 
-    def pageimages(self, page, step=None, total=None):
-        """Iterate images used (not just linked) on the page."""
+    def pageimages(self, page, step=None, total=None, content=False):
+        """Iterate images used (not just linked) on the page.
 
+        @param content: if True, load the current content of each iterated page
+            (default False); note that this means the content of the image
+            description page, not the image itself
+
+        """
+        imtitle = page.title(withSection=False).encode(self.encoding())
         imgen = self._generator(api.ImagePageGenerator, type_arg="images",
-                                titles=imtitle, step=step, total=total)
+                                titles=imtitle, step=step, total=total,
+                                g_content=content)
         return imgen
 
-    def pagetemplates(self, page, namespaces=None, step=None, total=None):
-        """Iterate templates transcluded (not just linked) on the page."""
+    def pagetemplates(self, page, namespaces=None, step=None, total=None,
+                      content=False):
+        """Iterate templates transcluded (not just linked) on the page.
 
+        @param content: if True, load the current content of each iterated page
+            (default False)
+
+        """
         tltitle = page.title(withSection=False).encode(self.encoding())
         tlgen = self._generator(api.PageGenerator, type_arg="templates",
                                 titles=tltitle, namespaces=namespaces,
-                                step=step, total=total)
+                                step=step, total=total, g_content=content)
         return tlgen
 
-    def categorymembers(self, category, namespaces=None, step=None, total=None):
+    def categorymembers(self, category, namespaces=None, step=None, total=None,
+                        content=False):
         """Iterate members of specified category.
 
         @param category: The Category to iterate.
@@ -1376,6 +1407,8 @@
             however, that the iterated values are always Page objects, even
             if in the Category or Image namespace.
         @type namespaces: list of ints
+        @param content: if True, load the current content of each iterated page
+            (default False)
 
         """
         if category.namespace() != 14:
@@ -1387,9 +1420,10 @@
                                 type_arg="categorymembers",
                                 gcmtitle=cmtitle,
                                 gcmprop="ids|title|sortkey",
-#                                namespaces=namespaces,
+#                                namespaces=namespaces, # see note below
                                 step=step,
-                                total=total)
+                                total=total,
+                                g_content=content)
 #       workaround for https://bugzilla.wikimedia.org/show_bug.cgi?id=19640:
         if namespaces:
             if not isinstance(namespaces, list):
@@ -1587,7 +1621,7 @@
     def allpages(self, start="!", prefix="", namespace=0, filterredir=None,
                  filterlanglinks=None, minsize=None, maxsize=None,
                  protect_type=None, protect_level=None, reverse=False,
-                 includeredirects=None, step=None, total=None):
+                 includeredirects=None, step=None, total=None, content=False):
         """Iterate pages in a single namespace.
 
         Note: parameters includeRedirects and throttle are deprecated and
@@ -1614,6 +1648,8 @@
         @param reverse: if True, iterate in reverse Unicode lexigraphic
             order (default: iterate in forward order)
         @param includeredirects: DEPRECATED, use filterredirs instead
+        @param content: if True, load the current content of each iterated page
+            (default False)
 
         """
         if not isinstance(namespace, int):
@@ -1632,7 +1668,8 @@
 
         apgen = self._generator(api.PageGenerator, type_arg="allpages",
                                 gapnamespace=str(namespace),
-                                gapfrom=start, step=step, total=total)
+                                gapfrom=start, step=step, total=total,
+                                g_content=content)
         if prefix:
             apgen.request["gapprefix"] = prefix
         if filterredir is not None:
@@ -1705,7 +1742,7 @@
             yield p
 
     def allcategories(self, start="!", prefix="", step=None, total=None,
-                      reverse=False):
+                      reverse=False, content=False):
         """Iterate categories used (which need not have a Category page).
 
         Iterator yields Category objects. Note that, in practice, links that
@@ -1716,11 +1753,14 @@
         @param prefix: Only yield categories starting with this string.
         @param reverse: if True, iterate in reverse Unicode lexigraphic
             order (default: iterate in forward order)
+        @param content: if True, load the current content of each iterated page
+            (default False); note that this means the contents of the category
+            description page, not the pages that are members of the category
 
         """
         acgen = self._generator(api.CategoryPageGenerator,
                                 type_arg="allcategories", gacfrom=start,
-                                step=step, total=total)
+                                step=step, total=total, g_content=content)
         if prefix:
             acgen.request["gacprefix"] = prefix
         if reverse:
@@ -1763,7 +1803,7 @@
 
     def allimages(self, start="!", prefix="", minsize=None, maxsize=None,
                   reverse=False, sha1=None, sha1base36=None, step=None,
-                  total=None):
+                  total=None, content=False):
         """Iterate all images, ordered by image title.
 
         Yields ImagePages, but these pages need not exist on the wiki.
@@ -1776,11 +1816,14 @@
         @param sha1: only iterate image (it is theoretically possible there
             could be more than one) with this sha1 hash
         @param sha1base36: same as sha1 but in base 36
+        @param content: if True, load the current content of each iterated page
+            (default False); note that this means the content of the image
+            description page, not the image itself
 
         """
         aigen = self._generator(api.ImagePageGenerator,
                                 type_arg="allimages", gaifrom=start,
-                                step=step, total=total)
+                                step=step, total=total, g_content=content)
         if prefix:
             aigen.request["gaiprefix"] = prefix
         if isinstance(minsize, int):
@@ -1837,7 +1880,7 @@
         return bkgen
 
     def exturlusage(self, url, protocol="http", namespaces=None,
-                    step=None, total=None):
+                    step=None, total=None, content=False):
         """Iterate Pages that contain links to the given URL.
 
         @param url: The URL to search for (without the protocol prefix);
@@ -1849,11 +1892,11 @@
         eugen = self._generator(api.PageGenerator, type_arg="exturlusage",
                                 geuquery=url, geuprotocol=protocol,
                                 namespaces=namespaces, step=step,
-                                total=total)
+                                total=total, g_content=content)
         return eugen
 
     def imageusage(self, image, namespaces=None, filterredir=None,
-                   step=None, total=None):
+                   step=None, total=None, content=False):
         """Iterate Pages that contain links to the given ImagePage.
 
         @param image: the image to search for (ImagePage need not exist on
@@ -1861,6 +1904,8 @@
         @type image: ImagePage
         @param filterredir: if True, only yield redirects; if False (and not
             None), only yield non-redirects (default: yield both)
+        @param content: if True, load the current content of each iterated page
+            (default False)
 
         """
         iuargs = dict(giutitle=image.title(withSection=False))
@@ -1869,7 +1914,7 @@
                                                      or "nonredirects")
         iugen = self._generator(api.PageGenerator, type_arg="imageusage",
                                 namespaces=namespaces, step=step,
-                                total=total, **iuargs)
+                                total=total, g_content=content, **iuargs)
         return iugen
 
     def logevents(self, logtype=None, user=None, page=None,
@@ -1984,7 +2029,7 @@
 
     @deprecate_arg("number", "limit")
     def search(self, searchstring, namespaces=None, where="text",
-               getredirects=False, step=None, total=None):
+               getredirects=False, step=None, total=None, content=False):
         """Iterate Pages that contain the searchstring.
 
         Note that this may include non-existing Pages if the wiki's database
@@ -1997,6 +2042,8 @@
         @param namespaces: search only in these namespaces (defaults to 0)
         @type namespaces: list of ints
         @param getredirects: if True, include redirects in results
+        @param content: if True, load the current content of each iterated page
+            (default False)
 
         """
         if not searchstring:
@@ -2009,7 +2056,7 @@
         srgen = self._generator(api.PageGenerator, type_arg="search",
                                 gsrsearch=searchstring, gsrwhat=where,
                                 namespaces=namespaces, step=step,
-                                total=total)
+                                total=total, g_content=content)
         if getredirects:
             srgen.request["gsrredirects"] = ""
         return srgen
@@ -2188,7 +2235,7 @@
         return usgen
 
     def randompages(self, step=None, total=1, namespaces=None,
-                    redirects=False):
+                    redirects=False, content=False):
         """Iterate a number of random pages.
 
         Pages are listed in a fixed sequence, only the starting point is
@@ -2198,10 +2245,13 @@
         @param namespaces: only iterate pages in these namespaces.
         @param redirects: if True, include only redirect pages in results
             (default: include only non-redirects)
+        @param content: if True, load the current content of each iterated page
+            (default False)
 
         """
         rngen = self._generator(api.PageGenerator, type_arg="random",
-                                namespaces=namespaces, step=step, total=total)
+                                namespaces=namespaces, step=step, total=total,
+                                g_content=content)
         if redirects:
             rngen.request["grnredirect"] = ""
         return rngen



_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn

Reply via email to