Revision: 6134
Author:   russblau
Date:     2008-12-08 21:41:15 +0000 (Mon, 08 Dec 2008)

Log Message:
-----------
Interwiki link scraping

Modified Paths:
--------------
    branches/rewrite/pywikibot/data/api.py
    branches/rewrite/pywikibot/page.py
    branches/rewrite/pywikibot/site.py
    branches/rewrite/pywikibot/tests/page_tests.py

Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py      2008-12-05 22:08:29 UTC (rev 6133)
+++ branches/rewrite/pywikibot/data/api.py      2008-12-08 21:41:15 UTC (rev 6134)
@@ -184,7 +184,7 @@
             self.site.throttle(write=write)
             uri = self.site.scriptpath() + "/api.php"
             try:
-                if write or action == "login":
+                if write or action in ("login", "expandtemplates", "parse"):
                     # add other actions that require POST requests above
                     rawdata = http.request(self.site, uri, method="POST",
                                 headers={'Content-Type':

Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py  2008-12-05 22:08:29 UTC (rev 6133)
+++ branches/rewrite/pywikibot/page.py  2008-12-08 21:41:15 UTC (rev 6134)
@@ -368,6 +368,15 @@
     text = property(_textgetter, _textsetter, _cleartext,
                     "The edited wikitext (unicode) of this Page")
 
+    def expand_text(self):
+        """Return the page text with all templates expanded."""
+        req = pywikibot.data.api.Request(action="expandtemplates",
+                                         text=self.text,
+                                         title=self.title(withSection=False),
+                                         site=self.site())
+        result = req.submit()
+        return result["expandtemplates"]["*"]
+
     def userName(self):
         """Return name or IP address of last user to edit page."""
         return self._revisions[self.latestRevision()].user
@@ -686,8 +695,8 @@
         """Iterate Pages that this Page links to.
 
         Only returns pages from "normal" internal links. Image and category
-        links are omitted unless prefixed with ":"; embedded templates are
-        omitted (but links within them are returned); all interwiki and
+        links are omitted unless prefixed with ":". Embedded templates are
+        omitted (but links within them are returned). All interwiki and
         external links are omitted.
 
         @return: a generator that yields Page objects.
@@ -695,20 +704,43 @@
         """
         return self.site().pagelinks(self)
 
-    def interwiki(self):
-        """Iterate interwiki links in the page text.
+    def interwiki(self, expand=True):
+        """Iterate interwiki links in the page text, excluding language links.
 
-        @return: a generator that yields Link objects.
+        @param expand: if True (default), include interwiki links found in
+            templates transcluded onto this page; if False, only iterate
+            interwiki links found in this page's own wikitext
+        @return: a generator that yields Link objects
 
         """
-        return self.site().pageinterwiki(self)
+        # This function does not exist in the API, so it has to be
+        # implemented by screen-scraping
+        Rlink = re.compile(r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]')
+        if expand:
+            text = self.expand_text()
+        else:
+            text = self.text
+        for linkmatch in Rlink.finditer(
+                            pywikibot.textlib.removeDisabledParts(text)):
+            linktitle = linkmatch.group("title")
+            link = Link(linktitle, self.site())
+            # only yield links that are to a different site and that
+            # are not language links
+            try:
+                if link.site != self.site():
+                    if linktitle.lstrip().startswith(":"):
+                        # initial ":" indicates not a language link
+                        yield link
+                    elif link.site.family != self.site().family:
+                        # link to a different family is not a language link
+                        yield link
+            except pywikibot.Error:
+                # ignore any links with invalid contents
+                continue
 
     def langlinks(self):
         """Iterate all interlanguage links on this page.
 
-        Note that the links yielded by this method will be a subset of
-        the results of self.interwiki().
-
         @return: a generator that yields Link objects.
 
         """
@@ -1729,7 +1761,25 @@
     def __str__(self):
         return self.astext()
 
+    def __cmp__(self, other):
+        """Test for equality and inequality of Link objects.
 
+        Link objects are "equal" if and only if they are on the same site
+        and have the same normalized title, including section if any.
+
+        Link objects are sortable by site, then namespace, then title.
+
+        """
+        if not isinstance(other, Link):
+            # especially, return -1 if other is None
+            return -1
+        if not self.site == other.site:
+            return cmp(self.site, other.site)
+        if self.namespace != other.namespace:
+            return cmp(self.namespace, other.namespace)
+        return cmp(self.title, other.title)
+
+
 # Utility functions for parsing page titles
 
 def html2unicode(text, ignore = []):
@@ -1794,7 +1844,7 @@
                 unicodeCodepoint=convertIllegalHtmlEntities[unicodeCodepoint]
             except KeyError:
                 pass
-            if unicodeCodepoint and unicodeCodepoint not in ignore and (WIDEBUILD or unicodeCodepoint < 65534):
+            if unicodeCodepoint and unicodeCodepoint not in ignore:
                 result += unichr(unicodeCodepoint)
             else:
                 # Leave the entity unchanged

Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py  2008-12-05 22:08:29 UTC (rev 6133)
+++ branches/rewrite/pywikibot/site.py  2008-12-08 21:41:15 UTC (rev 6134)
@@ -1357,7 +1357,7 @@
             api.update_page(page, pagedata)
 
     def pageinterwiki(self, page):
-        # TODO
+        # No such function in the API (this method isn't called anywhere)
         raise NotImplementedError
 
     def pagelanglinks(self, page):

Modified: branches/rewrite/pywikibot/tests/page_tests.py
===================================================================
--- branches/rewrite/pywikibot/tests/page_tests.py      2008-12-05 22:08:29 UTC (rev 6133)
+++ branches/rewrite/pywikibot/tests/page_tests.py      2008-12-08 21:41:15 UTC (rev 6134)
@@ -227,9 +227,12 @@
     def testLinks(self):
         for p in mainpage.linkedPages():
             self.assertTrue(isinstance(p, pywikibot.Page))
-## Not implemented:
-##        for p in mainpage.interwiki():
-##            self.assertTrue(isinstance(p, pywikibot.Link))
+        iw = list(mainpage.interwiki(expand=True))
+        for p in iw:
+            self.assertTrue(isinstance(p, pywikibot.Link))
+        for p2 in mainpage.interwiki(expand=False):
+            self.assertTrue(isinstance(p2, pywikibot.Link))
+            self.assertTrue(p2 in iw)            
         for p in mainpage.langlinks():
             self.assertTrue(isinstance(p, pywikibot.Link))
         for p in mainpage.imagelinks():



_______________________________________________
Pywikipedia-l mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-l

Reply via email to