wikipedia.py

alexsh Wed, 09 Dec 2009 05:59:06 -0800

Revision: 7751
Author:   alexsh
Date:     2009-12-09 13:58:53 +0000 (Wed, 09 Dec 2009)


Log Message:
-----------
* _GetAll(): add batch page export via the API (not enabled by default until
confirmed stable).
* Page().botMayEdit(): fix lowercase template names not being detected.

Modified Paths:
--------------
    trunk/pywikipedia/wikipedia.py

Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py      2009-12-09 13:51:54 UTC (rev 7750)
+++ trunk/pywikipedia/wikipedia.py      2009-12-09 13:58:53 UTC (rev 7751)
@@ -734,10 +734,10 @@
         params = {
             'action': 'query',
             'titles': self.title(),
-            'prop': ['revisions','info'],
-            'rvprop': 
['content','ids','flags','timestamp','user','comment','size'],
+            'prop': ['revisions', 'info'],
+            'rvprop': ['content', 'ids', 'flags', 'timestamp', 'user', 
'comment', 'size'],
             'rvlimit': 1,
-            'inprop': ['protection','talkid','subjectid','url','readable'],
+            'inprop': ['protection', 'talkid', 'subjectid', 'url', 'readable'],
             #'intoken': 'edit',
         }
         if oldid:
@@ -1128,9 +1128,9 @@
             return True
 
         for template in templates:
-            if template[0] == 'Nobots':
+            if template[0].lower() == 'nobots':
                 return False
-            elif template[0] == 'Bots':
+            elif template[0].lower() == 'bots':
                 if len(template[1]) == 0:
                     return True
                 else:
@@ -3761,54 +3761,89 @@
 
     def run(self):
         if self.pages:
-            while True:
-                try:
-                    data = self.getData()
-                except (socket.error, httplib.BadStatusLine, ServerError):
-                    # Print the traceback of the caught exception
-                    s = ''.join(traceback.format_exception(*sys.exc_info()))
-                    if not isinstance(s, unicode):
-                        s = s.decode('utf-8')
-                    output(u'%s\nDBG> got network error in _GetAll.run. ' \
-                            'Sleeping for %d seconds...' % (s, self.sleeptime))
-                    self.sleep()
-                else:
-                    if "<title>Wiki does not exist</title>" in data:
-                        raise NoSuchSite(u'Wiki %s does not exist yet' % 
self.site)
-                    elif "</mediawiki>" not in data[-20:]:
-                        # HTML error Page got thrown because of an internal
-                        # error when fetching a revision.
-                        output(u'Received incomplete XML data. ' \
-                            'Sleeping for %d seconds...' % self.sleeptime)
+            doAPI = None
+            #if config.use_api:
+            #    # API Implemented Check
+            #    try:
+            #        doAPI = True
+            #        d = self.site.api_address()
+            #        del d
+            #    except NotImplementedError:
+            #        doAPI = False
+            
+            if doAPI:
+                while True:
+                    try:
+                        data = self.getDataApi()
+                    except (socket.error, httplib.BadStatusLine, ServerError):
+                        # Print the traceback of the caught exception
+                        s = 
''.join(traceback.format_exception(*sys.exc_info()))
+                        if not isinstance(s, unicode):
+                            s = s.decode('utf-8')
+                        output(u'%s\nDBG> got network error in _GetAll.run. ' \
+                                'Sleeping for %d seconds...' % (s, 
self.sleeptime))
                         self.sleep()
-                    elif "<siteinfo>" not in data: # This probably means we 
got a 'temporary unaivalable'
-                        output(u'Got incorrect export page. ' \
-                            'Sleeping for %d seconds...' % self.sleeptime)
+                    else:
+                        if 'error' in data:
+                            raise RuntimeError(data['error'])
+                        else:
+                            break
+                
+                self.headerDoneApi(data['query'])
+                if 'normalized' in data['query']:
+                    self._norm = dict([(x['from'],x['to']) for x in 
data['query']['normalized']])
+                for vals in data['query']['pages'].values():
+                    self.oneDoneApi(vals)
+                
+            else:
+                while True:
+                    try:
+                        data = self.getData()
+                    except (socket.error, httplib.BadStatusLine, ServerError):
+                        # Print the traceback of the caught exception
+                        s = 
''.join(traceback.format_exception(*sys.exc_info()))
+                        if not isinstance(s, unicode):
+                            s = s.decode('utf-8')
+                        output(u'%s\nDBG> got network error in _GetAll.run. ' \
+                                'Sleeping for %d seconds...' % (s, 
self.sleeptime))
                         self.sleep()
                     else:
-                        break
-            R = re.compile(r"\s*<\?xml([^>]*)\?>(.*)",re.DOTALL)
-            m = R.match(data)
-            if m:
-                data = m.group(2)
-            handler = xmlreader.MediaWikiXmlHandler()
-            handler.setCallback(self.oneDone)
-            handler.setHeaderCallback(self.headerDone)
-            #f = open("backup.txt", "w")
-            #f.write(data)
-            #f.close()
-            try:
-                xml.sax.parseString(data, handler)
-            except (xml.sax._exceptions.SAXParseException, ValueError), err:
-                debugDump( 'SaxParseBug', self.site, err, data )
-                raise
-            except PageNotFound:
-                return
-            # All of the ones that have not been found apparently do not exist
+                        if "<title>Wiki does not exist</title>" in data:
+                            raise NoSuchSite(u'Wiki %s does not exist yet' % 
self.site)
+                        elif "</mediawiki>" not in data[-20:]:
+                            # HTML error Page got thrown because of an internal
+                            # error when fetching a revision.
+                            output(u'Received incomplete XML data. ' \
+                                'Sleeping for %d seconds...' % self.sleeptime)
+                            self.sleep()
+                        elif "<siteinfo>" not in data: # This probably means 
we got a 'temporary unaivalable'
+                            output(u'Got incorrect export page. ' \
+                                'Sleeping for %d seconds...' % self.sleeptime)
+                            self.sleep()
+                        else:
+                            break
+                R = re.compile(r"\s*<\?xml([^>]*)\?>(.*)",re.DOTALL)
+                m = R.match(data)
+                if m:
+                    data = m.group(2)
+                handler = xmlreader.MediaWikiXmlHandler()
+                handler.setCallback(self.oneDone)
+                handler.setHeaderCallback(self.headerDone)
+                #f = open("backup.txt", "w")
+                #f.write(data)
+                #f.close()
+                try:
+                    xml.sax.parseString(data, handler)
+                except (xml.sax._exceptions.SAXParseException, ValueError), 
err:
+                    debugDump( 'SaxParseBug', self.site, err, data )
+                    raise
+                except PageNotFound:
+                    return
+                # All of the ones that have not been found apparently do not 
exist
             for pl in self.pages:
                 if not hasattr(pl,'_contents') and not 
hasattr(pl,'_getexception'):
                     pl._getexception = NoPage
-
+    
     def oneDone(self, entry):
         title = entry.title
         username = entry.username
@@ -3841,7 +3876,7 @@
                         ## output(u"%s is a redirect" % page2.aslink())
                         redirectto = m.group(1)
                         if section and not "#" in redirectto:
-                            redirectto = redirectto+"#"+section
+                            redirectto += "#" + section
                         page2._getexception = IsRedirectPage
                         page2._redirarg = redirectto
 
@@ -3905,13 +3940,13 @@
                     else:
                         flag = u"is '%s', but should be '%s'" % (ns, nshdr)
                     output(u"WARNING: Outdated family file %s: 
namespace['%s'][%i] %s" % (self.site.family.name, lang, id, flag))
-#                    self.site.family.namespaces[id][lang] = nshdr
+                    #self.site.family.namespaces[id][lang] = nshdr
             else:
                 output(u"WARNING: Missing namespace in family file %s: 
namespace['%s'][%i] (it is set to '%s')" % (self.site.family.name, lang, id, 
nshdr))
         for id in self.site.family.namespaces:
             if self.site.family.isDefinedNSLanguage(id, lang) and id not in 
header.namespaces:
                 output(u"WARNING: Family file %s includes namespace['%s'][%i], 
but it should be removed (namespace doesn't exist in the site)" % 
(self.site.family.name, lang, id))
-
+    
     def getData(self):
         address = self.site.export_address()
         pagenames = [page.sectionFreeTitle() for page in self.pages]
@@ -3937,9 +3972,156 @@
         # The XML parser doesn't expect a Unicode string, but an encoded one,
         # so we'll encode it back.
         data = data.encode(self.site.encoding())
-#        get_throttle.setDelay(time.time() - now)
+        #get_throttle.setDelay(time.time() - now)
         return data
 
+    def oneDoneApi(self, data):
+        title = data['title']
+        try:
+            username = data['revisions'][0]['user']
+            ipedit = 'anon' in data['revisions'][0]
+            timestamp = data['revisions'][0]['timestamp']
+            text = data['revisions'][0]['*']
+            editRestriction = ''
+            moveRestriction = ''
+            for revs in data['protection']:
+                if revs['type'] == 'edit':
+                    editRestriction = revs['level']
+                elif revs['type'] == 'move':
+                    moveRestriction = revs['level']
+            revisionId = data['lastrevid']
+        except KeyError:
+            pass
+        
+        page = Page(self.site, title)
+        successful = False
+        for page2 in self.pages:
+            if hasattr(self, '_norm') and page2.sectionFreeTitle() in 
self._norm:
+                page2._title = self._norm[page2.sectionFreeTitle()]
+            
+            if page2.sectionFreeTitle() == page.sectionFreeTitle():
+                if 'missing' in data:
+                    page2._getexception = NoPage
+                    successful = True
+                    break
+                
+                if 'invalid' in data:
+                    page2._getexception = BadTitle
+                    successful = True
+                    break
+                
+                if not (hasattr(page2,'_contents') or 
hasattr(page2,'_getexception')) or self.force:
+                    page2.editRestriction = editRestriction
+                    page2.moveRestriction = moveRestriction
+                    if editRestriction == 'autoconfirmed':
+                        page2._editrestriction = True
+                    page2._permalink = revisionId
+                    page2._userName = username
+                    page2._ipedit = ipedit
+                    page2._revisionId = revisionId
+                    page2._editTime = timestamp
+                    section = page2.section()
+                    # Store the content
+                    page2._contents = text
+                    if 'redirect' in data:
+                        ## output(u"%s is a redirect" % page2.aslink())
+                        m = self.site.redirectRegex().match(text)
+                        redirectto = m.group(1)
+                        if section and not "#" in redirectto:
+                            redirectto += "#" + section
+                        page2._getexception = IsRedirectPage
+                        page2._redirarg = redirectto
+
+                    # This is used for checking deletion conflict.
+                    # Use the data loading time.
+                    page2._startTime = time.strftime('%Y%m%d%H%M%S', 
time.gmtime())
+                    if section:
+                        m = 
re.search("\.3D\_*(\.27\.27+)?(\.5B\.5B)?\_*%s\_*(\.5B\.5B)?(\.27\.27+)?\_*\.3D"
 % re.escape(section), sectionencode(text,page2.site().encoding()))
+                        if not m:
+                            try:
+                                page2._getexception
+                                output(u"WARNING: Section not found: %s" % 
page2.aslink(forceInterwiki = True))
+                            except AttributeError:
+                                # There is no exception yet
+                                page2._getexception = SectionError
+                successful = True
+                # Note that there is no break here. The reason is that there
+                # might be duplicates in the pages list.
+        if not successful:
+            output(u"BUG>> title %s (%s) not found in list" % (title, 
page.aslink(forceInterwiki=True)))
+            output(u'Expected one of: %s' % 
u','.join([page2.aslink(forceInterwiki=True) for page2 in self.pages]))
+            raise PageNotFound
+
+    def headerDoneApi(self, header):
+        p = re.compile('^MediaWiki (.+)$')
+        m = p.match(header['general']['generator'])
+        if m:
+            version = m.group(1)
+            if version != self.site.version():
+                output(u'WARNING: Family file %s contains version number %s, 
but it should be %s' % (self.site.family.name, self.site.version(), version))
+
+        # Verify case
+        if self.site.nocapitalize:
+            case = 'case-sensitive'
+        else:
+            case = 'first-letter'
+        if case != header['general']['case'].strip():
+            output(u'WARNING: Family file %s contains case %s, but it should 
be %s' % (self.site.family.name, case, header.case.strip()))
+
+        # Verify namespaces
+        lang = self.site.lang
+        ids = header['namespaces'].keys()
+        ids.sort()
+        for id in ids:
+            nshdr = header['namespaces'][id]['*']
+            id = header['namespaces'][id]['id']
+            if self.site.family.isDefinedNSLanguage(id, lang):
+                ns = self.site.namespace(id) or u''
+                if ns != nshdr:
+                    try:
+                        dflt = self.site.family.namespace('_default', id)
+                    except KeyError:
+                        dflt = u''
+                    if not ns and not dflt:
+                        flag = u"is not set, but should be '%s'" % nshdr
+                    elif dflt == ns:
+                        flag = u"is set to default ('%s'), but should be '%s'" 
% (ns, nshdr)
+                    elif dflt == nshdr:
+                        flag = u"is '%s', but should be removed (default value 
'%s')" % (ns, nshdr)
+                    else:
+                        flag = u"is '%s', but should be '%s'" % (ns, nshdr)
+                    output(u"WARNING: Outdated family file %s: 
namespace['%s'][%i] %s" % (self.site.family.name, lang, id, flag))
+                    #self.site.family.namespaces[id][lang] = nshdr
+            else:
+                output(u"WARNING: Missing namespace in family file %s: 
namespace['%s'][%i] (it is set to '%s')" % (self.site.family.name, lang, id, 
nshdr))
+        for id in self.site.family.namespaces:
+            if self.site.family.isDefinedNSLanguage(id, lang) and u'%i' % id 
not in header['namespaces']:
+                output(u"WARNING: Family file %s includes namespace['%s'][%i], 
but it should be removed (namespace doesn't exist in the site)" % 
(self.site.family.name, lang, id ) )
+            
+    def getDataApi(self):
+        pagenames = [page.sectionFreeTitle() for page in self.pages]
+        ## We need to use X convention for requested page titles.
+        #if self.site.lang == 'eo':
+        #    pagenames = [encodeEsperantoX(pagetitle) for pagetitle in 
pagenames]
+        
+        params = {
+            'action': 'query',
+            'meta':'siteinfo',
+            'prop': ['info', 'revisions'],
+            'titles': pagenames,
+            'siprop': ['general', 'namespaces'],
+            'rvprop': ['content', 'timestamp', 'user', 'comment', 
'size'],#'ids', 
+            'inprop': ['protection', 'talkid', 'subjectid'], #, 'url', 
'readable'
+        }
+        
+        # Slow ourselves down
+        get_throttle(requestsize = len(self.pages))
+        # Now make the actual request to the server
+        now = time.time()
+        
+        #get_throttle.setDelay(time.time() - now)
+        return query.GetData(params, self.site)
+    
 def getall(site, pages, throttle=True, force=False):
     """Use Special:Export to bulk-retrieve a group of pages from site
 
@@ -3949,7 +4131,11 @@
     """
     # TODO: why isn't this a Site method?
     pages = list(pages)  # if pages is an iterator, we need to make it a list
-    output(u'Getting %d pages from %s...' % (len(pages), site))
+    output(u'Getting %d pages from %s' % (len(pages), site), newline = False)
+    #if config.use_api:
+    #    output(u' via API...')
+    #else:
+    output(u'...')
     limit = config.special_page_limit / 4 # default is 500/4, but It might 
have good point for server.
     
     if len(pages) > limit:



_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn

[Pywikipedia-svn] SVN: [7751] trunk/pywikipedia/wikipedia.py

Reply via email to