One of the problems GAE users face is how to run periodic or CPU-intensive
tasks. Since GAE limits each request to 10 seconds, most users cannot run
tasks that take longer than that. Periodic jobs are also not supported by
GAE. One suggested method is to call a GAE URL periodically
from an external service/box.

Here is a workaround to do cron without any external service/box
support. This library provides two functions, timer and loop.
timer allows a callback function to be called at periodic intervals;
each
time, a different HTTP request is used to call the callback function.
loop allows a callback function to be called on every member of a
list; each callback is called in a different HTTP request. There is no
limit on the number of elements in the list.

Using this library, I wrote a program that fetches a list of URLs
every hour. Here is the complete code.
I think programs like crawlers or Google News-style sites can be
built with this library.
Warning: This makes heavy use of urlfetch, and the maximum number of
urlfetch calls per day is 160000, so be careful not to reach that limit.
You will get many errors saying "deadline exceed", and that is expected.

So here is the complete code. To start the program point your browser
to http://<app-id>.appspot.com/start and to stop the program point
your browser to http://<app-id>.appspot.com/stop

#!/usr/bin/env python
import os
from cStringIO import StringIO
from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app
from google.appengine.ext.webapp import template
from google.appengine.api import urlfetch
from google.appengine.api import memcache
import time
import logging
import md5

def mysleep(r):
        """Pause the current request for a fixed three seconds.

        The ``r`` argument is accepted but not used; the delay is always
        the same regardless of its value.
        """
        delay_seconds = 3
        time.sleep(delay_seconds)

def geturl(url):
        """Fetch ``url`` with urlfetch and discard the response.

        The call is best-effort: any error raised by the fetch is logged
        and suppressed so that the caller always continues normally.

        url -- absolute URL to request.

        Returns None.
        """
        try:
                urlfetch.fetch(url)
        except Exception:
                # Keep the best-effort contract, but leave a trace in the
                # logs instead of dropping the failure silently.
                logging.warning('urlfetch of %s failed', url, exc_info=True)

def geturlkey(url):
        """Return the hexadecimal MD5 digest of ``url``.

        The digest is a fixed-length, 32-character string derived from the
        URL, suitable as a cache key.

        url -- the URL as a byte string or a text (unicode) string; text
               is encoded as UTF-8 before hashing.

        Returns the digest as a 32-character lowercase hex string.
        """
        # Local import: hashlib replaces the deprecated ``md5`` module.
        import hashlib

        if isinstance(url, type(u'')):
                # Hash functions operate on bytes; encode text explicitly so
                # that non-ASCII URLs do not fail on an implicit conversion.
                url = url.encode('utf-8')
        return hashlib.md5(url).hexdigest()

def timer(func, interval):
        """Register a callback to be run at a fixed interval.

        func -- name of the callback, as a string.
        interval -- expiry time given to the callback's marker key.

        The registration is appended to the list stored under the
        ``'timer'`` memcache key, and a ``'timer-<func>'`` marker key is
        set that expires after ``interval``.
        """
        registered = memcache.get('timer')
        if registered is None:
                registered = []
        entry = {'func': func, 'interval': interval}
        registered.append(entry)
        marker_key = 'timer-' + func
        memcache.set(marker_key, '1', interval)
        memcache.set('timer', registered)

def loop(func, args):
        """Queue a callback to be run once for each element of ``args``.

        func -- name of the callback, as a string.
        args -- list of arguments; one call is queued per element.

        The request is appended to the list stored under the ``'loop'``
        memcache key.
        """
        pending = memcache.get('loop')
        if pending is None:
                pending = []
        job = {'func': func, 'args': args}
        pending.append(job)
        memcache.set('loop', pending)

def handletimer(host, uindex):
        """Run the next due timer callback, if any, and chain the next request.

        Timers are read from the list stored under the ``'timer'`` memcache
        key and scanned round-robin, starting at position ``uindex`` and
        wrapping around to the beginning. A timer is due when its
        ``'timer-<func>'`` marker key is absent from memcache.

        host -- base URL (scheme and host) used to build the follow-up URL.
        uindex -- position in the timer list at which scanning starts;
                  values outside the list are clamped.

        Returns True if a callback was run and ``/next`` was requested,
        False if no timer is registered or none is due.
        """
        timerlist = memcache.get('timer')
        if not timerlist:
                return False
        count = len(timerlist)
        # Clamp the start position so that a stale or oversized index cannot
        # raise IndexError during the wrap-around part of the scan.
        start = min(max(uindex, 0), count)
        scan_order = list(range(start, count)) + list(range(0, start))
        for index in scan_order:
                current = timerlist[index]
                if memcache.get('timer-' + current['func']) is None:
                        break
        else:
                # Every marker key is still present: nothing is due.
                return False
        # Re-arm the marker before running the callback.
        memcache.set('timer-' + current['func'], '1', current['interval'])
        try:
                # NOTE(review): eval() executes whatever name is stored in
                # memcache; the stored names must come from trusted code only.
                eval(current['func'] + '()')
        except Exception:
                # A failing callback must not break the request chain.
                logging.exception('timer callback %s failed', current['func'])
        # Continue after the timer just run, wrapping to 0 at the end.
        geturl(host + '/next?t=t&i=' + str((index + 1) % count))
        return True

def handleloop(host, uindex):
        """Process one pending loop item and chain the next request.

        Takes the first argument of the first entry in the list stored under
        the ``'loop'`` memcache key, writes the shortened list back (or
        deletes the key when nothing is left), calls the entry's function
        with that argument, and then requests ``/next`` on ``host``.

        host -- base URL (scheme and host) used to build the follow-up URL.
        uindex -- counter passed along; ``uindex + 1`` is sent as ``i``.

        Returns True if an item was processed, False if nothing is pending.
        """
        looplist = memcache.get('loop')
        if looplist is None:
                return False
        # Entries with no arguments have nothing to run. Skip them so they
        # can neither block nor discard the entries queued behind them.
        while looplist and not looplist[0]['args']:
                looplist.pop(0)
        if not looplist:
                memcache.delete('loop')
                return False
        head = looplist[0]
        arg = head['args'].pop(0)
        func = head['func']
        if not head['args']:
                # That was the entry's last argument; drop the entry.
                looplist.pop(0)
        # Persist the queue before running the callback.
        if looplist:
                memcache.set('loop', looplist)
        else:
                memcache.delete('loop')
        try:
                # NOTE(review): eval() executes whatever name is stored in
                # memcache, and the argument is round-tripped through repr();
                # stored values must come from trusted code only.
                eval(func + '(' + repr(arg) + ')')
        except Exception:
                # A failing callback must not break the request chain.
                logging.exception('loop callback %s(%r) failed', func, arg)
        geturl(host + '/next?t=l&i=' + str(uindex + 1))
        return True

class MainPage(webapp.RequestHandler):
        """Request handler that replies with a fixed greeting."""

        def get(self):
                greeting = 'hello world'
                self.response.out.write(greeting)

class StartPage(webapp.RequestHandler):
        """Request handler that starts the scheduler.

        Replies "fail" if the ``'status'`` memcache key already says
        ``'running'``. Otherwise it marks the status as running, clears the
        stored timer and loop lists, calls ``startfunction()``, requests
        ``/task`` on the same host, and replies "ok".
        """

        def get(self):
                out = self.response.out
                if memcache.get('status') == 'running':
                        out.write("fail")
                        return
                memcache.set('status', 'running')
                # Start from a clean slate before the user code registers
                # its timers and loops.
                for stale_key in ('timer', 'loop'):
                        memcache.delete(stale_key)
                startfunction()
                task_url = "http://" + self.request.headers["HOST"] + '/task'
                geturl(task_url)
                out.write("ok")

class StopPage(webapp.RequestHandler):
        """Request handler that stops the scheduler.

        Sets the ``'status'`` memcache key to ``'stop'`` and replies "ok".
        """

        def get(self):
                new_status = 'stop'
                memcache.set('status', new_status)
                self.response.out.write("ok")

class TimerPage(webapp.RequestHandler):
        """Request handler that runs the next due timer.

        Replies "ok" immediately. While the ``'status'`` memcache key is
        ``'running'``, it calls ``handletimer`` starting at the index given
        by the ``i`` query parameter. If no timer was run, it sleeps and
        then requests ``/task`` with the retry counter ``r`` incremented.
        """

        def get(self):
                self.response.out.write("ok")
                if memcache.get('status') != 'running':
                        return
                host = "http://" + self.request.headers["HOST"]
                index = int(self.request.get('i', '0'))
                if not handletimer(host, index):
                        # Nothing was due: wait, then fall back to /task.
                        retry = int(self.request.get('r', '0'))
                        mysleep(retry)
                        geturl(host + '/task?r=' + str(retry + 1))

class TaskPage(webapp.RequestHandler):
        """Request handler that dispatches pending work.

        Replies "ok" immediately. While the ``'status'`` memcache key is
        ``'running'``, it first tries ``handleloop`` and, if that did
        nothing, ``handletimer``. If neither did any work, it sleeps and
        then requests ``/sleep`` with the retry counter ``r`` incremented.
        """

        def get(self):
                self.response.out.write("ok")
                if memcache.get('status') != 'running':
                        return
                host = "http://" + self.request.headers["HOST"]
                # Loop items take priority over timers; the second call only
                # happens when the first one found nothing to do.
                if handleloop(host, 0) or handletimer(host, 0):
                        return
                retry = int(self.request.get('r', '0'))
                mysleep(retry)
                geturl(host + '/sleep?r=' + str(retry + 1))

class LoopPage(webapp.RequestHandler):
        """Request handler that processes the next loop item.

        Replies "ok" immediately. While the ``'status'`` memcache key is
        ``'running'``, it calls ``handleloop`` with the index given by the
        ``i`` query parameter and, if that did nothing, ``handletimer``.
        If neither did any work, it requests ``/task`` with the retry
        counter ``r`` incremented.
        """

        def get(self):
                self.response.out.write("ok")
                if memcache.get('status') != 'running':
                        return
                host = "http://" + self.request.headers["HOST"]
                index = int(self.request.get('i', '0'))
                # The timer check only happens when no loop item was run.
                if handleloop(host, index) or handletimer(host, 0):
                        return
                retry = int(self.request.get('r', '0'))
                geturl(host + '/task?r=' + str(retry + 1))

class NextPage(webapp.RequestHandler):
        """Request handler that forwards the chain to ``/timer`` or ``/loop``.

        Replies "ok" immediately. While the ``'status'`` memcache key is
        ``'running'``, it requests ``/timer`` when the ``t`` query parameter
        is ``'t'`` and ``/loop`` otherwise, passing the ``i`` parameter on.
        """

        def get(self):
                self.response.out.write("ok")
                if memcache.get('status') != 'running':
                        return
                index = int(self.request.get('i', '0'))
                if self.request.get('t', 'l') == 't':
                        target = "timer"
                else:
                        target = "loop"
                host = "http://" + self.request.headers["HOST"]
                geturl(host + '/' + target + '?i=' + str(index))

class SleepPage(webapp.RequestHandler):
        """Request handler that waits and then goes back to ``/task``.

        Replies "ok" immediately. While the ``'status'`` memcache key is
        ``'running'``, it sleeps and then requests ``/task`` with the retry
        counter ``r`` incremented.
        """

        def get(self):
                self.response.out.write("ok")
                if memcache.get('status') != 'running':
                        return
                retry = int(self.request.get('r', '0'))
                mysleep(retry)
                host = "http://" + self.request.headers["HOST"]
                geturl(host + '/task?r=' + str(retry + 1))

# URL routing table: each path is served by the handler class paired with it.
application = webapp.WSGIApplication(
        [
                ('/', MainPage),
                ('/start', StartPage),
                ('/stop', StopPage),
                ('/task', TaskPage),
                ('/loop', LoopPage),
                ('/timer', TimerPage),
                ('/next', NextPage),
                ('/sleep', SleepPage),
        ],
        debug=True)

def main():
        """Serve ``application`` through ``run_wsgi_app``."""
        run_wsgi_app(application)

#user's code
#list of urls to be fetched
urllist = ['http://www.google.com/', 'http://www.cnn.com/',
           'http://www.yahoo.com', 'http://news.google.com']

def getone(url):
        """Fetch ``url`` and, on an HTTP 200 reply, mark it as fetched.

        The mark is a memcache entry keyed by ``geturlkey(url)`` that
        expires after one hour. Errors are logged and suppressed.
        """
        try:
                result = urlfetch.fetch(url)
                if result.status_code == 200:
                        memcache.set(geturlkey(url), '1', 60*60)
        except Exception:
                # Best-effort: one failing URL must not stop the others.
                logging.warning('fetch of %s failed', url, exc_info=True)

def getallurl():
        """Queue a ``getone`` call for every URL that has no fetched mark."""
        fetchlist = [url for url in urllist
                     if memcache.get(geturlkey(url)) is None]
        #this is equivalent to
        #for url in fetchlist: getone(url)
        if fetchlist:
                loop('getone', fetchlist)

def startfunction():
        """Register the periodic jobs; called when the scheduler starts."""
        #function getallurl will be called every 60*60 seconds
        timer('getallurl', 60*60)

# The entry-point guard has to come after the callbacks above: when this
# file is executed as a script, main() serves the request right away, and
# any name defined below the guard would not exist yet at that point.
if __name__ == "__main__":
        main()


--~--~---------~--~----~------------~-------~--~----~
You received this message because you are subscribed to the Google Groups 
"Google App Engine" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to [EMAIL PROTECTED]
For more options, visit this group at 
http://groups.google.com/group/google-appengine?hl=en
-~----------~----~----~----~------~----~------~--~---

Reply via email to