[EMAIL PROTECTED] wrote: > Greetings gents. I'm a Railser working on a Django app that needs to do > some scraping to gather its data. > > I need to programmatically access a site that requires a username and > password. Once I post to the login.php page, there seems to be a > redirect and it seems that the site is using a session (perhaps a > cookie) to determine whether the user is logged in. So I need to log in > and then have cookies and/or sessions maintained as I access the page > that contains the content that I am actually interested in. > > What is the simplest way to post data to a form, accept a cookie to > maintain the session (and support redirects) and then (now logged into > the site) retrieve the content of a page on the site? > > Is there a library or technique that makes this simple? > > Thanks! >
I submit to you, in its entirety a, a script I wrote to do this. I think its simple enough to figure out the important parts. I left some debugging code in. Sorry for no explanation, etc., but I'm still playing with the new 2.5 distro! This program worked for me, but there may be comment on whether its good code from others. Names have been changed to protect the innocent. James #! /usr/bin/env python import sys import os.path import time import random import urllib import urllib2 import cookielib class DummyError(Exception): pass ####################################################################### # some constants ####################################################################### COOKIEFILE = 'cookies.lwp' pda = "http://www.somemapcompany.com/" signin_params = urllib.urlencode({'email_address':'[EMAIL PROTECTED]', 'password':'yourpasshere', 'action':'log_in', 'Submit':'Sign In'}) signout_params = urllib.urlencode({'action':'sign_out'}) download_dict = {'screen':'map_details', 'action':'download'} info_dict = {'screen':'map_details', 'back':'find_maps', 'view':'state'} ####################################################################### # setup for cookies ####################################################################### cj = cookielib.LWPCookieJar() if os.path.isfile(COOKIEFILE): cj.load(COOKIEFILE) opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) urllib2.install_opener(opener) ####################################################################### # make the signin request ####################################################################### user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' txheaders = {'User-agent' : user_agent} req = urllib2.Request(pda, signin_params, txheaders) ####################################################################### # download_map ####################################################################### def download_map(map_id, urlopen=urllib2.urlopen): 
##################################################################### # download the map ##################################################################### download_dict['map_id'] = map_id download_params = urllib.urlencode(download_dict) mapin = urlopen(pda + ("?%s" % download_params)) mapout = open('map-%s' % map_id, "wb") mapout.write(mapin.read()) mapout.close() ##################################################################### # download the map info ##################################################################### info_dict['map_id'] = map_id info_params = urllib.urlencode(info_dict) infoin = urlopen(pda + ("?%s" % info_params)) infoout = open('info-%s' % map_id, "wb") infoout.write(infoin.read()) infoout.close() ####################################################################### # signin and print info ####################################################################### try: signin = urllib2.urlopen(req) except IOError, e: print 'We failed to open "%s".' % pda if hasattr(e, 'code'): print 'We failed with error code - %s.' % e.code else: print print 'Here are the headers of the page :' print signin.info() afile = open("signin.html", "w") afile.write(signin.read()) afile.close() ####################################################################### # report and save cookies ####################################################################### print for index, cookie in enumerate(cj): print index, ' : ', cookie cj.save(COOKIEFILE, True, True) print print "cookies===>" os.system('cat %s' % COOKIEFILE) print "<===cookies" for map_id in xrange(1001, 1507): try: download_map(map_id) wait = 7.5 + random.randint(0,5) print "=====> waiting %s seconds..." % wait time.sleep(wait) except urllib2.HTTPError, e: # except DummyError, e: print "%s: failed to download" % map_id print " HTTP ERROR:", e else: print "# Downloaded map %s successfully." 
% map_id signout = urllib.urlopen(pda + ("?%s" % signout_params)) afile = open("signout.html", "w") afile.write(signout.read()) afile.close() -- James Stroud UCLA-DOE Institute for Genomics and Proteomics Box 951570 Los Angeles, CA 90095 http://www.jamesstroud.com/ -- http://mail.python.org/mailman/listinfo/python-list