Hey list, I've been working on my spider program (albeit not very much :P) and I would appreciate it if I could get some comments on the code. I'm fairly sure I haven't chosen the best method to do what I want to do, but if we can just assume that I have, that'll make things easier. ;)
In particular, comments on the "Things to do" list at the end would be much appreciated. The file is attached. Cheers, Dan
#!/usr/bin/python

### sPyder - A program designed to retrieve a particular, related set of
### web pages, using a spider-type search.
### sPyder is released under the GPL.

import re
import urllib
import distutils.dir_util
import os
from optparse import OptionParser

# Patterns describing which references to follow and copy.
# These should be moved to a configuration file, ideally.
DEFAULT_DEFS = r'[cf]=".*html"|c=".*jpg"|c=".*gif"|c=".*bmp"|[cf]="/\w*/'


def main():
    ### This is the function which brings all of the other functions
    ### together, as well as providing the command line parsing interface.

    # This is all command line parsing
    usage = "usage: %prog -s <remote site> -l <local destination> -f <source file> -d <definitions file>"
    parser = OptionParser(usage=usage)
    parser.add_option("-s", "--site",
                      help="the site to be copied (must start with 'http://')")
    parser.add_option("-l", "--local_destination", dest="localdir",
                      help="the destination the site will be copied to")
    parser.add_option("-f", "--file", dest="source", default="index.html",
                      help="the file to start at (assumed to be 'index.html' if not specified)")
    parser.add_option("-d", "--definitions", default=None,
                      help="a file containing definitions of what files to copy (reverts to defaults if not specified)")
    (options, args) = parser.parse_args()

    if not (options.site and options.localdir):
        parser.error("both a site (-s) and a local destination (-l) are required")

    # Preparing and tidying up the input for use by the actual fetching
    # functions (copy_items)
    site = options.site
    localdir = options.localdir
    source = options.source
    if options.definitions:
        # This is not implemented correctly; it is just a placeholder.
        # The option should name a definitions file to be read in.
        definitions = options.definitions
    else:
        definitions = DEFAULT_DEFS
    if not site.endswith("/"):
        site = site + "/"
    if not localdir.endswith("/"):
        localdir = localdir + "/"
    copy_items(site, localdir, source, definitions)


def get_page_items(site, localdir, source, defs):
    ### This function returns a list of all the items within a page which
    ### fit the definitions.
    next = []
    text = urllib.urlopen(site + source).read()
    for match in re.findall(defs, text):
        # Strip the leading 'c="'/'f="' and the trailing quote or slash.
        item = match[3:-1]
        if match.endswith('/'):
            # A straight reference to a folder. os.listdir only works
            # when 'site' is a local path; proper support for folder
            # references is on the to-do list.
            next.extend(os.listdir(site + item))
        else:
            next.append(item)
    src = [source] + next
    print src  # debug trace
    return src


def get_list(site, localdir, source, defs):
    ### This function returns a list of the pages which need to be copied.
    items = []
    next = get_page_items(site, localdir, source, defs)
    # 'next' grows as we iterate; the 'not in items' check stops us from
    # fetching the same page twice.
    for i in next:
        if i not in items:
            items.append(i)
            next.extend(get_page_items(site, localdir, i, defs))
    return items


def copy_items(site, localdir, source, defs):
    ### This function copies the list of files which has been compiled.
    items = get_list(site, localdir, source, defs)
    # Create any needed local directories once, not once per file.
    distutils.dir_util.create_tree(localdir, items)
    for i in items:
        original = urllib.urlopen(site + i)
        local = open(localdir + i, 'w')
        local.write(original.read())
        local.close()


if __name__ == "__main__":
    main()

# Things to do:
#  - Create 'definitions' file
#  - Create support for straight references to a folder
#  - Look into compiling the REs for greater efficiency
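For the "compiling the REs" item on the to-do list, here is a minimal sketch of what that could look like (a suggestion only, not tested against the rest of the script): compile the definitions pattern once at startup and reuse the compiled object on every page, instead of handing the raw string to re.findall each time.

    import re

    DEFAULT_DEFS = r'[cf]=".*html"|c=".*jpg"|c=".*gif"|c=".*bmp"|[cf]="/\w*/'

    # Compile once; re.findall(pattern_string, text) re-parses the pattern
    # on every call (modulo re's internal cache), a compiled object does not.
    DEFS_RE = re.compile(DEFAULT_DEFS)

    def find_items(text):
        # Same result as re.findall(DEFAULT_DEFS, text).
        return DEFS_RE.findall(text)

And for the 'definitions' file item, one possible format is one pattern per line, with blank lines and '#' comments skipped; the loader below (load_definitions is a made-up name, as is the format itself) joins them into a single alternation that could drop in wherever DEFAULT_DEFS is used now:

    def load_definitions(path):
        # Read patterns from 'path', skipping blanks and comments, and
        # join them with '|' so the result is one big alternation.
        patterns = []
        for line in open(path):
            line = line.strip()
            if line and not line.startswith('#'):
                patterns.append(line)
        return '|'.join(patterns)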