Hey list,
I've been working on my spider program (albeit not very much :P) and I
would appreciate it if I could get some comments on the code. I'm fairly
sure I haven't chosen the best method to do what I want to do, but if we
can just assume that I have, that'll make things easier. ;)

In particular, comments on the "Things to do" list at the end would be
much appreciated.

The file is attached.

Cheers,
Dan
#!/usr/bin/python

### sPyder - A program designed to retrieve a particular, related set of web pages, using a spider-type search.
### sPyder is released under the GPL.

import re
import urllib
import distutils.dir_util
import os
from optparse import OptionParser

# Default link-extraction patterns; each alternative matches an attribute
# of the form c="..." or f="..." for html pages, images, or a directory
# reference.  Raw string so the \w escape survives intact (a plain string
# with \w triggers an invalid-escape warning on modern Pythons).
# TODO: these should be moved to a configuration file, ideally.
DEFAULT_DEFS = r'[cf]=".*html"|c=".*jpg"|c=".*gif"|c=".*bmp"|[cf]="/\w*/'

def main():
    """Parse the command line and start the site copy.

    Does nothing unless both --site and --local_destination are given.
    """
    # Command-line parsing.  The usage string now matches the real
    # options: the original advertised "-d <local destination>" although
    # -d is the definitions file and -l is the local destination.
    usage = "usage: %prog -s <remote site> -l <local destination> -f <source file> -d <definitions file>"
    parser = OptionParser(usage=usage)

    parser.add_option("-s", "--site",
                      help="the site to be copied (must start with 'http://')")
    parser.add_option("-l", "--local_destination", dest="localdir",
                      help="the destination the site will be copied to")
    parser.add_option("-f", "--file", dest="source",
                      help="the file to start at (assumed to be 'index.html' if not specified)",
                      default="index.html")
    parser.add_option("-d", "--definitions",
                      help="a file containing definitions of what files to copy (reverts to defaults if not specified)",
                      default=None)

    (options, args) = parser.parse_args()

    site = options.site
    localdir = options.localdir
    source = options.source
    # TODO: reading the patterns out of the definitions *file* is not
    # implemented yet; the option value is used as the pattern itself.
    if options.definitions:
        definitions = options.definitions
    else:
        definitions = DEFAULT_DEFS

    # Guard BEFORE normalizing: the original ran re.search on site and
    # localdir first, which raised TypeError when either option was
    # missing (both default to None).
    if site and localdir:
        if not site.endswith('/'):
            site = site + "/"
        if not localdir.endswith('/'):
            localdir = localdir + "/"
        copy_items(site, localdir, source, definitions)

def get_page_items(site, localdir, source, defs):
    """Return [source] plus every item in the page site+source that
    matches one of the regex alternatives in defs.

    Each match is assumed to look like c="value" or f="value": the
    3-character prefix and the trailing quote are sliced off to recover
    the bare value.  localdir is unused here but kept for a uniform
    signature with the other helpers.
    """
    found = []
    text = urllib.urlopen(site + source).read()

    # findall simply returns [] when nothing matches, so the original's
    # extra re.search pre-check was redundant and has been dropped.
    for match in re.findall(defs, text):
        item = match[3:-1]  # strip leading 'c="'/'f="' and trailing '"'
        if re.search(r'/\w*/\Z', item):
            # Directory reference.  extend (not append) keeps the result
            # flat -- the original append nested a whole list inside it.
            # NOTE(review): os.listdir cannot list a remote URL and will
            # raise here; proper folder support is still on the to-do list.
            found.extend(os.listdir(site + item))
        else:
            found.append(item)

    src = [source] + found
    print(src)  # progress trace
    return src
        
def get_list(site, localdir, source, defs):
    """Collect the unique list of pages reachable from source, following
    matching references transitively (breadth-first).

    Returns the items in first-seen order.
    """
    items = []
    # Work queue, seeded with the start page's items; note that
    # get_page_items already includes source itself in its result.
    # (Renamed from 'next', which shadowed the builtin.)
    queue = get_page_items(site, localdir, source, defs)
    for i in queue:
        if i not in items:
            items.append(i)
            # The seed call above already fetched source's page, so skip
            # refetching it -- the original downloaded the start page twice.
            if i != source:
                queue.extend(get_page_items(site, localdir, i, defs))
    return items

def copy_items(site, localdir, source, defs):
    """Download every file reachable from site+source into localdir,
    preserving the relative paths."""
    items = get_list(site, localdir, source, defs)
    # Create the whole local directory tree once, up front -- the
    # original re-ran this for the full list on every loop iteration.
    distutils.dir_util.create_tree(localdir, items)
    for i in items:
        original = urllib.urlopen(site + i)
        try:
            body = original.read()
        finally:
            original.close()  # the original leaked the response object
        # 'wb': pages and images alike are written as raw bytes; text
        # mode would corrupt binaries on newline-translating platforms.
        local = open(localdir + i, 'wb')
        try:
            local.write(body)
        finally:
            local.close()
# Guard the entry point so the module can be imported (e.g. for testing)
# without immediately starting a crawl.
if __name__ == "__main__":
    main()

#Things to do:
# - Create 'definitions' file
# - Create support for straight references to a folder
# - Look into compiling RE for greater efficiency
_______________________________________________
Tutor maillist  -  Tutor@python.org
http://mail.python.org/mailman/listinfo/tutor

Reply via email to