Matthias Kaulartz said...
>
>Do you know how to speak to the libwww module to use GET with formdata?
>

Yes, I do. Here is an example. Save this code into a file and call it 'search'.
At the command line, type 'perl search perl', 'perl search oracle', or
'perl search "home cooking"'.

I'd strip the comments. It really isn't that hard to do.:)

#####################################################################
#!/usr/bin/perl
use strict;

use HTML::TokeParser;
use LWP::UserAgent;
use HTTP::Request;
use HTML::Form;
use URI;

# I use dumpvar.pl to 'see' the data structures. 
# Like the x command in the debugger

require 'dumpvar.pl';

# Running total of books printed across all result pages.
my $found = 0;

# Simple command line interface: the first argument is the word (or quoted
# phrase) to look for in book titles. Getopt::Long could be used here if
# more options are ever needed.
my $what = $ARGV[0];
die "Usage: perl $0 <title word or phrase>\n"
    unless defined $what && length $what;

# Turn on autoflush for the currently selected output handle so results
# appear as soon as they are printed.
$| = 1;

my $base = 'http://www.amazon.com';         # the base URL
my $path = 'exec/obidos/ats-query-page/';   # the advanced search path

# Create a user agent and give it a name.
my $ua = LWP::UserAgent->new();
$ua->agent('BookSearch/1.0');

# Build the absolute URI of the advanced search page and fetch it.
my $uri = URI->new_abs($path, $base);
my $req = HTTP::Request->new(GET => $uri);
my $res = $ua->request($req);

if ( $res->is_success ) {
    # The Author, Title, Subject form has an action path of
    # '/exec/obidos/ats-query'.
    my $form = _find_form($res, qr/ats-query/);

    if ($form) {
        # Fill out the form: match on title words, search for $what
        # (taken from @ARGV above), sort the output by publication date.
        $form->value('title-mode', 'word');
        $form->value('title', $what);
        $form->value('sort-type', 'publication-date');

        # The click method returns a new request object; submit it.
        $req = $form->click;
        $res = $ua->request($req);

        if ( $res->is_success ) {
            # parse_text prints the books found along with the desired
            # info and returns the number of books it printed.
            $found += parse_text($res);

            # Follow the 'More' (continue) form for as long as a result
            # page offers one.
            while ( my $more = _find_form($res, qr/subsequent-query/) ) {
                $req = $more->click;
                print STDERR "Going out again: $found\n";
                $res = $ua->request($req);

                # Do not try to parse an error page as a result page.
                unless ( $res->is_success ) {
                    print 'The request failed : ', $res->status_line, "\n";
                    last;
                }

                print "Next bunch\n\n";
                $found += parse_text($res);
            }
        }
        else {
            print 'The request failed : ', $res->status_line, "\n";
        }

        print "Total Books found: $found\n";
    }
    else {
        warn "Could not find the search form on $uri\n";
    }
}
else {
    print 'The request failed : ', $res->status_line, "\n";
}

# Return the first form in the response whose action URL matches the given
# pattern, or nothing when no form matches.
#   $response  - response object providing content() and base()
#   $action_re - regex (qr//) tested against each form's action
sub _find_form {
    my ($response, $action_re) = @_;

    for my $candidate ( HTML::Form->parse($response->content, $response->base) ) {
        # Stop at the first match so later, unrelated forms on the page
        # cannot override it.
        return $candidate if $candidate->action =~ $action_re;
    }
    return;
}

# if we can print it to the screen we can stick it in a database
# but I need to see the schema first.

# Print every book listed in a search result page and return how many were
# printed.
#
#   $resp - response object; only its content() method is used.
#
# For each <dt> entry this prints Title, ISBN, Author, Book Type, Published
# (only when a publication year was found) and Price, followed by two blank
# lines. Entries whose description contains 'Not Yet Published' are skipped
# and not counted. Fields that cannot be extracted are shown as 'unknown'.
#
# Everything here depends on what the web page looks like. All of it will
# have to change if the site changes the design of the page.
sub parse_text {
    my $resp = shift;

    # Parse the content
    my $p = HTML::TokeParser->new(\$resp->content);
    my $found = 0;

    while ( $p->get_tag('dt') ) {
        # A <dt> with no following link means there is no further entry to
        # read; stop instead of printing an empty record.
        my $link = $p->get_tag('a') or last;
        my $href = $link->[1]{href};

        # Only trust the capture when the match succeeded; otherwise $1
        # would still hold whatever an earlier match left behind.
        my $isbn = 'unknown';
        if ( defined $href
            && $href =~ m{/exec/obidos/ASIN/([^/]+)/qid=\d+/sr=\d+-\d+/} )
        {
            $isbn = $1;
        }

        my $title = $p->get_trimmed_text('/a');
        $p->get_tag('dd');
        my $text = $p->get_trimmed_text('p');
        $text =~ s/Read more about this title.*//;
        next if $text =~ /Not Yet Published/;

        # The description is split on '/' into author, book type and the
        # remaining details. When pieces are missing, pad so that a lone
        # piece is treated as the author and two pieces as type + details.
        my @fields = split m{/}, $text, 3;
        if ( @fields == 1 ) {
            push @fields, ('unknown') x 2;
        }
        elsif ( @fields != 3 ) {
            unshift @fields, 'unknown';
        }
        my ($author, $book_type, $details) =
            map { $_ || 'unknown' } @fields[0 .. 2];

        # List-context matches yield undef on failure, so a missing price
        # or year can never pick up a stale capture from a previous match.
        my ($price)     = $details =~ /Our Price:\s*([^ ]+)/;
        my ($published) = $details =~ /(Published\s+\d\d\d\d)/;

# uncomment this if you want to see what the extracted fields look like
#       dumpValue([$author, $book_type, $details, $price, $published]);

        print "Title: $title\n";
        print "ISBN: $isbn\n";
        print "Author: $author\n";
        print "Book Type: $book_type\n";
        print "Published: $published\n" if $published;
        print 'Price: ', ( defined $price ? $price : 'unknown' ), "\n";
        print "\n\n";
        $found++;
    }
    return $found;
}

############################################################

-- 
Tim Allwine
IX Development Laboratories
  • GET Matthias Kaulartz
    • tallwine

Reply via email to