Here's some code for you. It doesn't do any form input, and you might consider making it more friendly to the webserver with some sleep lines, depending on who you're scraping.
use strict;
use warnings;

use LWP::UserAgent;
use HTML::TokeParser;
use HTTP::Cookies::Netscape;
use URI;

############################################
# Configuration
############################################

# Search terms; one BusinessWeek search is run per entry.
my @keys = ('Bush', '"John Kerry"');

# Seconds to sleep before every HTTP request. 0 keeps the script's original
# behaviour; raise it to be friendlier to the web server you are scraping.
my $pause_seconds = 0;

# Bases used to turn the relative links found in the pages into absolute URLs.
my $search_base  = 'http://search.businessweek.com/';
my $article_base = 'http://www.businessweek.com/';

# This works if you collect persistent cookies in a Mozilla/Netscape
# Windows browsing session.
my $browser    = LWP::UserAgent->new();
my $cookie_jar = HTTP::Cookies::Netscape->new(file => "C:/PATH/cookies.txt");
$browser->cookie_jar($cookie_jar);

###########################################
# Pipeline: search result pages -> article URLs -> continuation pages.
GetSearchPages();
GetArticleURLs('urls.txt');
GetDeepArticles('articles.txt');
###########################################

# _fetch_stream($url)
#
# Fetches $url with the shared user agent and returns an HTML::TokeParser
# positioned at the start of the response body. Returns nothing (and warns)
# when the request fails, so callers can skip the page instead of parsing
# an error document.
sub _fetch_stream {
    my ($url) = @_;

    sleep $pause_seconds if $pause_seconds > 0;

    my $response = $browser->get("$url");
    if (!$response->is_success) {
        warn "Couldn't fetch $url: " . $response->status_line . "\n";
        return;
    }

    my $content = $response->content;
    my $stream  = HTML::TokeParser->new(\$content)
        or die "Couldn't parse HTML from $url";
    return $stream;
}

# GetSearchPages()
#
# For every term in @keys, writes the URL of the first search result page and
# of every following page (as reported by CheckForNext) to urls.txt, one URL
# per line.
sub GetSearchPages {
    open my $out, '>', 'urls.txt' or die "Can't write urls.txt: $!";

    foreach my $key (@keys) {
        # query_form escapes the term, which may contain quotes and spaces.
        my $uri = URI->new($search_base . 'Search');
        $uri->query_form(
            searchTerm => $key,
            skin       => 'BusinessWeek',
            x          => 9,
            y          => 5,
        );

        my $url = $uri->as_string;
        my %seen;    # guards against a "Next" link that loops back

        while ($url && !$seen{$url}++) {
            print {$out} "$url\n";
            $url = CheckForNext($url);
        }
    }

    close $out or die "Can't close urls.txt: $!";
    return;
}

# CheckForNext($url)
#
# Fetches the search result page at $url and looks, after the text "page:",
# for a start tag that is immediately followed by a token containing "Next".
# The link target is taken from between the first pair of single quotes in
# that tag. Returns the absolute, canonical URL of the next page as a string,
# or "" when there is no next page (or the page could not be fetched).
sub CheckForNext {
    my ($page_url) = @_;

    my $stream = _fetch_stream($page_url) or return "";

    my $in_pager = 0;
    while (my $token = $stream->get_token) {
        if ($token->[0] eq 'T' && $token->[1] =~ /page:/) {
            $in_pager = 1;
        }
        next unless $in_pager && $token->[0] eq 'S';

        # $token->[4] is the raw text of the start tag.
        my $tag_text = $token->[4];

        # Look at the token right after the tag; stop at end of document.
        my $label = $stream->get_token or last;
        next unless defined $label->[1] && $label->[1] =~ /Next/;

        my (undef, $url) = split /'/, $tag_text;
        next unless defined $url && length $url;

        # The raw tag text still carries HTML entities.
        $url =~ s/&amp;/&/g;
        return URI->new_abs($url, $search_base)->canonical->as_string;
    }

    return "";
}

# GetArticleURLs($url_file)
#
# Reads search result page URLs from $url_file (one per line), fetches each
# page and writes every link found between the texts "BUSINESSWEEK RESULTS"
# and "Result page" to articles.txt, one per line. Links back to the
# advanced search form are skipped.
sub GetArticleURLs {
    my ($url_file) = @_;

    open my $in,  '<', $url_file      or die "Can't read $url_file: $!";
    open my $out, '>', 'articles.txt' or die "Can't write articles.txt: $!";

    while (my $page_url = <$in>) {
        chomp $page_url;
        next unless length $page_url;

        my $stream = _fetch_stream($page_url) or next;

        my $in_results = 0;
        while (my $token = $stream->get_token) {
            if ($token->[0] eq 'T') {
                $in_results = 1 if $token->[1] =~ /BUSINESSWEEK RESULTS/;
                $in_results = 0 if $token->[1] =~ /Result page/;
            }
            next unless $in_results;
            next unless $token->[0] eq 'S' && $token->[4] =~ /href/;

            # Link target sits between the first pair of single quotes.
            my (undef, $url) = split /'/, $token->[4];
            next unless $url;
            next if $url =~ /AdvancedSearch\?searchTerm/;

            print {$out} "$url\n";
        }
    }

    close $out or die "Can't close articles.txt: $!";
    close $in;
    return;
}

# GetDeepArticles($article_file)
#
# Reads article URLs from $article_file (one per line) and writes each of
# them to articles_deep.txt. Each article is fetched and, whenever the text
# "Continued on" appears, the next href that follows is resolved against the
# article base, written to the output file and fetched in turn, so that
# multi-page articles are followed to their last page.
sub GetDeepArticles {
    my ($article_file) = @_;

    open my $in, '<', $article_file
        or die "Can't read $article_file: $!";
    open my $out, '>', 'articles_deep.txt'
        or die "Can't write articles_deep.txt: $!";

    while (my $article_url = <$in>) {
        chomp $article_url;
        next unless length $article_url;
        print {$out} "$article_url\n";

        my $stream = _fetch_stream($article_url) or next;
        my %seen = ($article_url => 1);    # stops cyclic "Continued on" chains

        TOKEN: while (my $token = $stream->get_token) {
            next TOKEN
                unless $token->[0] eq 'T' && $token->[1] =~ /Continued on/;

            # Advance to the next start tag that carries an href. Only start
            # tags have raw text at index 4; give up at end of document
            # instead of spinning forever.
            my $link;
            while (my $candidate = $stream->get_token) {
                if ($candidate->[0] eq 'S' && $candidate->[4] =~ /href/) {
                    $link = $candidate;
                    last;
                }
            }
            last TOKEN unless $link;

            # Link target sits between the first pair of double quotes.
            my (undef, $href) = split /"/, $link->[4];
            next TOKEN unless defined $href && length $href;

            my $url = URI->new_abs($href, $article_base)->canonical->as_string;
            next TOKEN if $seen{$url}++;
            print {$out} "$url\n";

            # Continue scanning on the continuation page; if it cannot be
            # fetched, keep reading the current page.
            my $next_stream = _fetch_stream($url) or next TOKEN;
            $stream = $next_stream;
        }
    }

    close $out or die "Can't close articles_deep.txt: $!";
    close $in;
    return;
}

__END__

-----Original Message-----
From: Wang, Anita [mailto:[EMAIL PROTECTED]
Sent: Fri 3/18/2005 7:41 PM
To: libwww@perl.org
Subject: Is LWP the right module to use?

Greetings! I'm writing a program which requires me to set some cookies, send some requests, and then process the query log generated. I wonder if LWP is the right module to accomplish the first two tasks (i.e. set cookies and send requests), since I've never used this module before. I'd appreciate your help!

Anita Wang
Personalization QA
Email: [EMAIL PROTECTED]
Phone: 206-266-3366
Office: 5202.09