RE: Is LWP the right module to use?

Andrew Johnson Sat, 19 Mar 2005 09:11:48 -0800


Here's some code for you. It doesn't do any form input, and you might
consider making it more friendly to the webserver with some sleep lines,
depending on who you're scraping.


use strict;
#use warnings;
use LWP::UserAgent;
use HTML::TokeParser;
use HTTP::Cookies::Netscape;
use URI;

############################################

# Search terms; each one is interpolated into the search URL built by
# GetSearchPages().
my @keys = ('Bush',
            '"John Kerry"');

# THIS WORKS IF YOU COLLECT PERSISTENT COOKIES IN A MOZILLA/NETSCAPE WINDOWS
# BROWSING SESSION
#
# Shared user agent used by every fetch in this script. Its cookie jar is
# loaded from a Netscape-format cookies.txt file.
my $browser = LWP::UserAgent->new();
my $cookie_jar = HTTP::Cookies::Netscape->new(file => "C:/PATH/cookies.txt");
$browser->cookie_jar($cookie_jar);

###########################################


# Pipeline driver. The stages run in order and hand data to each other
# through files: GetSearchPages() writes urls.txt, GetArticleURLs() reads the
# file it is given and writes articles.txt, and GetDeepArticles() reads the
# file it is given and writes articles_deep.txt.
{
        my $search_page_list = 'urls.txt';
        my $article_list     = 'articles.txt';

        GetSearchPages();
        GetArticleURLs($search_page_list);
        GetDeepArticles($article_list);
}



###########################################

# GetSearchPages()
#
# For every term in @keys, writes the URL of the first search-results page to
# urls.txt, then repeatedly asks CheckForNext() for the following results page
# and writes each one too (one URL per line) until CheckForNext() returns a
# false value.
#
# Takes no arguments. Dies if urls.txt cannot be opened or closed.
#
# NOTE(review): $key is interpolated into the query string unescaped; a term
# containing '&' or '#' would alter the query - confirm the terms stay simple.
sub GetSearchPages
{
        open(my $out, '>', 'urls.txt')
            or die "Cannot open urls.txt for writing: $!";

        foreach my $key (@keys)
        {
                # Built from two concatenated pieces so the literal cannot
                # pick up a stray line break inside the URL.
                my $url = "http://search.businessweek.com/Search"
                        . "?searchTerm=$key&skin=BusinessWeek&x=9&y=5";
                print {$out} "$url\n";

                # Follow the "Next" links page by page.
                while ($url = CheckForNext($url))
                {
                        print {$out} "$url\n";
                }
        }

        # Buffered write errors only surface at close time.
        close($out) or die "Cannot close urls.txt: $!";
}

# CheckForNext($page_url)
#
# Fetches $page_url with the shared $browser and scans the HTML for the link
# to the next results page. Scanning only starts after a text token matching
# /page:/ has been seen; from then on each start tag is remembered and, if
# the token that follows it matches /Next/, the first single-quoted value in
# the remembered tag is taken as the link target.
#
# Returns the target as an absolute, canonical URI object (resolved against
# http://search.businessweek.com/), or "" when the fetch fails or no usable
# "Next" link is found.
sub CheckForNext
{
        my ($page_url) = @_;

        my $response = $browser->get("$page_url");
        unless ($response->is_success)
        {
                # A failed fetch means there is nothing to page through.
                warn "GET $page_url failed: " . $response->status_line . "\n";
                return "";
        }

        my $content = $response->content;
        my $stream = HTML::TokeParser->new(\$content)
            or die "Coulnd't read HTML $content BLAH BLAH LAH";
        my $in_pager = 0;

        while (my $token = $stream->get_token)
        {
                # Text token containing "page:" switches scanning on.
                if ($token->[0] eq 'T' && $token->[1] =~ /page:/)
                {
                        $in_pager = 1;
                }

                next unless $in_pager && $token->[0] eq 'S';

                # Index 4 of a start-tag token is the tag's original text.
                my $tag_text = $token->[4];

                # Look at the token right after the start tag.
                $token = $stream->get_token;
                last unless defined $token;          # document ended
                next unless $token->[1] =~ /Next/;

                # The link target is the first single-quoted value in the tag.
                my (undef, $href) = split(/'/, $tag_text);

                # Without a quoted value there is nothing to resolve;
                # URI->new_abs would just hand back the base URL.
                next unless defined $href && length $href;

                $href =~ s/&amp;/&/g;
                return URI->new_abs($href, 'http://search.businessweek.com/')
                          ->canonical;
        }

        return "";
}


# GetArticleURLs($url_file)
#
# Reads search-results page URLs from $url_file (one per line), fetches each
# page with the shared $browser and writes the link targets found in the
# results section to articles.txt, one per line.
#
# The results section starts at a text token matching /BUSINESSWEEK RESULTS/
# and ends at a text token matching /Result page/. Inside it, every start tag
# whose text contains "href" contributes its first single-quoted value, unless
# that value matches /AdvancedSearch\?searchTerm/.
#
# Dies if either file cannot be opened or closed. Pages that fail to download
# are reported with warn() and skipped.
sub GetArticleURLs
{
        my ($url_file) = @_;

        open(my $urls, '<', $url_file)
            or die "Cannot open $url_file for reading: $!";
        open(my $out, '>', 'articles.txt')
            or die "Cannot open articles.txt for writing: $!";

        while (my $page_url = <$urls>)
        {
                chomp $page_url;
                next unless length $page_url;        # ignore blank lines

                my $response = $browser->get($page_url);
                unless ($response->is_success)
                {
                        warn "GET $page_url failed: "
                           . $response->status_line . "\n";
                        next;
                }

                my $content = $response->content;
                my $stream = HTML::TokeParser->new(\$content)
                    or die "Coulnd't read HTML $content BLAH BLAH LAH";
                my $in_results = 0;

                while (my $token = $stream->get_token)
                {
                        if ($token->[0] eq 'T')
                        {
                                # The end marker is tested last, so it wins
                                # when one text token holds both markers.
                                $in_results = 1
                                    if $token->[1] =~ /BUSINESSWEEK RESULTS/;
                                $in_results = 0
                                    if $token->[1] =~ /Result page/;
                        }

                        next unless $in_results && $token->[0] eq 'S';

                        # Index 4 of a start-tag token is the tag's text.
                        next unless $token->[4] =~ /href/;

                        my (undef, $url) = split(/'/, $token->[4]);
                        next unless $url;            # no quoted value found
                        next if $url =~ /AdvancedSearch\?searchTerm/;

                        print {$out} "$url\n";
                }
        }

        close($out)  or die "Cannot close articles.txt: $!";
        close($urls) or die "Cannot close $url_file: $!";
}

# GetDeepArticles($article_file)
#
# Reads article URLs from $article_file (one per line) and writes each one to
# articles_deep.txt. Each article is fetched with the shared $browser; when a
# text token matching /Continued on/ is found, the first double-quoted value
# of the next tag containing "href" is resolved against
# http://www.businessweek.com/, written to articles_deep.txt, fetched, and
# then scanned in turn, so a chain of continuation pages is followed.
#
# Dies if either file cannot be opened or closed. Failed downloads are
# reported with warn() and end the chain for that article.
sub GetDeepArticles
{
        my ($article_file) = @_;

        open(my $articles, '<', $article_file)
            or die "Cannot open $article_file for reading: $!";
        open(my $out, '>', 'articles_deep.txt')
            or die "Cannot open articles_deep.txt for writing: $!";

        ARTICLE:
        while (my $article_url = <$articles>)
        {
                chomp $article_url;
                next ARTICLE unless length $article_url;   # blank line
                print {$out} "$article_url\n";

                my $response = $browser->get($article_url);
                unless ($response->is_success)
                {
                        warn "GET $article_url failed: "
                           . $response->status_line . "\n";
                        next ARTICLE;
                }

                # Pages already visited for this article, so continuation
                # links that point back at each other cannot loop forever.
                my %seen = (URI->new($article_url)->canonical => 1);

                my $content = $response->content;
                my $stream = HTML::TokeParser->new(\$content)
                    or die "Coulnd't read HTML $content BLAH BLAH LAH";

                TOKEN:
                while (my $token = $stream->get_token)
                {
                        next TOKEN
                            unless $token->[0] eq 'T'
                                && $token->[1] =~ /Continued on/;

                        # Advance to the next tag whose text contains "href".
                        # Stop at end of document rather than spinning on the
                        # undef that get_token keeps returning there.
                        my $link_text;
                        while (defined(my $candidate = $stream->get_token))
                        {
                                next unless defined $candidate->[4]
                                         && $candidate->[4] =~ /href/;
                                $link_text = $candidate->[4];
                                last;
                        }
                        last TOKEN unless defined $link_text;

                        # The link target is the first double-quoted value.
                        my (undef, $href) = split(/\"/, $link_text);
                        next TOKEN unless defined $href && length $href;

                        my $url = URI->new_abs($href,
                                               'http://www.businessweek.com/')
                                     ->canonical;
                        next TOKEN if $seen{$url}++;

                        print {$out} "$url\n";

                        $response = $browser->get($url);
                        unless ($response->is_success)
                        {
                                warn "GET $url failed: "
                                   . $response->status_line . "\n";
                                last TOKEN;
                        }

                        # Carry on scanning in the continuation page.
                        $content = $response->content;
                        $stream = HTML::TokeParser->new(\$content)
                            or die "Coulnd't read HTML $content BLAH BLAH LAH";
                }
        }

        close($out)      or die "Cannot close articles_deep.txt: $!";
        close($articles) or die "Cannot close $article_file: $!";
}





-----Original Message-----
From: Wang, Anita [mailto:[EMAIL PROTECTED]
Sent: Fri 3/18/2005 7:41 PM
To: libwww@perl.org
Subject: Is LWP the right module to use?
 
Greetings!

I'm writing a program which requires me to set some cookies, send some
requests, and then process the query log generated. I wonder if LWP is the
right module to accomplish the first two tasks (i.e. set cookies and send
requests), since I've never used this module before.

I'd appreciate your help!

Anita Wang
Personalization QA
Email: [EMAIL PROTECTED]
Phone: 206-266-3366
Office: 5202.09

RE: Is LWP the right module to use?

Reply via email to