Hello there,

I wrote a script to scrape BusinessWeek's search results. It worked fine,
but now I am trying to authenticate my user agent with BusinessWeek first,
before I run my search, so that the search results don't point at
registration pages and I can access the results and parse them. I realize my
code is crude, but that's because I did not understand the better Perl HTML
parsing modules.

 

The first script is my script that works. 

 

The second is my mangled attempt to authenticate. 

 

Any help would be much appreciated.

 

use LWP::Simple;

use HTML::SimpleParse;

use Win32API::File 0.08 qw( :ALL );

use LWP::UserAgent;

use Win32::OLE;

use Win32::SAM;

use Win32::Slingshot;

 

use strict;
use warnings;

# Flush STDOUT after every print so progress messages show up immediately.
$| = 1;

# Search phrases; one result page is fetched and scraped per entry.
my @words = (
    'Different',
    '"key+words"',
);

# Matches a "- ... <Month> <day> ... <year>" fragment and captures everything
# from the month name through the last four-digit number on the line.
my $DATE_RE = qr{
    - .*
    (
        (?: January | February | September | November | December | March
          | April   | May      | June      | July     | August   | October )
        .{2} .* \d{4}
    )
    .*
}x;

# capture_results_region($html)
#
# Walks the HTML::SimpleParse node list of $html and concatenates the text of
# every node from the first one containing "Results " through the first
# following one containing "Result page" (capture can restart if "Results "
# appears again later).
#
# Returns ($node_count, $captured_text).
sub capture_results_region {
    my ($html) = @_;

    my $parser     = HTML::SimpleParse->new($html);
    my $capturing  = 0;
    my $node_count = 0;
    my $captured   = '';

    for my $node ($parser->tree) {
        # Render each node once and reuse the text for every check below.
        my $text = $parser->execute($node);
        $text = '' unless defined $text;

        $capturing = 1 if $text =~ /Results /;
        next unless $capturing;

        $node_count++;
        $captured .= $text;
        $capturing = 0 if $text =~ /Result page/;
    }

    return ($node_count, $captured);
}

# extract_fields($captured_text)
#
# Scans $captured_text line by line.  A line containing "<a href" yields the
# single-quoted URL and the text between <b> and </b>; any other line holding
# two consecutive digits is tried against $DATE_RE and yields the date.
#
# Returns the extracted values in document order, omitting any value that is
# undefined (pattern did not match) or contains no word character.
sub extract_fields {
    my ($captured) = @_;

    my @fields;
    for my $line (split /^/, $captured) {
        if ($line =~ /<a href/) {
            my ($url, $between_the_bold)
                = $line =~ /.*'(.*)'.*<b>(.*)<\/b>/;
            push @fields, $url, $between_the_bold;
        }
        elsif ($line =~ /\d{2}/) {
            my ($date) = $line =~ $DATE_RE;
            push @fields, $date;
        }
    }

    return grep { defined $_ && /\w/ } @fields;
}

# Fetch and scrape one result page per search phrase.  Everything is kept in
# memory, so no per-phrase scratch files need to be written and cleaned up.
my @all_fields;

for my $word (@words) {
    # NOTE(review): the part of this URL reading "[EMAIL PROTECTED]" looks
    # like a mail-archive redaction of the real path and keyword parameter
    # (presumably built from $word) -- restore it before running.
    my $url
        = "http://search.businessweek.com/[EMAIL PROTECTED]&skin=BusinessWeek&x=9&y=5";

    # LWP::Simple::get returns undef when the request fails.
    my $html = get($url);
    unless (defined $html) {
        warn "Could not fetch search results for $word\n";
        next;
    }

    my ($node_count, $captured) = capture_results_region($html);
    print "There were $node_count lines saved for parsing for $word\n";

    push @all_fields, extract_fields($captured);
}

# Write every extracted value, one per line, to the combined report.
open my $total_fh, '>', 'total.txt' or die "Can't open total.txt: $!";
print {$total_fh} "$_\n" for @all_fields;
close $total_fh or die "Can't close total.txt: $!";

 

AND WITH AUTHENTICATION

 

use LWP::Simple;

use HTML::SimpleParse;

use Win32API::File 0.08 qw( :ALL );

use LWP::UserAgent;

use Win32::OLE;

use Win32::SAM;

use Win32::Slingshot;

 

use strict;
use warnings;

# Flush STDOUT after every print so progress messages show up immediately.
$| = 1;

# Search phrases; one result page is fetched and scraped per entry.
my @words = (
    'Different',
    '"key+words"',
);

#AUTHENTICATE

# NOTE(review): credentials() only answers HTTP Basic/Digest challenges.  Its
# documentation describes the first argument as "host:port" and the second as
# the realm announced by the server, so a bare host name with an empty realm
# may never match.  These credentials are also registered for a different
# host than the one searched below.  If the site signs users in through an
# HTML form and cookies (not visible from here -- confirm), enable a
# cookie_jar on the agent and POST the login form instead.
# NOTE(review): the account name and password are hard-coded; consider
# reading them from the environment or a private config file.
my $browser = LWP::UserAgent->new;
$browser->credentials(
    'www-secure.businessweek.com',
    '',
    'andrewljohnson' => 'hermit85'
);

# Matches a "- ... <Month> <day> ... <year>" fragment and captures everything
# from the month name through the last four-digit number on the line.
my $DATE_RE = qr{
    - .*
    (
        (?: January | February | September | November | December | March
          | April   | May      | June      | July     | August   | October )
        .{2} .* \d{4}
    )
    .*
}x;

# capture_results_region($html)
#
# Walks the HTML::SimpleParse node list of $html and concatenates the text of
# every node from the first one containing "Results " through the first
# following one containing "Result page" (capture can restart if "Results "
# appears again later).
#
# Returns ($node_count, $captured_text).
sub capture_results_region {
    my ($html) = @_;

    my $parser     = HTML::SimpleParse->new($html);
    my $capturing  = 0;
    my $node_count = 0;
    my $captured   = '';

    for my $node ($parser->tree) {
        # Render each node once and reuse the text for every check below.
        my $text = $parser->execute($node);
        $text = '' unless defined $text;

        $capturing = 1 if $text =~ /Results /;
        next unless $capturing;

        $node_count++;
        $captured .= $text;
        $capturing = 0 if $text =~ /Result page/;
    }

    return ($node_count, $captured);
}

# extract_fields($captured_text)
#
# Scans $captured_text line by line.  A line containing "<a href" yields the
# single-quoted URL and the text between <b> and </b>; any other line holding
# two consecutive digits is tried against $DATE_RE and yields the date.
#
# Returns the extracted values in document order, omitting any value that is
# undefined (pattern did not match) or contains no word character.
sub extract_fields {
    my ($captured) = @_;

    my @fields;
    for my $line (split /^/, $captured) {
        if ($line =~ /<a href/) {
            my ($url, $between_the_bold)
                = $line =~ /.*'(.*)'.*<b>(.*)<\/b>/;
            push @fields, $url, $between_the_bold;
        }
        elsif ($line =~ /\d{2}/) {
            my ($date) = $line =~ $DATE_RE;
            push @fields, $date;
        }
    }

    return grep { defined $_ && /\w/ } @fields;
}

# Fetch and scrape one result page per search phrase.  Everything is kept in
# memory, so no per-phrase scratch files need to be written and cleaned up.
my @all_fields;

for my $word (@words) {
    # NOTE(review): the part of this URL reading "[EMAIL PROTECTED]" looks
    # like a mail-archive redaction of the real path and keyword parameter
    # (presumably built from $word) -- restore it before running.
    my $url
        = "http://search.businessweek.com/[EMAIL PROTECTED]&skin=BusinessWeek&x=9&y=5";

    # LWP::UserAgent::get returns an HTTP::Response object, not the page
    # text, so the body has to be pulled out of the response before parsing.
    my $response = $browser->get($url);
    unless ($response->is_success) {
        warn "Could not fetch search results for $word: ",
            $response->status_line, "\n";
        next;
    }

    my $html = $response->decoded_content;
    unless (defined $html) {
        warn "Could not decode search results for $word\n";
        next;
    }

    my ($node_count, $captured) = capture_results_region($html);
    print "There were $node_count lines saved for parsing for $word\n";

    push @all_fields, extract_fields($captured);
}

# Write every extracted value, one per line, to the combined report.
open my $total_fh, '>', 'total.txt' or die "Can't open total.txt: $!";
print {$total_fh} "$_\n" for @all_fields;
close $total_fh or die "Can't close total.txt: $!";

 

 

 

(    Andrew Johnson                                     ) 

  )  Marketing Writer                                   (

 (   Elias/Savion Advertising                          )
 (   Phone: 412.642.7700 Fax 412.642.2277   )
  )  www.elias-savion.com                            (

(    [EMAIL PROTECTED]            )
           

 

Reply via email to