Hello there,
I wrote a script to scrape BusinessWeek's search results. It
worked fine, but now I am trying to authenticate my user agent with BusinessWeek
first, before I run my search, so that the search results don't point at
registration pages and I can fetch the result pages and parse them. I realize my
code is crude, but that's because I did not understand the better Perl HTML
parsing modules.
The first script below is the one that works.
The second is my broken attempt to add authentication.
Any help would be much appreciated.
use LWP::Simple;
use HTML::SimpleParse;
use Win32API::File 0.08 qw( :ALL );
use LWP::UserAgent;
use Win32::OLE;
use Win32::SAM;
use Win32::Slingshot;
# Turn on autoflush for the currently selected output handle so progress
# messages show up as soon as they are printed.
$| = 1;

# Search terms; the main loop processes one entry per iteration.
# NOTE(review): the '+' presumably stands for a space in the query string --
# confirm against the search URL.
my @words = qw(Different key+words);

# Index of the term currently being processed; starts at -1 because the main
# loop increments it before first use.
my $ref = -1;
# For each search term: fetch the results page, save the parsed tokens that
# lie between the "Results " and "Result page" markers to output[N].txt, then
# pull the link URL, the bold title and the date out of that file into
# goodies[N].txt (N is the term's index, kept in $ref).
#
# NOTE(review): this listing had lost every '"', '<', '>' and '&' character
# (so '->' appeared as '-' and <INFILE> as INFILE) and had hard line wraps in
# the middle of strings and patterns. They are restored below wherever the
# intent is unambiguous. The /Results / and /Result page/ markers and the
# leading '-' of the date pattern are kept exactly as found; they may also
# have lost angle brackets -- confirm against a live results page.
foreach my $term (@words) {
    $ref++;

    # NOTE(review): the mail archive replaced part of this URL with the text
    # "[EMAIL PROTECTED]" (presumably the search path plus the interpolated
    # search term). That part cannot be recovered from this file and must be
    # filled in by hand before the script can fetch anything. The '&'
    # separators are restored on the assumption that they were stripped too.
    my $search_url =
        "http://search.businessweek.com/[EMAIL PROTECTED]&skin=BusinessWeek&x=9&y=5";

    # Scalar element assignment; the original "@index[$ref] = ..." was a
    # one-element slice, which only worked by accident.
    $index[$ref] = get($search_url);

    # LWP::Simple::get returns undef on failure. Keep going with an empty
    # page so the per-term files are still created for the merge step.
    my $html = $index[$ref];
    if (!defined $html) {
        warn "Could not fetch results for $term from $search_url\n";
        $html = '';
    }

    my $parser = HTML::SimpleParse->new($html);

    # Pass 1: copy the tokens between the two markers to a scratch file.
    my $out_name = sprintf 'output[%d].txt', $ref;
    open my $out, '>', $out_name or die "Can't open $out_name: $!";

    my $saving = 0;    # true while we are between the two markers
    my $saved  = 0;    # number of tokens written to the scratch file
    foreach my $node ($parser->tree) {
        # Render the token once instead of three times per iteration.
        my $text = $parser->execute($node);

        $saving = 1 if $text =~ /Results /;
        next unless $saving;

        $saved++;
        print {$out} $text;

        # The closing marker itself is written before saving stops.
        $saving = 0 if $text =~ /Result page/;
    }
    close $out or die "Can't close $out_name: $!";
    print "There were $saved lines saved for parsing for $term\n";

    # Pass 2: read the scratch file line by line and keep URL, title, date.
    my $goodies_name = sprintf 'goodies[%d].txt', $ref;
    open my $in,      '<', $out_name     or die "Can't open $out_name: $!";
    open my $goodies, '>', $goodies_name or die "Can't open $goodies_name: $!";

    while (my $line = <$in>) {
        if ($line =~ /<a href/) {
            # URL is the single-quoted attribute value; title is the text
            # inside <b>...</b>. Skip the line if the pattern does not match
            # rather than printing undefined values.
            if (my ($url, $between_the_bold)
                = $line =~ /.*'(.*)'.*<b>(.*)<\/b>/)
            {
                print {$goodies} "$url\n";
                print {$goodies} "$between_the_bold\n";
            }
        }
        elsif ($line =~ /\d{2}/) {
            # Month name, two characters, then anything up to a 4-digit year.
            if (my ($date)
                = $line =~ /-.*((?:January|February|September|November|December|March|April|May|June|July|August|October).{2}.*\d{4}).*/)
            {
                print {$goodies} "$date\n\n";
            }
        }
    }

    close $in;
    close $goodies or die "Can't close $goodies_name: $!";
}
# Concatenate goodies[0].txt .. goodies[$ref].txt into total.txt, keeping only
# lines that contain at least one word character, then delete the per-term
# scratch files (goodies[N].txt and output[N].txt).
#
# NOTE(review): the listing had lost its quotes, the '<' in the loop
# condition and the <INFILE> readline operator; they are restored here.
# total.txt is assumed to have been opened for writing ('>') -- the mode
# character was stripped along with the quotes.
open my $total, '>', 'total.txt' or die "Can't open total.txt: $!";

# Same index range as the original "$var = -1; while ($var < $ref) { $var++ }".
for my $var (0 .. $ref) {
    my $goodies_name = sprintf 'goodies[%d].txt', $var;
    my $output_name  = sprintf 'output[%d].txt',  $var;

    open my $in, '<', $goodies_name or die "Can't open $goodies_name: $!";
    while (my $line = <$in>) {
        # Drop blank separator lines.
        print {$total} $line if $line =~ /\w/;
    }
    close $in;

    # Core unlink replaces Win32API::File::DeleteFile so this step also works
    # outside Windows; a failed delete is reported but not fatal.
    for my $scratch ($goodies_name, $output_name) {
        unlink $scratch or warn "Can't delete $scratch: $!\n";
    }
}

close $total or die "Can't close total.txt: $!";
And here is the second script, with authentication:
use LWP::Simple;
use HTML::SimpleParse;
use Win32API::File 0.08 qw( :ALL );
use LWP::UserAgent;
use Win32::OLE;
use Win32::SAM;
use Win32::Slingshot;
# Turn on autoflush for the currently selected output handle so progress
# messages show up as soon as they are printed.
$| = 1;

# Search terms; the main loop processes one entry per iteration.
my @words = ('Different',
             'key+words');

#AUTHENTICATE
# Create a user agent and register a username/password for the secure host.
#
# NOTE(review): in the listing '->' had been reduced to '-' and '=>' to '=';
# both are restored here.
#
# NOTE(review): LWP::UserAgent::credentials only answers HTTP authentication
# challenges (e.g. Basic auth) from a server whose "host:port" and realm
# match the values given here. If the site logs users in through an HTML form
# and a session cookie -- which would explain results still pointing at
# register pages -- this call has no effect, and the login form has to be
# POSTed with a cookie_jar enabled instead. Confirm how the site
# authenticates.
#
# NOTE(review): username and password are hard-coded and were posted
# publicly; change the password and load both from the environment or a
# config file.
my $browser = LWP::UserAgent->new;
$browser->credentials(
    # The netloc must be "host:port"; without a port it never matches.
    # 443 is assumed from the "www-secure" host name -- TODO confirm.
    'www-secure.businessweek.com:443',
    # The realm must equal the realm string the server sends in its
    # challenge; an empty string only matches an empty realm -- TODO confirm.
    '',
    'andrewljohnson' => 'hermit85',
);

# Index of the term currently being processed; starts at -1 because the main
# loop increments it before first use.
my $ref = -1;
foreach (@words){
$ref++;
@index[$ref]=$browser-get
(http://search.businessweek.com/[EMAIL PROTECTED]skin=Busines
sWeekx=9y=5);
$p = new HTML::SimpleParse( $index[$ref] );
open(OUTFILE, output[$ref].txt) or die Can't open
output.txt: $!;
$flag = 0;
$test=0;
foreach ($p-tree) {
if ($p-execute($_) =~ /Results /)
{
$flag=1;
}
if ($flag==1)
{
$test++;
print OUTFILE $p-execute($_);
if ($p-execute($_) =~ /Result page/)
{
$flag = 0;}
}
}
print There were $test lines saved for parsing for @words[$ref]
\n;
close OUTFILE;
open(INFILE, output[$ref].txt) or die Can't open output.txt:
$!;
open(OUTFILE, goodies[$ref].txt) or die Can't open
goodies.txt: $!;
while (INFILE)
{
if ($_ =~ /a href/ )
{
($url,$BetweenTheBold) = $_ =~ /.*'(.*)'.*b(.*)\/b/ ;
print OUTFILE $url\n;
print OUTFILE $BetweenTheBold\n;
}
elsif ($_ =~ /\d{2}/ )
{($date) = $_ =~