Hello there. I wrote a script to scrape BusinessWeek's search results, and it worked fine. Now I am trying to authenticate my user agent to BusinessWeek before I run the search, so that the search results no longer point at registration pages and I can fetch and parse the articles themselves. I realize my code is crude; that is because I did not understand the better Perl HTML parsing modules.
The first script is my script that works. The second is my mangled attempt to authenticate. Any help would be much appreciated.

use strict;
use warnings;

use LWP::Simple;
use HTML::SimpleParse;
use Win32API::File 0.08 qw( :ALL );
use LWP::UserAgent;
use Win32::OLE;
use Win32::SAM;
use Win32::Slingshot;

# Unbuffered STDOUT so the progress lines appear as each search finishes.
$| = 1;

# One search is run per entry; the entry's index names its scratch files.
my @words = ('Different', '"key+words"');

for my $ref (0 .. $#words) {
    my $raw_file  = sprintf 'output[%d].txt',  $ref;
    my $good_file = sprintf 'goodies[%d].txt', $ref;

    # NOTE(review): the list archive replaced part of this URL with
    # "[EMAIL PROTECTED]"; the original presumably interpolated the search
    # term there -- TODO restore the query string before running.
    # "Busines sWeek" was rejoined on the assumption that the space came
    # from e-mail line wrapping.
    my $page = get("http://search.businessweek.com/[EMAIL PROTECTED]&skin=BusinessWeek&x=9&y=5");
    if (!defined $page) {
        # LWP::Simple::get returns undef on failure; carry on with an empty
        # page so the scratch files still exist for the merge step below.
        warn "Fetch failed for $words[$ref]\n";
        $page = '';
    }

    my $p = HTML::SimpleParse->new($page);

    # Pass 1: copy every parsed chunk from the first one containing
    # "Results " through the one containing "Result page" into the raw file.
    open my $raw_out, '>', $raw_file or die "Can't open $raw_file: $!";
    my $in_results = 0;
    my $saved      = 0;
    for my $node ($p->tree) {
        my $chunk = $p->execute($node);
        $chunk = '' unless defined $chunk;

        $in_results = 1 if $chunk =~ /Results /;
        next unless $in_results;

        $saved++;
        print {$raw_out} $chunk;
        $in_results = 0 if $chunk =~ /Result page/;
    }
    close $raw_out or die "Can't close $raw_file: $!";
    print "There were $saved lines saved for parsing for $words[$ref] \n";

    # Pass 2: pull URL, bold title and date out of the raw lines.
    # Lines that fail to match are skipped; the original wrote blank lines
    # for them, which the merge step's \w filter discarded anyway.
    open my $raw_in,   '<', $raw_file  or die "Can't open $raw_file: $!";
    open my $good_out, '>', $good_file or die "Can't open $good_file: $!";
    while (my $line = <$raw_in>) {
        if ($line =~ /<a href/) {
            my ($url, $title) = $line =~ /.*'(.*)'.*<b>(.*)<\/b>/;
            next unless defined $url;
            print {$good_out} "$url\n";
            print {$good_out} "$title\n";
        }
        elsif ($line =~ /\d{2}/) {
            # "July |Augu st" in the post is assumed to be line-wrap damage
            # and is restored to "July|August".
            my ($date) = $line =~ /-.*((?:January|February|September|November|December|March|April|May|June|July|August|October).{2}.*\d{4}).*/;
            next unless defined $date;
            print {$good_out} "$date\n\n";
        }
    }
    close $raw_in;
    close $good_out or die "Can't close $good_file: $!";
}

# Merge the non-blank lines of every per-word result file into total.txt,
# then remove the scratch files.
open my $total, '>', 'total.txt' or die "Can't open total.txt: $!";
for my $var (0 .. $#words) {
    my $raw_file  = sprintf 'output[%d].txt',  $var;
    my $good_file = sprintf 'goodies[%d].txt', $var;

    open my $good_in, '<', $good_file or die "Can't open $good_file: $!";
    while (my $line = <$good_in>) {
        print {$total} $line if $line =~ /\w/;
    }
    close $good_in;

    DeleteFile($good_file) or warn "Can't delete $good_file: $^E\n";
    DeleteFile($raw_file)  or warn "Can't delete $raw_file: $^E\n";
}
close $total or die "Can't close total.txt: $!";

AND WITH AUTHENTICATION

use LWP::Simple;
use HTML::SimpleParse;
use Win32API::File 0.08 qw( :ALL );
use LWP::UserAgent;
use Win32::OLE;
use Win32::SAM;
use Win32::Slingshot;

$| = 1;

my @words = ('Different', '"key+words"');

#AUTHENTICATE
# The user agent must be created on its own line; when it shares a line
# with the comment above, the statement is commented out.
my $browser = LWP::UserAgent->new;
# Pragmas are lexically scoped, so enabling them here checks everything below.
use strict;
use warnings;

# Credentials are read from the environment instead of being embedded in the
# script (and in a public mailing-list post).
my $bw_user = $ENV{BW_USER};
my $bw_pass = $ENV{BW_PASS};
defined $bw_user && defined $bw_pass
    or die "Set BW_USER and BW_PASS in the environment before running\n";

# LWP matches credentials on "host:port" plus the realm sent in the server's
# WWW-Authenticate header, so the port is required.
# NOTE(review): 443 is assumed from the "www-secure" host name, and the empty
# realm only matches a server that sends an empty realm -- TODO confirm both.
# These credentials apply only to requests that reach this host; the searches
# below go to search.businessweek.com.
$browser->credentials('www-secure.businessweek.com:443', '', $bw_user, $bw_pass);

# Keep cookies in memory between requests.
# NOTE(review): if the site uses a login form rather than HTTP authentication,
# the form must be POSTed once with this agent first -- TODO confirm.
$browser->cookie_jar({});

for my $ref (0 .. $#words) {
    my $raw_file  = sprintf 'output[%d].txt',  $ref;
    my $good_file = sprintf 'goodies[%d].txt', $ref;

    # NOTE(review): the list archive replaced part of this URL with
    # "[EMAIL PROTECTED]"; the original presumably interpolated the search
    # term there -- TODO restore the query string before running.
    # "Busines sWeek" was rejoined on the assumption that the space came
    # from e-mail line wrapping.
    my $response = $browser->get("http://search.businessweek.com/[EMAIL PROTECTED]&skin=BusinessWeek&x=9&y=5");

    # LWP::UserAgent::get returns an HTTP::Response object, not the page
    # text; the parser needs the body, and only when the request succeeded.
    my $page = '';
    if ($response->is_success) {
        $page = $response->decoded_content;
        $page = $response->content unless defined $page;
    }
    else {
        warn "Fetch failed for $words[$ref]: " . $response->status_line . "\n";
    }

    my $p = HTML::SimpleParse->new($page);

    # Pass 1: copy every parsed chunk from the first one containing
    # "Results " through the one containing "Result page" into the raw file.
    open my $raw_out, '>', $raw_file or die "Can't open $raw_file: $!";
    my $in_results = 0;
    my $saved      = 0;
    for my $node ($p->tree) {
        my $chunk = $p->execute($node);
        $chunk = '' unless defined $chunk;

        $in_results = 1 if $chunk =~ /Results /;
        next unless $in_results;

        $saved++;
        print {$raw_out} $chunk;
        $in_results = 0 if $chunk =~ /Result page/;
    }
    close $raw_out or die "Can't close $raw_file: $!";
    print "There were $saved lines saved for parsing for $words[$ref] \n";

    # Pass 2: pull URL, bold title and date out of the raw lines.
    # Lines that fail to match are skipped; the original wrote blank lines
    # for them, which the merge step's \w filter discarded anyway.
    open my $raw_in,   '<', $raw_file  or die "Can't open $raw_file: $!";
    open my $good_out, '>', $good_file or die "Can't open $good_file: $!";
    while (my $line = <$raw_in>) {
        if ($line =~ /<a href/) {
            my ($url, $title) = $line =~ /.*'(.*)'.*<b>(.*)<\/b>/;
            next unless defined $url;
            print {$good_out} "$url\n";
            print {$good_out} "$title\n";
        }
        elsif ($line =~ /\d{2}/) {
            # "July |Augu st" in the post is assumed to be line-wrap damage
            # and is restored to "July|August".
            my ($date) = $line =~ /-.*((?:January|February|September|November|December|March|April|May|June|July|August|October).{2}.*\d{4}).*/;
            next unless defined $date;
            print {$good_out} "$date\n\n";
        }
    }
    close $raw_in;
    close $good_out or die "Can't close $good_file: $!";
}

# Merge the non-blank lines of every per-word result file into total.txt,
# then remove the scratch files.
open my $total, '>', 'total.txt' or die "Can't open total.txt: $!";
for my $var (0 .. $#words) {
    my $raw_file  = sprintf 'output[%d].txt',  $var;
    my $good_file = sprintf 'goodies[%d].txt', $var;

    open my $good_in, '<', $good_file or die "Can't open $good_file: $!";
    while (my $line = <$good_in>) {
        print {$total} $line if $line =~ /\w/;
    }
    close $good_in;

    DeleteFile($good_file) or warn "Can't delete $good_file: $^E\n";
    DeleteFile($raw_file)  or warn "Can't delete $raw_file: $^E\n";
}
close $total or die "Can't close total.txt: $!";

( Andrew Johnson ) ) Marketing Writer ( ( Elias/Savion Advertising ) ( Phone: 412.642.7700 Fax 412.642.2277 ) ) www.elias-savion.com ( ( [EMAIL PROTECTED] )