Alternative to
GetUrl - PostUrl -
Looking in the
documentation of HTML::Request and code outlines some sample
code
my
$ua = LWP::UserAgent->new;
my $url = url
'http://www.altavista.digital.com/';
my $res = $ua->request(GET
$url);
my $tb = HTML::TreeBuilder->new;
$tb->parse($res->content);
my @forms =
@{$tb->extract_links(qw(FORM))};
my $f =
HTTP::Request::Form->new($forms[0][1], $url);
$f->field("q",
"perl");
my $response =
$ua->request($f->press("search"));
print $response->content
if ($response->is_success);
It seems like it's
possible to add another function in the AcquistionHandlers.pm that would closely
resemble the GetUrl function:
I am new
to perl so any help in this area would be appreciated. I have the source code
for GetUrl below renamed to GetForm. Any comments in <bold> are where I see the possible changes need
to be made. Can anyone add to or confirm this?
sub
GetForm($)
{
use
HTML::TreeBuilder;
use URI::URL;
use
LWP::UserAgent;
use HTTP::Request;
use
HTTP::Request::Common;
use
HTTP::Request::Form;
my $url = shift;
unless
(defined($url) && ($url ne ''))
{
dprint
"GetUrl couldn't get data. Proper URL not supplied.";
$errors{'acquisition'} .=
"GetUrl couldn't get
data. Proper URL not supplied.\n";
return undef;
}
dprint
"GetUrl is getting URL:\n $url";
# Try to
get the cached data if it's available and still valid. First try to
#
get the update times for the handler.
my @updateTimes =
@NewsClipper::Interpreter::update_times;
# If the
user specified "always", there's no need to check the time.
if
(lc($updateTimes[0]) eq 'always')
{
dprint
"\"Always\" specified. Skipping cache check"
}
# See if we
can just return the current cached data.
else
{
my ($cachedData,$cacheStatus)
=
$NewsClipper::Globals::cache->get($url,@updateTimes);
# If the data is still valid for the @updateTimes, we can return
it
# immediately (There's no need to bless this since it's
just a SCALAR)
return \$cachedData if $cacheStatus eq
'valid';
# Now see if the remote content hasn't changed. If it hasn't, we
can
# just return the cached data.
if ($cacheStatus eq 'stale')
{
my $last_modified =
_GetLastModifiedTime($url);
my $cached_time =
$NewsClipper::Globals::cache->GetTimeCached($url);
if (defined $last_modified && defined $cached_time
&&
$cached_time > $last_modified)
{
return
\$cachedData
}
}
}
# Otherwise
we'll have to fetch it.
$userAgent->timeout($config{socketTimeout});
$userAgent->proxy(['http', 'ftp'], $config{proxy})
if
$config{proxy} ne '';
# the folowing line could be
changed
# my $request = new
HTTP::Request GET => "$url";
# to something like the
following lines:
my $res = $ua->request(GET
$url);
my $tb = HTML::TreeBuilder->new;
$tb->parse($res->content);
my @forms =
@{$tb->extract_links(qw(FORM))};
# the numbers specific to the form could be passed in a
parameters.
my $f =
HTTP::Request::Form->new($forms[0][1], $url);
$f->field("q",
"perl");
# not sure about the syntax here. Does this mean there should be
button called "Search" on the form?
my $request =
$ua->request($f->press("search"));
$request ->content if
($response->is_success);
# We'll
look like Netscape to the servers. Some servers return different
#
information depending on whether you are a browser or not.
#
$userAgent->agent('Mozilla/4.03');
# Reload
content if the user wants it
$request->push_header("Pragma" =>
"no-cache") if exists $opts{r};
if
($config{proxy_username} ne '')
{
$request->proxy_authorization_basic($config{proxy_username},
$config{proxy_password});
}
# Tell the
server not to send the data if it hasn't been modified. This is
# an
extra check in addition to the last modified check above.
{
my $cached_time =
$NewsClipper::Globals::cache->GetTimeCached($url);
$request->if_modified_since($cached_time);
}
my
$result;
my $numTriesLeft = $config{socketTries};
do
{
$result =
$userAgent->request($request);
$numTriesLeft--;
} until ($numTriesLeft == 0 ||
$result->is_success);
# If the
server reports that the data hasn't changed
if
(!$result->is_success && $result->code == 304)
{
dprint "Server reports data hasn't
changed.";
my ($cachedData,$cacheStatus) =
$NewsClipper::Globals::cache->get($url,@updateTimes);
# Use the cached data if there is any available
if
($cacheStatus eq 'stale')
{
dprint "Using cached
data.";
# No need to bless this since it's
just a SCALAR
return
\$cachedData;
}
else
{
dprint "There is
no cached data available";
return
undef;
}
}
elsif
(!$result->is_success)
{
dprint "Couldn't get
data. Error on HTTP request: \"" . $result->message
.
"\"";
$errors{'acquisition'} .= "Error on HTTP request: \""
.
$result->message . "\".\n"
if defined
$result;
my ($cachedData,$cacheStatus) =
$NewsClipper::Globals::cache->get($url,@updateTimes);
# Use the cached data if there is any available
if
($cacheStatus eq 'stale')
{
dprint "HTTP request failed, but there is
cached data available.";
$errors{'acquisition'} .= "Using cached
data.\n";
# No need to bless this since it's
just a SCALAR
return
\$cachedData;
}
else
{
dprint "HTTP
request failed, and there is no cached data
available";
$errors{'acquisition'} .= "There
is no cached data.\n";
return
undef;
}
}
my $content
= $result->content;
# Strip
linefeeds off the lines if the content looks non-binary
$content =~
s/\r//gs if $content !~ /\000/;
# Cache it
for later use, even if "always" was specified. (We use the cached
#
data to determine when the info was last fetched. Also, the person may
# change the handler to something other than "always".)
$NewsClipper::Globals::cache->set($url,$content);
# No need
to bless this since it's just a SCALAR
return
\$content;
}
----------------------------------------------------------------------
Best viewed with courier 10 ppt. Font
.---. .---------- John
Menke
/ \ __
/ ------ JCN Capital
/
/ \( )/ -----
Eagle Information Systems
////// ' \/ `
--- Research Consultant - Web Developer
//// / // : : --- Home Phone No.
908-876-1538
// / / /`
'-- Work Phone No.
914-627-1115
//
///..\\\ Email:
[EMAIL PROTECTED]
==UU====UU== http://www.eagleinfosystems.com