John Menke
Mon, 11 Sep 2000 10:38:14 -0700
-----Original Message-----
From: John Menke [mailto:[EMAIL PROTECTED]]
Sent: Monday, September 11, 2000 1:31 PM
To: Newsclipperdevlist
Subject: Help with Alternative to GetUrlAlternative to GetUrl - PostUrl -Looking in the documentation of HTML::Request and code outlines some sample codemy $ua = LWP::UserAgent->new;
my $url = url 'http://www.altavista.digital.com/';
my $res = $ua->request(GET $url);
my $tb = HTML::TreeBuilder->new;
$tb->parse($res->content);
my @forms = @{$tb->extract_links(qw(FORM))};
my $f = HTTP::Request::Form->new($forms[0][1], $url);
$f->field("q", "perl");
my $response = $ua->request($f->press("search"));
print $response->content if ($response->is_success);It seems like it's possible to add another function in the AcquistionHandlers.pm that would closely resemble the GetUrl function:I am new to perl so any help in this area would be appreciated. I have the source code for GetUrl below renamed to GetForm. Any comments in <bold> are where I see the possible changes need to be made. Can anyone add to or confirm this?sub GetForm($)
{use HTML::TreeBuilder;
use URI::URL;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Request::Common;
use HTTP::Request::Form;
my $url = shift;unless (defined($url) && ($url ne ''))
{
dprint "GetUrl couldn't get data. Proper URL not supplied.";
$errors{'acquisition'} .=
"GetUrl couldn't get data. Proper URL not supplied.\n";
return undef;
}dprint "GetUrl is getting URL:\n $url";# Try to get the cached data if it's available and still valid. First try to
# get the update times for the handler.
my @updateTimes = @NewsClipper::Interpreter::update_times;# If the user specified "always", there's no need to check the time.
if (lc($updateTimes[0]) eq 'always')
{
dprint "\"Always\" specified. Skipping cache check"
}
# See if we can just return the current cached data.
else
{
my ($cachedData,$cacheStatus) =
$NewsClipper::Globals::cache->get($url,@updateTimes);# If the data is still valid for the @updateTimes, we can return it
# immediately (There's no need to bless this since it's just a SCALAR)
return \$cachedData if $cacheStatus eq 'valid';# Now see if the remote content hasn't changed. If it hasn't, we can
# just return the cached data.
if ($cacheStatus eq 'stale')
{
my $last_modified = _GetLastModifiedTime($url);
my $cached_time = $NewsClipper::Globals::cache->GetTimeCached($url);
if (defined $last_modified && defined $cached_time &&
$cached_time > $last_modified)
{
return \$cachedData
}
}
}# Otherwise we'll have to fetch it.
$userAgent->timeout($config{socketTimeout});
$userAgent->proxy(['http', 'ftp'], $config{proxy})
if $config{proxy} ne '';# the folowing line could be changed
# my $request = new HTTP::Request GET => "$url";# to something like the following lines:my $res = $ua->request(GET $url);
my $tb = HTML::TreeBuilder->new;
$tb->parse($res->content);
my @forms = @{$tb->extract_links(qw(FORM))};# the numbers specific to the form could be passed in a parameters.
my $f = HTTP::Request::Form->new($forms[0][1], $url);
$f->field("q", "perl");# not sure about the syntax here. Does this mean there should be button called "Search" on the form?
my $request = $ua->request($f->press("search"));
$request ->content if ($response->is_success);
# We'll look like Netscape to the servers. Some servers return different
# information depending on whether you are a browser or not.
# $userAgent->agent('Mozilla/4.03');# Reload content if the user wants it
$request->push_header("Pragma" => "no-cache") if exists $opts{r};if ($config{proxy_username} ne '')
{
$request->proxy_authorization_basic($config{proxy_username},
$config{proxy_password});
}# Tell the server not to send the data if it hasn't been modified. This is
# an extra check in addition to the last modified check above.
{
my $cached_time = $NewsClipper::Globals::cache->GetTimeCached($url);
$request->if_modified_since($cached_time);
}my $result;
my $numTriesLeft = $config{socketTries};do
{
$result = $userAgent->request($request);
$numTriesLeft--;
} until ($numTriesLeft == 0 || $result->is_success);# If the server reports that the data hasn't changed
if (!$result->is_success && $result->code == 304)
{
dprint "Server reports data hasn't changed.";my ($cachedData,$cacheStatus) =
$NewsClipper::Globals::cache->get($url,@updateTimes);# Use the cached data if there is any available
if ($cacheStatus eq 'stale')
{
dprint "Using cached data.";
# No need to bless this since it's just a SCALAR
return \$cachedData;
}
else
{
dprint "There is no cached data available";return undef;
}
}
elsif (!$result->is_success)
{
dprint "Couldn't get data. Error on HTTP request: \"" . $result->message .
"\"";
$errors{'acquisition'} .= "Error on HTTP request: \"" .
$result->message . "\".\n"
if defined $result;my ($cachedData,$cacheStatus) =
$NewsClipper::Globals::cache->get($url,@updateTimes);# Use the cached data if there is any available
if ($cacheStatus eq 'stale')
{
dprint "HTTP request failed, but there is cached data available.";
$errors{'acquisition'} .= "Using cached data.\n";
# No need to bless this since it's just a SCALAR
return \$cachedData;
}
else
{
dprint "HTTP request failed, and there is no cached data available";
$errors{'acquisition'} .= "There is no cached data.\n";return undef;
}
}my $content = $result->content;# Strip linefeeds off the lines if the content looks non-binary
$content =~ s/\r//gs if $content !~ /\000/;# Cache it for later use, even if "always" was specified. (We use the cached
# data to determine when the info was last fetched. Also, the person may
# change the handler to something other than "always".)
$NewsClipper::Globals::cache->set($url,$content);# No need to bless this since it's just a SCALAR
return \$content;
}----------------------------------------------------------------------
Best viewed with courier 10 ppt. Font
.---. .---------- John Menke
/ \ __ / ------ JCN Capital
/ / \( )/ ----- Eagle Information Systems
////// ' \/ ` --- Research Consultant - Web Developer
//// / // : : --- Home Phone No. 908-876-1538
// / / /` '-- Work Phone No. 914-627-1115
// ///..\\\ Email: [EMAIL PROTECTED]
==UU====UU== http://www.eagleinfosystems.com