newsclipperdevlist  

RE: Help with Alternative to GetUrl

John Menke
Mon, 11 Sep 2000 10:38:14 -0700

I forgot to edit the section on the fields...  Fields could be passed to GetForm function (maybe in an array of Hashes???)  The function defintion would look like:
 
PostURL($url, %fields)
-----Original Message-----
From: John Menke [mailto:[EMAIL PROTECTED]]
Sent: Monday, September 11, 2000 1:31 PM
To: Newsclipperdevlist
Subject: Help with Alternative to GetUrl

Alternative to GetUrl - PostUrl -
 
 
Looking in the documentation of HTML::Request and code outlines some sample code
 
 my $ua = LWP::UserAgent->new;
  my $url = url 'http://www.altavista.digital.com/';
  my $res = $ua->request(GET $url);
  my $tb = HTML::TreeBuilder->new;
  $tb->parse($res->content);
  my @forms = @{$tb->extract_links(qw(FORM))};
  my $f = HTTP::Request::Form->new($forms[0][1], $url);
  $f->field("q", "perl");
  my $response = $ua->request($f->press("search"));
  print $response->content if ($response->is_success);
 
It seems like it's possible to add another function in the AcquistionHandlers.pm that would closely resemble the GetUrl function:
 
 
I am new to perl so any help in this area would be appreciated. I have the source code for GetUrl below renamed to GetForm.  Any comments in <bold> are where I see the possible changes need to be made.   Can anyone add to or confirm this? 
 
 
 
 
 
 
sub GetForm($)
{
 
   use HTML::TreeBuilder;
  use URI::URL;
  use LWP::UserAgent;
  use HTTP::Request;
  use HTTP::Request::Common;
  use HTTP::Request::Form;
 

  my $url = shift;
 
  unless (defined($url) && ($url ne ''))
  {
    dprint "GetUrl couldn't get data. Proper URL not supplied.";
    $errors{'acquisition'} .=
      "GetUrl couldn't get data. Proper URL not supplied.\n";
    return undef;
  }
 
  dprint "GetUrl is getting URL:\n  $url";
 
  # Try to get the cached data if it's available and still valid. First try to
  # get the update times for the handler.
  my @updateTimes = @NewsClipper::Interpreter::update_times;
 
  # If the user specified "always", there's no need to check the time.
  if (lc($updateTimes[0]) eq 'always')
  {
    dprint "\"Always\" specified. Skipping cache check"
  }
  # See if we can just return the current cached data.
  else
  {
    my ($cachedData,$cacheStatus) =
      $NewsClipper::Globals::cache->get($url,@updateTimes);
 
    # If the data is still valid for the @updateTimes, we can return it
    # immediately (There's no need to bless this since it's just a SCALAR)
    return \$cachedData if $cacheStatus eq 'valid';
 
    # Now see if the remote content hasn't changed. If it hasn't, we can
    # just return the cached data.
    if ($cacheStatus eq 'stale')
    {
      my $last_modified = _GetLastModifiedTime($url);
      my $cached_time = $NewsClipper::Globals::cache->GetTimeCached($url);
      if (defined $last_modified && defined $cached_time &&
          $cached_time > $last_modified)
      {
        return \$cachedData
      }
    }
  }
 
  # Otherwise we'll have to fetch it.
  $userAgent->timeout($config{socketTimeout});
  $userAgent->proxy(['http', 'ftp'], $config{proxy})
    if $config{proxy} ne '';
 
   # the folowing line could be changed

  # my $request = new HTTP::Request GET => "$url";
 
  # to something like the following lines:
 
  my $res = $ua->request(GET $url);
  my $tb = HTML::TreeBuilder->new;
  $tb->parse($res->content);
  my @forms = @{$tb->extract_links(qw(FORM))};
 
# the numbers specific to the form could be passed in a parameters.

  my $f = HTTP::Request::Form->new($forms[0][1], $url);
  $f->field("q", "perl");
 
# not sure about the syntax here. Does this mean there should be button called "Search" on the form?

  my $request = $ua->request($f->press("search"));
  $request ->content if ($response->is_success);


 
  
 
 
   
  # We'll look like Netscape to the servers. Some servers return different
  # information depending on whether you are a browser or not.
#  $userAgent->agent('Mozilla/4.03');
 
  # Reload content if the user wants it
  $request->push_header("Pragma" => "no-cache") if exists $opts{r};
 
  if ($config{proxy_username} ne '')
  {
    $request->proxy_authorization_basic($config{proxy_username},
                     $config{proxy_password});
  }
 
  # Tell the server not to send the data if it hasn't been modified. This is
  # an extra check in addition to the last modified check above.
  {
    my $cached_time = $NewsClipper::Globals::cache->GetTimeCached($url);
    $request->if_modified_since($cached_time);
  }
 
  my $result;
  my $numTriesLeft = $config{socketTries};
 
  do
  {
    $result = $userAgent->request($request);
    $numTriesLeft--;
  } until ($numTriesLeft == 0 || $result->is_success);
 
  # If the server reports that the data hasn't changed
  if (!$result->is_success && $result->code == 304)
  {
    dprint "Server reports data hasn't changed.";
 
    my ($cachedData,$cacheStatus) =
      $NewsClipper::Globals::cache->get($url,@updateTimes);
 
    # Use the cached data if there is any available
    if ($cacheStatus eq 'stale')
    {
      dprint "Using cached data.";
      # No need to bless this since it's just a SCALAR
      return \$cachedData;
    }
    else
    {
      dprint "There is no cached data available";
 
      return undef;
    }
  }
  elsif (!$result->is_success)
  {
    dprint "Couldn't get data. Error on HTTP request: \"" . $result->message .
      "\"";
    $errors{'acquisition'} .= "Error on HTTP request: \"" .
                              $result->message . "\".\n"
      if defined $result;
 
    my ($cachedData,$cacheStatus) =
      $NewsClipper::Globals::cache->get($url,@updateTimes);
 
    # Use the cached data if there is any available
    if ($cacheStatus eq 'stale')
    {
      dprint "HTTP request failed, but there is cached data available.";
      $errors{'acquisition'} .= "Using cached data.\n";
      # No need to bless this since it's just a SCALAR
      return \$cachedData;
    }
    else
    {
      dprint "HTTP request failed, and there is no cached data available";
      $errors{'acquisition'} .= "There is no cached data.\n";
 
      return undef;
    }
  }
 
  my $content = $result->content;
 
  # Strip linefeeds off the lines if the content looks non-binary
  $content =~ s/\r//gs if $content !~ /\000/;
 
  # Cache it for later use, even if "always" was specified. (We use the cached
  # data to determine when the info was last fetched. Also, the person may
  # change the handler to something other than "always".)
  $NewsClipper::Globals::cache->set($url,$content);
 
  # No need to bless this since it's just a SCALAR
  return \$content;
}
 
 
 
 

----------------------------------------------------------------------
                Best viewed with courier 10 ppt. Font

      .---.        .----------   John Menke
     /     \  __  /    ------   JCN Capital
    / /     \(  )/    -----    Eagle Information Systems
   //////   ' \/ `   ---      Research Consultant - Web Developer
  //// / // :    : ---      Home Phone No. 908-876-1538
 // /   /  /`    '--       Work Phone No. 914-627-1115
//         ///..\\\       Email: [EMAIL PROTECTED]
         ==UU====UU==    http://www.eagleinfosystems.com