At 08:05 AM 06/07/01 -0700, Bill Moseley wrote:
>Not a lot of details here, Sorry.  But I'm just wondering if this sounds
>familiar to anyone.
>
>I've got a spider program that when run on FreeBSD 2.2.8, perl 5.6.1, and 
>LWP::UserAgent 1.80 (not sure which LWP bundle that is) it uses a huge
>amount of memory after just about 70 documents have been requested:

Ok, here's some more detail and a code sample.

The program spiders a web site recursively.  The recursive routine is 
called process_link( $server, $uri ) which takes a ref to a hash, and a 
URI object.  The hash is just a list of parameters (poor-mans object).

process_link() reads a doc, then extracts the links by calling
extract_links( $server, $content, $response ), which uses HTML::LinkExtor.
The arguments are the same hash ref, a reference to the content, and the
response object.

extract_links() returns a reference to an array of extracted links.
These are URI objects.

Here's the deal.  Running it as above eats memory like crazy on FreeBSD &
perl 5.6.1, but not on Linux & perl 5.6.0.  Both machines have very current
perl libraries and LWP/URI modules installed.

BUT, if instead I change process_link() to take a scalar URL, and 
extract_links() to return a list of scalar URLs, then it doesn't 
eat memory.
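
In case a stripped-down test helps anyone isolate it, this is the kind of
comparison I mean (just a sketch -- I haven't run this exact code on the
FreeBSD box): the same recursion shape, passing URI objects one time and
plain strings the other, printing the process RSS via ps between runs.  The
URL and depth are arbitrary.

#!/usr/local/bin/perl
use strict;
use URI;

# report this process's resident set size (KB) via ps
sub rss_kb {
    my $rss = `ps -o rss= -p $$`;
    chomp $rss;
    return $rss;
}

# recurse passing a URI object down each call
sub recurse_uri {
    my ( $depth, $uri ) = @_;
    return unless $depth;
    recurse_uri( $depth - 1, URI->new( $uri->as_string ) );
}

# same shape, but passing a plain string down each call
sub recurse_str {
    my ( $depth, $url ) = @_;
    return unless $depth;
    recurse_str( $depth - 1, URI->new( $url )->as_string );
}

print 'start:       ', rss_kb(), " KB\n";
recurse_uri( 5000, URI->new('http://localhost/index.html') );
print 'URI objects: ', rss_kb(), " KB\n";
recurse_str( 5000, 'http://localhost/index.html' );
print 'strings:     ', rss_kb(), " KB\n";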

Here's the ps output on the FreeBSD machine:


spider35.pl -- (stopped at: recurse depth = 31, number of URLs spidered = 67)

ps -aux -p 21557
USER     PID   %CPU  %MEM    VSZ    RSS  TT  STAT  STARTED     TIME  COMMAND
root   21557   39.7  55.3  85832  34848  p0  D+    1:21PM   0:34.66  perl spider35.pl > /dev/null

spider45.pl -- (stopped at: recurse depth = 273, number of URLs spidered = 2069)

ps -aux -p 21360
USER     PID   %CPU  %MEM    VSZ    RSS  TT  STAT  STARTED     TIME  COMMAND
root   21360   18.7  14.5   8648   9132  p0  S+    1:10PM   1:49.17  perl spider45.pl > /dev/null

Big difference.  Again, under Linux & perl 5.6.0 this does not happen.

(and I sure wish he wasn't running my test scripts as root ;)





I suspect that the URI objects themselves are a red herring, and that the
real problem is something in perl.  I have found a work-around for the
problem, but I'd like to find out exactly what the problem is, and if it
looks like a 5.6.1 bug, get it reported.
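
One way I can think of to narrow it down (an untested sketch -- Devel::Leak
is a separate CPAN module, not something the spider uses) is to count perl's
SVs across a single process_link() call and see whether the count keeps
climbing when URI objects are passed around:

use Devel::Leak;

my $handle;
my $before = Devel::Leak::NoteSV($handle);     # remember the current SVs

process_link( $server, $uri );                 # or whatever chunk is suspect

my $after  = Devel::Leak::CheckSV($handle);    # dumps any new SVs to stderr
print STDERR "SVs before: $before  after: $after\n";

If the count settles down with plain strings but keeps growing with URI
objects, that would at least point at perl/URI rather than LWP.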

Can anyone else reproduce the memory usage problem with this script?

Here's the script (the one that eats memory) and a diff 
to one that doesn't eat memory.  

[This is also an opportunity for help with "code refinement" ;).  A recursive
spider has its limitations, so if anyone has a suggestion... (a rough
queue-based sketch follows the script below as one possible starting point).]



I've turned off line wrap ---------->


#!/usr/local/bin/perl -w
use strict;
use LWP::RobotUA;
use HTML::LinkExtor;
use HTML::Tagset;   # for %HTML::Tagset::linkElements, used in extract_links()
use URI;

my $depth = 0; # recursion depth
my %visited;   # track which URLs were seen
    
my $STARTING_URL = shift @ARGV or die "usage: $0 url\n";   # starting URL from the command line

process_server( $STARTING_URL );

sub process_server {
    my $server = {
        base_url        => shift,
        email           => '[EMAIL PROTECTED]',
        agent           => 'agent test',
        delay_min       => .0001,
        link_tags       => [qw/ a frame /],
    };

    # set starting URL, and remove any specified fragment
    my $uri = URI->new( $server->{base_url} );
    $uri->fragment(undef);

    # set the starting server name (including port) -- will only spider on server:port
    
    $server->{authority} = $uri->authority;
    $server->{same} = [ $uri->authority ];


    # get a user agent object
    
    my  $ua = LWP::RobotUA->new( $server->{agent}, $server->{email} );
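    # note: RobotUA's delay() is in minutes -- hence the tiny delay_min value above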
    $ua->delay( $server->{delay_min} || 0.1 );
    $ua->parse_head(0);   # Don't parse the content
    $server->{ua} = $ua;  # save it


    eval { process_link( $server, $uri ) };

    print STDERR $@ if $@;
}    
        

my $parent;
#----------- Process a url and recurse -----------------------
sub process_link {
    my ( $server, $uri ) = @_;

    if ( $visited{ $uri->canonical }++ ) {
        return;
    }
    $server->{counts}{'Unique URLs'}++;


    # make request
    my $ua = $server->{ua};
    my $request = HTTP::Request->new('GET', $uri );


    my $content = '';

    my $been_here;
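    # Content callback: append each 4K chunk to $content, but die unless the
    # content type is text/html or text/plain -- dying makes LWP abort the
    # transfer and flag the response (see the 'client-aborted' check below).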
    my $callback = sub {
        die "wrong type"
            unless $been_here++ || grep { $_[1]->content_type() eq $_ } qw{ text/html text/plain };

        $content .= $_[0];

    };

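    # simple_request() makes a single request and does not follow redirects
    # itself; redirects are handled explicitly further down.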
    my $response = $ua->simple_request( $request, $callback, 4096 );


    # Log the response
    
    print STDERR '>> ',
      join( ' ',
            ( $response->is_success ? '+Fetched' : '-Failed' ),
            $depth,
            "Cnt: $server->{counts}{'Unique URLs'}",
            $response->request->uri->canonical,
            $response->code,
            $response->content_type,
            $response->content_length,
       ),"\n";

    if ( !$response->is_success && $parent ) {
        print STDERR "   Found on page: ", $parent,"\n";
    }

    # If the LWP callback aborts

    if ( $response->header('client-aborted') ) {
        return;
    }
    

    # skip excluded by robots.txt
    
    if ( !$response->is_success && $response->status_line =~ 'robots.txt' ) {
        return;
    }

    unless ( $response->is_success ) {

        # look for redirect
        if ( $response->is_redirect && $response->header('location') ) {
            my $u = URI->new_abs( $response->header('location'), $response->base );
            process_link( $server, $u );
        }
        return;
    }

    return unless $content;  # $$$ any reason to index empty files?

    my $links = extract_links( $server, \$content, $response );

    # Now spider
    my $last_page = $parent || '';
    $parent = $uri;
    $depth++;

    if ( ! defined $server->{max_depth} || $server->{max_depth} >= $depth ) {
        process_link( $server, $_ ) for @$links;
    }

    $depth--;
    $parent = $last_page;

}
sub extract_links {
    my ( $server, $content, $response ) = @_;

    return [] unless $response->header('content-type') &&
                     $response->header('content-type') =~ m[^text/html];


    $server->{Spidered}++;

    my @links;


    my $base = $response->base;

    my $p = HTML::LinkExtor->new;
    $p->parse( $$content );
    $p->eof;

    my %skipped_tags;
    for ( $p->links ) {
        my ( $tag, %attr ) = @$_;

        # which tags to use ( not reported in debug )

        unless ( grep { $tag eq $_ } @{$server->{link_tags}} ) {
            next;
        }
        
        # which attributes of this tag can hold links (per HTML::Tagset)
        my $links = $HTML::Tagset::linkElements{$tag};
        $links = [$links] unless ref $links;

        my $found;
        my %seen;
        for ( @$links ) {
            if ( $attr{ $_ } ) {  # ok tag

                my $u = URI->new_abs( $attr{$_},$base );
                $u->fragment( undef );

                next if $seen{$u}++;  # skip duplicates within this tag (they aren't reported)

                unless ( $u->scheme =~ /^http$/ ) {  # no https at this time.
                    next;
                }

                unless ( $u->host ) {
                    next;
                }

                unless ( grep { $u->authority eq $_ } @{$server->{same}} ) {
                    $server->{counts}{'Off-site links'}++;
                    next;
                }
                
                $u->authority( $server->{authority} );  # Force all the same host name

                push @links, $u;
                $found++;

            }
        }
    }

    return \@links;
}
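

On the "code refinement" question above: the recursion could be replaced
with a plain work queue, which avoids deep call stacks and the $depth/$parent
globals.  Here's a rough, untested sketch of the shape, reusing the same
$server hash, %visited, and extract_links().  fetch_page() is a made-up
stand-in for the fetch/log/redirect code that process_link() runs before it
recurses.

# rough sketch only -- an iterative, queue-based replacement for the
# recursive process_link().  Each queue entry carries the URI, its depth,
# and the page it was found on, so no $depth/$parent globals are needed.
sub process_queue {
    my ( $server, $start_uri ) = @_;

    my @queue = ( [ $start_uri, 0, undef ] );    # [ $uri, $depth, $parent ]

    while ( my $entry = shift @queue ) {
        my ( $uri, $depth, $parent ) = @$entry;

        next if $visited{ $uri->canonical }++;
        next if defined $server->{max_depth} && $depth > $server->{max_depth};

        # fetch_page() is hypothetical: the fetch/log/redirect part of
        # process_link(), returning ( $response, \$content ) on success or
        # an empty list on failure.
        my ( $response, $content ) = fetch_page( $server, $uri, $parent )
            or next;

        my $links = extract_links( $server, $content, $response );

        push @queue, map { [ $_, $depth + 1, $uri ] } @$links;
    }
}

shift gives breadth-first order; using pop instead makes it depth-first,
which is closer to what the recursion does now.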


---------- And the diff to the version that doesn't eat memory -------------

> diff -u spider35.pl spider45.pl
--- spider35.pl Fri Jun  8 10:04:26 2001
+++ spider45.pl Fri Jun  8 10:04:06 2001
@@ -44,7 +44,9 @@
 my $parent;
 #----------- Process a url and recurse -----------------------
 sub process_link {
-    my ( $server, $uri ) = @_;
+    my ( $server, $url ) = @_;
+
+    my $uri = URI->new( $url );
 
     if ( $visited{ $uri->canonical }++ ) {
         return;
@@ -184,7 +186,9 @@
                 
                 $u->authority( $server->{authority} );  # Force all the same host name
 
-                push @links, $u;
+                my $z = $u->as_string;
+
+                push @links, $z;
                 $found++;
 
             }

Bill Moseley
mailto:[EMAIL PROTECTED]
