moseley     02/04/19 23:13:05

  Modified:    src/search spider.pl
  Log:
  Add debugging and quiet mode for the spider.
  
  perldoc spider.pl
  
  Revision  Changes    Path
  1.4       +47 -7     modperl-docs/src/search/spider.pl
  
  Index: spider.pl
  ===================================================================
  RCS file: /home/cvs/modperl-docs/src/search/spider.pl,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- spider.pl 3 Mar 2002 11:27:22 -0000       1.3
  +++ spider.pl 20 Apr 2002 06:13:05 -0000      1.4
  @@ -2,7 +2,7 @@
   use strict;
   
   
  -# $Id: spider.pl,v 1.3 2002/03/03 11:27:22 stas Exp $
  +# $Id: spider.pl,v 1.4 2002/04/20 06:13:05 moseley Exp $
   #
   # "prog" document source for spidering web servers
   #
  @@ -23,7 +23,7 @@
   use HTML::Tagset;
   
   use vars '$VERSION';
  -$VERSION = sprintf '%d.%02d', q$Revision: 1.3 $ =~ /: (\d+)\.(\d+)/;
  +$VERSION = sprintf '%d.%02d', q$Revision: 1.4 $ =~ /: (\d+)\.(\d+)/;
   
   use vars '$bit';
   use constant DEBUG_ERRORS   => $bit = 1;    # program errors
  @@ -34,6 +34,17 @@
   use constant DEBUG_INFO     => $bit <<= 1;  # more verbose
   use constant DEBUG_LINKS    => $bit <<= 1;  # prints links as they are 
extracted
   
  +my %DEBUG_MAP = (
  +    errors  => DEBUG_ERRORS,
  +    url     => DEBUG_URL,
  +    headers => DEBUG_HEADERS,
  +    failed  => DEBUG_FAILED,
  +    skipped => DEBUG_SKIPPED,
  +    info    => DEBUG_INFO,
  +    links   => DEBUG_LINKS,
  +);
  +    
  +
   
   use constant MAX_SIZE       => 5_000_000;   # Max size of document to fetch
   use constant MAX_WAIT_TIME  => 30;          # request time.
  @@ -62,7 +73,7 @@
       }
   
   
  -    print STDERR "$0: Reading parameters from '$config'\n";
  +    print STDERR "$0: Reading parameters from '$config'\n" unless 
$ENV{SPIDER_QUIET};
   
       my $abort;
       local $SIG{HUP} = sub { warn "Caught SIGHUP\n"; $abort++ } unless $^O =~ 
/Win32/i;
  @@ -103,8 +114,18 @@
   
       # set defaults
   
  -    $server->{debug} ||= 0;
  -    die "debug parameter '$server->{debug}' must be a number\n" unless 
$server->{debug} =~ /^\d+$/;
  +    if ( $ENV{SPIDER_DEBUG} ) {
  +        $server->{debug} = 0;
  +
  +        $server->{debug} |= (exists $DEBUG_MAP{lc $_} ? $DEBUG_MAP{lc $_} : 
die "Bad debug setting passed in environment '$_'\nOptions are: " . join( ', ', 
keys %DEBUG_MAP) ."\n")
  +            for split /\s*,\s*/, $ENV{SPIDER_DEBUG};
  +
  +    } else {
  +        $server->{debug} ||= 0;
  +        die "debug parameter '$server->{debug}' must be a number\n" unless 
$server->{debug} =~ /^\d+$/;
  +    }
  +
  +    $server->{quiet} ||= $ENV{SPIDER_QUIET} || 0;
   
   
       $server->{max_size} ||= MAX_SIZE;
  @@ -137,7 +158,7 @@
       my $start = time;
   
       if ( $server->{skip} ) {
  -        print STDERR "Skipping: $server->{base_url}\n";
  +        print STDERR "Skipping: $server->{base_url}\n" unless 
$server->{quiet};
           return;
       }
   
  @@ -235,6 +256,9 @@
       eval { spider( $server, $uri ) };
       print STDERR $@ if $@;
   
  +    return if $server->{quiet};
  +
  +
       $start = time - $start;
       $start++ unless $start;
   
  @@ -246,6 +270,7 @@
           $max_num = length $val if length $val > $max_num;
       }
   
  +
       printf STDERR "\nSummary for: $server->{base_url}\n";
   
       for ( sort keys %{$server->{counts}} ) {
  @@ -468,7 +493,7 @@
           return;
       }
   
  -    $response->request->uri->userinfo( undef );
  +    $response->request->uri->userinfo( undef ) if $response->request;
   
   
       # skip excluded by robots.txt
  @@ -1339,6 +1364,21 @@
   
   And you will see debugging info as it runs, and the fetched documents will 
be saved
   in the C<spider.out> file.
  +
  +Debugging can also be set by an environment variable when running swish.  
This will
  +override any setting in the configuration file.  Set the variable 
SPIDER_DEBUG when running
  +the spider.  You can specify any of the above debugging options, separated 
by a comma.
  +
  +For example, with a Bourne-type shell:
  +
  +    SPIDER_DEBUG=url,links
  +
  +=item quiet
  +
  +If this is true then normal, non-error messages will be suppressed.  Quiet 
mode can also
  +be set by setting the environment variable SPIDER_QUIET to any true value.
  +
  +    SPIDER_QUIET=1
   
   =item max_depth
   
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to