moseley 02/04/19 23:13:05 Modified: src/search spider.pl Log: Add debugging and quiet mode for the spider. perldoc spider.pl Revision Changes Path 1.4 +47 -7 modperl-docs/src/search/spider.pl Index: spider.pl =================================================================== RCS file: /home/cvs/modperl-docs/src/search/spider.pl,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- spider.pl 3 Mar 2002 11:27:22 -0000 1.3 +++ spider.pl 20 Apr 2002 06:13:05 -0000 1.4 @@ -2,7 +2,7 @@ use strict; -# $Id: spider.pl,v 1.3 2002/03/03 11:27:22 stas Exp $ +# $Id: spider.pl,v 1.4 2002/04/20 06:13:05 moseley Exp $ # # "prog" document source for spidering web servers # @@ -23,7 +23,7 @@ use HTML::Tagset; use vars '$VERSION'; -$VERSION = sprintf '%d.%02d', q$Revision: 1.3 $ =~ /: (\d+)\.(\d+)/; +$VERSION = sprintf '%d.%02d', q$Revision: 1.4 $ =~ /: (\d+)\.(\d+)/; use vars '$bit'; use constant DEBUG_ERRORS => $bit = 1; # program errors @@ -34,6 +34,17 @@ use constant DEBUG_INFO => $bit <<= 1; # more verbose use constant DEBUG_LINKS => $bit <<= 1; # prints links as they are extracted +my %DEBUG_MAP = ( + errors => DEBUG_ERRORS, + url => DEBUG_URL, + headers => DEBUG_HEADERS, + failed => DEBUG_FAILED, + skipped => DEBUG_SKIPPED, + info => DEBUG_INFO, + links => DEBUG_LINKS, +); + + use constant MAX_SIZE => 5_000_000; # Max size of document to fetch use constant MAX_WAIT_TIME => 30; # request time. @@ -62,7 +73,7 @@ } - print STDERR "$0: Reading parameters from '$config'\n"; + print STDERR "$0: Reading parameters from '$config'\n" unless $ENV{SPIDER_QUIET}; my $abort; local $SIG{HUP} = sub { warn "Caught SIGHUP\n"; $abort++ } unless $^O =~ /Win32/i; @@ -103,8 +114,18 @@ # set defaults - $server->{debug} ||= 0; - die "debug parameter '$server->{debug}' must be a number\n" unless $server->{debug} =~ /^\d+$/; + if ( $ENV{SPIDER_DEBUG} ) { + $server->{debug} = 0; + + $server->{debug} |= (exists $DEBUG_MAP{lc $_} ? 
$DEBUG_MAP{lc $_} : die "Bad debug setting passed in environment '$_'\nOptions are: " . join( ', ', keys %DEBUG_MAP) ."\n") + for split /\s*,\s*/, $ENV{SPIDER_DEBUG}; + + } else { + $server->{debug} ||= 0; + die "debug parameter '$server->{debug}' must be a number\n" unless $server->{debug} =~ /^\d+$/; + } + + $server->{quiet} ||= $ENV{SPIDER_QUIET} || 0; $server->{max_size} ||= MAX_SIZE; @@ -137,7 +158,7 @@ my $start = time; if ( $server->{skip} ) { - print STDERR "Skipping: $server->{base_url}\n"; + print STDERR "Skipping: $server->{base_url}\n" unless $server->{quiet}; return; } @@ -235,6 +256,9 @@ eval { spider( $server, $uri ) }; print STDERR $@ if $@; + return if $server->{quiet}; + + $start = time - $start; $start++ unless $start; @@ -246,6 +270,7 @@ $max_num = length $val if length $val > $max_num; } + printf STDERR "\nSummary for: $server->{base_url}\n"; for ( sort keys %{$server->{counts}} ) { @@ -468,7 +493,7 @@ return; } - $response->request->uri->userinfo( undef ); + $response->request->uri->userinfo( undef ) if $response->request; # skip excluded by robots.txt @@ -1339,6 +1364,21 @@ And you will see debugging info as it runs, and the fetched documents will be saved in the C<spider.out> file. + +Debugging can also be set by an environment variable when running swish. This will +override any setting in the configuration file. Set the variable SPIDER_DEBUG when running +the spider. You can specify any of the above debugging options, separated by a comma. + +For example with Bourne type shell: + + SPIDER_DEBUG=url,links + +=item quiet + +If this is true then normal, non-error messages will be suppressed. Quiet mode can also +be set by setting the environment variable SPIDER_QUIET to any true value. + + SPIDER_QUIET=1 =item max_depth
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]