stas 02/03/03 03:27:22 Modified: src/search .swishcgi.conf README SwishSpiderConfig.pl search.tt spider.pl swish.cgi swish.conf Log: - updating search utility and configs Submitted by: Bill Moseley <[EMAIL PROTECTED]> Reviewed by: stas Revision Changes Path 1.2 +16 -0 modperl-docs/src/search/.swishcgi.conf Index: .swishcgi.conf =================================================================== RCS file: /home/cvs/modperl-docs/src/search/.swishcgi.conf,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- .swishcgi.conf 30 Jan 2002 06:35:00 -0000 1.1 +++ .swishcgi.conf 3 Mar 2002 11:27:21 -0000 1.2 @@ -6,5 +6,21 @@ options => { INCLUDE_PATH => '.', }, + }, + select_by_meta => { + #method => 'radio_group', # pick: radio_group, popup_menu, or checkbox_group + method => 'checkbox_group', + #method => 'popup_menu', + columns => 6, + metaname => 'section', # Can't be a metaname used elsewhere! + values => [qw/about contribute docs download maillist products stats stories support/], + labels => { + about => 'About mod_perl', + doc => 'Documentation', + stories => 'Sucess Stories', + support => 'Support', }, + description => 'Limit search to these areas: ', + }, + }; 1.2 +39 -6 modperl-docs/src/search/README Index: README =================================================================== RCS file: /home/cvs/modperl-docs/src/search/README,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- README 4 Feb 2002 09:22:27 -0000 1.1 +++ README 3 Mar 2002 11:27:22 -0000 1.2 @@ -12,20 +12,28 @@ Indexing: --------- -1. normally build the site: +1. Set an environment variable to the path of the site: - % bin/build -f (-d to build pdfs) + export MODPERL_SITE='http://perl.org' -which among other things creates the dir: dst_html/search +or + + export MODPERL_SITE='http://localhost:4000/dst_html' + +This is used as the base for spidering, plus is used to determine +the sections of the site (for limiting the site to those sections. + -2. 
check that swish.conf points to the right base URL, e.g.: +2. normally build the site: - SwishProgParameters default http://localhost/modperl-site/ + % bin/build -f (-d to build pdfs) + +which among other things creates the dir: dst_html/search 3. Index the site % cd dst_html/search - % swish-e -S prog -c swish.conf + % ./swish-e -S prog -c swish.conf You should see something like: @@ -81,3 +89,28 @@ +How does indexing work? +----------------------- + +Swish is run with a config file, and is run in a mode that says +to use an external program to fetch documents. That external program +is called spider.pl (part of the swish-e distribution). + +spider.pl uses a config file (by default) of SwishSpiderConfig.pl. This file +builds an array of hashes (in this case a single hash in the array). This hash +is the config. + +Part of the config are call-back functions that spider.pl will call while spidering. +One says to skip image files. Another one is a bit more tricky. It splits a document into +sections, creates new "sub-pages" that are complete HTML pages, and calls the function in spider.pl +that sends those off to swish for indexing. (That function then returns false to tell swish not to +index that document since the sections have already been indexed.) + +That's about it. + +One trick. For debugging you can run the spider without indexing. + + ./spider.pl > bigfile.out + +Another trick, you can send SIGHUP to spider.pl while indexing and +it will stop spidering, but let swish index what's been read so far. 
1.3 +20 -8 modperl-docs/src/search/SwishSpiderConfig.pl Index: SwishSpiderConfig.pl =================================================================== RCS file: /home/cvs/modperl-docs/src/search/SwishSpiderConfig.pl,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- SwishSpiderConfig.pl 7 Feb 2002 07:26:15 -0000 1.2 +++ SwishSpiderConfig.pl 3 Mar 2002 11:27:22 -0000 1.3 @@ -2,13 +2,19 @@ # # a few custom callbacks are located after the @servers definition section. + + +my $base_path = $ENV{MODPERL_SITE} || die "must set \$ENV{MODPERL_SITE}"; + +die "Don't use trailing slash in MODPERL_SITE" if $base_path =~ m!/$!; + + @servers = ( { - base_url => 'http://mardy:40994/dst_html/index.html', + base_url => "$base_path/index.html", # Debugging -- see perldoc spider.pl - #base_url => 'http://mardy.hank.org:40994/dst_html/docs/guide/index.html', #max_depth => 1, #debug => DEBUG_HEADERS, #debug => DEBUG_URL|DEBUG_SKIPPED|DEBUG_INFO, @@ -21,12 +27,9 @@ delay_min => .0001, + # Ignore images files - test_url => sub { - return if $_[0]->path =~ /\.(?:gif|jpeg|.png|.gz)$/i; - return unless $_[0]->path =~ m!^/preview/modperl-site!; - return 1; - }, + test_url => sub { return $_[0]->path !~ /\.(?:gif|jpeg|.png|.gz)$/i }, # Only index text/html test_response => sub { return $_[2]->content_type =~ m[text/html] }, @@ -35,7 +38,7 @@ filter_content => \&split_page, # optionally validate external links - validate_links => 1, + validate_links => $ENV{VALIDATE_LINKS} || 0, }, ); @@ -92,11 +95,20 @@ $head->push_content( $title ); } + # Extract out part of the path to use for limiting searches to parts of the document tree. + + if ( $uri =~ m!$base_path/([^/]+)/.+$! 
) { + my $meta = HTML::Element->new('meta', name=> 'section', content => $1); + $head->push_content( $meta ); + } + + my $body = HTML::Element->new('body'); my $doc = HTML::Element->new('html'); $body->push_content( $section ); $doc->push_content( $head, $body ); + my $new_content = $doc->as_HTML(undef,"\t"); output_content( $params->{server}, \$new_content, 1.4 +1 -0 modperl-docs/src/search/search.tt Index: search.tt =================================================================== RCS file: /home/cvs/modperl-docs/src/search/search.tt,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- search.tt 4 Feb 2002 07:16:43 -0000 1.3 +++ search.tt 3 Mar 2002 11:27:22 -0000 1.4 @@ -15,6 +15,7 @@ [% PROCESS search_form %] [% PROCESS nav_bar %] [% PROCESS results_list %] + [% IF search.navigation('hits') > search.config('page_size'); PROCESS nav_bar; END %] [% END %] [% END %] 1.3 +222 -30 modperl-docs/src/search/spider.pl Index: spider.pl =================================================================== RCS file: /home/cvs/modperl-docs/src/search/spider.pl,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- spider.pl 31 Jan 2002 01:51:50 -0000 1.2 +++ spider.pl 3 Mar 2002 11:27:22 -0000 1.3 @@ -2,7 +2,7 @@ use strict; -# $Id: spider.pl,v 1.2 2002/01/31 01:51:50 stas Exp $ +# $Id: spider.pl,v 1.3 2002/03/03 11:27:22 stas Exp $ # # "prog" document source for spidering web servers # @@ -23,7 +23,7 @@ use HTML::Tagset; use vars '$VERSION'; -$VERSION = sprintf '%d.%02d', q$Revision: 1.2 $ =~ /: (\d+)\.(\d+)/; +$VERSION = sprintf '%d.%02d', q$Revision: 1.3 $ =~ /: (\d+)\.(\d+)/; use vars '$bit'; use constant DEBUG_ERRORS => $bit = 1; # program errors @@ -36,10 +36,13 @@ use constant MAX_SIZE => 5_000_000; # Max size of document to fetch +use constant MAX_WAIT_TIME => 30; # request time. #Can't locate object method "host" via package "URI::mailto" at ../prog-bin/spider.pl line 473. 
#sub URI::mailto::host { return '' }; + +# This is not the right way to do this. sub UNIVERSAL::host { '' }; sub UNIVERSAL::port { '' }; sub UNIVERSAL::host_port { '' }; @@ -62,7 +65,7 @@ print STDERR "$0: Reading parameters from '$config'\n"; my $abort; - local $SIG{HUP} = sub { $abort++ }; + local $SIG{HUP} = sub { warn "Caught SIGHUP\n"; $abort++ } unless $^O =~ /Win32/i; my %visited; # global -- I suppose would be smarter to localize it per server. @@ -74,8 +77,9 @@ die "You must specify 'base_url' in your spider config settings\n"; } - for (ref $s->{base_url} eq 'ARRAY' ? @{$s->{base_url}} : $s->{base_url} ) { - $s->{base_url} = $_; + my @urls = ref $s->{base_url} eq 'ARRAY' ? @{$s->{base_url}} :( $s->{base_url}); + for my $url ( @urls ) { + $s->{base_url} = $url; process_server( $s ); } } @@ -100,12 +104,18 @@ # set defaults $server->{debug} ||= 0; - $server->{debug} = 0 unless $server->{debug} =~ /^\d+$/; + die "debug parameter '$server->{debug}' must be a number\n" unless $server->{debug} =~ /^\d+$/; $server->{max_size} ||= MAX_SIZE; die "max_size parameter '$server->{max_size}' must be a number\n" unless $server->{max_size} =~ /^\d+$/; + + $server->{max_wait_time} ||= MAX_WAIT_TIME; + die "max_wait_time parameter '$server->{max_wait_time}' must be a number\n" if $server->{max_wait_time} !~ /^\d+/; + + + $server->{link_tags} = ['a'] unless ref $server->{link_tags} eq 'ARRAY'; $server->{link_tags_lookup} = { map { lc, 1 } @{$server->{link_tags}} }; @@ -139,14 +149,32 @@ my $uri = URI->new( $server->{base_url} ); $uri->fragment(undef); + if ( $uri->userinfo ) { + die "Can't specify parameter 'credentials' because base_url defines them\n" + if $server->{credentials}; + $server->{credentials} = $uri->userinfo; + $uri->userinfo( undef ); + } + + print STDERR "\n -- Starting to spider: $uri --\n" if $server->{debug}; # set the starting server name (including port) -- will only spider on server:port - $server->{authority} = $uri->authority; - $server->{same} = [ 
$uri->authority ]; + + # All URLs will end up with this host:port + $server->{authority} = $uri->canonical->authority; + + # All URLs must match this scheme ( Jan 22, 2002 - spot by Darryl Friesen ) + $server->{scheme} = $uri->scheme; + + + + # Now, set the OK host:port names + $server->{same} = [ $uri->canonical->authority ]; + push @{$server->{same}}, @{$server->{same_hosts}} if ref $server->{same_hosts}; $server->{same_host_lookup} = { map { $_, 1 } @{$server->{same}} }; @@ -169,8 +197,9 @@ my $ua; + if ( $server->{ignore_robots_file} ) { - $ua = LWP::UserAgent->new( ); + $ua = LWP::UserAgent->new; return unless $ua; $ua->agent( $server->{agent} ); $ua->from( $server->{email} ); @@ -181,6 +210,9 @@ $ua->delay( $server->{delay_min} || 0.1 ); } + # Set the timeout on the server and using Windows. + $ua->timeout( $server->{max_wait_time} ) if $^O =~ /Win32/i; + $server->{ua} = $ua; # save it for fun. # $ua->parse_head(0); # Don't parse the content @@ -224,6 +256,56 @@ } } + +#----------------------------------------------------------------------- +# Deal with Basic Authen + + + +# Thanks Gisle! 
+sub get_basic_credentials { + my($uri, $server, $realm ) = @_; + my $netloc = $uri->canonical->host_port; + + my ($user, $password); + + eval { + local $SIG{ALRM} = sub { die "timed out\n" }; + alarm( $server->{credential_timeout} || 30 ) unless $^O =~ /Win32/i; + + if ( $uri->userinfo ) { + print STDERR "\nSorry: invalid username/password\n"; + $uri->userinfo( undef ); + } + + + print STDERR "Need Authentication for $uri at realm '$realm'\n(<Enter> skips)\nUsername: "; + $user = <STDIN>; + chomp($user); + die "No Username specified\n" unless length $user; + + alarm( $server->{credential_timeout} || 30 ) unless $^O =~ /Win32/i; + + print STDERR "Password: "; + system("stty -echo"); + $password = <STDIN>; + system("stty echo"); + print STDERR "\n"; # because we disabled echo + chomp($password); + + alarm( 0 ) unless $^O =~ /Win32/i; + }; + + return if $@; + + return join ':', $user, $password; + + +} + + + + #----------- Non recursive spidering --------------------------- sub spider { @@ -275,9 +357,30 @@ $server->{no_index} = 0; $server->{no_spider} = 0; + + # Set basic auth if defined - use URI specific first, then credentials + if ( my ( $user, $pass ) = split /:/, ( $uri->userinfo || $server->{credentials} || '' ) ) { + $request->authorization_basic( $user, $pass ); + } + + + + my $been_here; my $callback = sub { + # Reset alarm; + alarm( $server->{max_wait_time} ) unless $^O =~ /Win32/i; + + + # Cache user/pass + if ( $server->{cur_realm} && $uri->userinfo ) { + my $key = $uri->canonical->host_port . ':' . 
$server->{cur_realm}; + $server->{auth_cache}{$key} = $uri->userinfo; + } + + $uri->userinfo( undef ) unless $been_here; + die "test_response" if !$been_here++ && !check_user_function( 'test_response', $uri, $server, $_[1], \$_[0] ); @@ -290,12 +393,55 @@ }; - my $response = $ua->simple_request( $request, $callback, 4096 ); + my $response; + + eval { + local $SIG{ALRM} = sub { die "timed out\n" }; + alarm( $server->{max_wait_time} ) unless $^O =~ /Win32/i; + $response = $ua->simple_request( $request, $callback, 4096 ); + alarm( 0 ) unless $^O =~ /Win32/i; + }; return if $server->{abort}; + if ( $response && $response->code == 401 && $response->header('WWW-Authenticate') && $response->header('WWW-Authenticate') =~ /realm="([^"]+)"/i ) { + my $realm = $1; + + my $user_pass; + + # Do we have a cached user/pass for this realm? + my $key = $uri->canonical->host_port . ':' . $realm; + + if ( $user_pass = $server->{auth_cache}{$key} ) { + + # If we didn't just try it, try again + unless( $uri->userinfo && $user_pass eq $uri->userinfo ) { + $uri->userinfo( $user_pass ); + return process_link( $server, $uri, $parent, $depth ); + } + } + + # otherwise, prompt: + + + if ( $user_pass = get_basic_credentials( $uri, $server, $realm ) ) { + $uri->userinfo( $user_pass ); + + $server->{cur_realm} = $realm; # save so we can cache + my $links = process_link( $server, $uri, $parent, $depth ); + delete $server->{cur_realm}; + + return $links; + } + print STDERR "Skipping $uri\n"; + } + + $uri->userinfo( undef ); + + + # Log the response if ( ( $server->{debug} & DEBUG_URL ) || ( $server->{debug} & DEBUG_FAILED && !$response->is_success) ) { @@ -322,6 +468,7 @@ return; } + $response->request->uri->userinfo( undef ); # skip excluded by robots.txt @@ -501,13 +648,15 @@ # which tags to use ( not reported in debug ) - print STDERR " ?? 
Looking at extracted tag '$tag'\n" if $server->{debug} & DEBUG_LINKS; + my $attr = join ' ', map { qq[$_="$attr{$_}"] } keys %attr; + + print STDERR "\nLooking at extracted tag '<$tag $attr>'\n" if $server->{debug} & DEBUG_LINKS; unless ( $server->{link_tags_lookup}{$tag} ) { # each tag is reported only once per page print STDERR - " ?? <$tag> skipped because not one of (", + " <$tag> skipped because not one of (", join( ',', @{$server->{link_tags}} ), ")\n" if $server->{debug} & DEBUG_LINKS && !$skipped_tags{$tag}++; @@ -539,18 +688,14 @@ next unless check_link( $u, $server, $base, $tag, $attribute ); push @links, $u; - print STDERR qq[ ++ <$tag $attribute="$u"> Added to list of links to follow\n] if $server->{debug} & DEBUG_LINKS; + print STDERR qq[ $attribute="$u" Added to list of links to follow\n] if $server->{debug} & DEBUG_LINKS; $found++; } } if ( !$found && $server->{debug} & DEBUG_LINKS ) { - my $s = "<$tag"; - $s .= ' ' . qq[$_="$attr{$_}"] for sort keys %attr; - $s .= '>'; - - print STDERR " ?? tag $s did not include any links to follow\n"; + print STDERR " tag did not include any links to follow or is a duplicate\n"; } } @@ -599,15 +744,15 @@ # Here we make sure we are looking at a link pointing to the correct (or equivalent) host - unless ( $server->{same_host_lookup}{$u->authority} ) { + unless ( $server->{scheme} eq $u->scheme && $server->{same_host_lookup}{$u->canonical->authority} ) { - print STDERR qq[ ?? <$tag $attribute="$u"> skipped because different authority (server:port)\n] if $server->{debug} & DEBUG_LINKS; + print STDERR qq[ ?? 
<$tag $attribute="$u"> skipped because different host\n] if $server->{debug} & DEBUG_LINKS; $server->{counts}{'Off-site links'}++; validate_link( $server, $u, $base ) if $server->{validate_links}; return; } - $u->authority( $server->{authority} ); # Force all the same host name + $u->host_port( $server->{authority} ); # Force all the same host name # Allow rejection of this URL by user function @@ -661,10 +806,10 @@ my $request = HTTP::Request->new('HEAD', $uri->canonical ); eval { - $SIG{ALRM} = sub { die "timed out\n" }; - alarm 5; + local $SIG{ALRM} = sub { die "timed out\n" }; + alarm( $server->{max_wait_time} ) unless $^O =~ /Win32/i; $response = $ua->simple_request( $request ); - alarm 0; + alarm( 0 ) unless $^O =~ /Win32/i; }; if ( $@ ) { @@ -729,7 +874,6 @@ } sub default_urls { - die "$0: Must list URLs when using 'default'\n" unless @ARGV; my $validate = 0; if ( $ARGV[0] eq 'validate' ) { @@ -737,6 +881,9 @@ $validate = 1; } + die "$0: Must list URLs when using 'default'\n" unless @ARGV; + + my @content_types = qw{ text/html text/plain }; return map { @@ -786,9 +933,18 @@ }, ); - begin indexing: +Begin indexing: + swish-e -S prog -c swish.config +Note: When running on some versions of Windows (e.g. Win ME and Win 98 SE) +you may need to index using the command: + + perl spider.pl | swish-e -S prog -c swish.conf -i stdin + +This pipes the output of the spider directly into swish. + + =head1 DESCRIPTION This is a swish-e "prog" document source program for spidering @@ -1013,6 +1169,19 @@ base_url => [qw! http://swish-e.org/ http://othersite.org/other/index.html !], +You may specify a username and password: + + base_url => 'http://user:[EMAIL PROTECTED]/index.html', + +but you may find that to be a security issue. If a URL is protected by Basic Authentication +you will be prompted for a username and password. This might be a slighly safer way to go. + +The parameter C<max_wait_time> controls how long to wait for user entry before skipping the +current URL. 
+ +See also C<credentials> below. + + =item same_hosts This optional key sets equivalent B<authority> name(s) for the site you are spidering. @@ -1034,9 +1203,9 @@ http://www.mysite.edu/path/to/file.html -Note: This should probably be called B<same_authority> because it compares the URI C<authority> +Note: This should probably be called B<same_host_port> because it compares the URI C<host:port> against the list of host names in C<same_hosts>. So, if you specify a port name in you will -probably want to specify the port name in the the list of hosts in C<same_hosts>: +want to specify the port name in the list of hosts in C<same_hosts>: my %serverA = ( base_url => 'http://sunsite.berkeley.edu:4444/', @@ -1076,6 +1245,16 @@ but in general you will probably want it much smaller. But, check with the webmaster before using too small a number. +=item max_wait_time + +This setting is the number of seconds to wait for data to be returned from +the request. Data is returned in chunks to the spider, and the timer is reset each time +a new chunk is reported. Therefore, documents (requests) that take longer than this setting +should not be aborted as long as some data is received every max_wait_time seconds. +The default is 30 seconds. + +NOTE: This option has no effect on Windows. + =item max_time This optional key will set the max minutes to spider. Spidering @@ -1204,6 +1383,19 @@ Just a hack. If you set this true the spider will do HEAD requests all links (e.g. off-site links), just to make sure that all your links work. +=item credentials + +You may specify a username and password to be used automatically when spidering: + + credentials => 'username:password', + +A username and password supplied in a URL will override this setting. + +=item credential_timeout + +Sets the number of seconds to wait for user input when prompted for a username or password. +The default is 30 seconds. 
+ =back =head1 CALLBACK FUNCTIONS @@ -1445,7 +1637,7 @@ files to index only the document titles. As shown above, you can turn this feature on for specific documents by setting a flag in -the server hash passed into the C<test_response> or C<filter_contents> subroutines. +the server hash passed into the C<test_response> or C<filter_content> subroutines. For example, in your configuration file you might have the C<test_response> callback set as: @@ -1466,7 +1658,7 @@ HTML I<and> a title is found in the html document. Note: In most cases you probably would not want to send a large binary file to swish, just -to be ignored. Therefore, it would be smart to use a C<filter_contents> callback routine to +to be ignored. Therefore, it would be smart to use a C<filter_content> callback routine to replace the contents with single character (you cannot use the empty string at this time). A similar flag may be set to prevent indexing a document at all, but still allow spidering. 1.3 +978 -314 modperl-docs/src/search/swish.cgi Index: swish.cgi =================================================================== RCS file: /home/cvs/modperl-docs/src/search/swish.cgi,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- swish.cgi 4 Feb 2002 09:19:39 -0000 1.2 +++ swish.cgi 3 Mar 2002 11:27:22 -0000 1.3 @@ -2,17 +2,20 @@ package SwishSearch; use strict; -use lib qw( modules ); ### This must be adjusted! +use lib qw( modules ); ### This may need to be adjusted! + ### It should point to the location of the + ### associated script modules directory -#################################################################################### + +################################################################################### # # If this text is displayed on your browser then your web server # is not configured to run .cgi programs. Contact your web server administrator. 
# # To display documentation for this program type "perldoc swish.cgi" # -# swish.cgi $Revision: 1.2 $ Copyright (C) 2001 Bill Moseley [EMAIL PROTECTED] +# swish.cgi $Revision: 1.3 $ Copyright (C) 2001 Bill Moseley [EMAIL PROTECTED] # Example CGI program for searching with SWISH-E # # This example program will only run under an OS that supports fork(). @@ -31,14 +34,13 @@ # # The above lines must remain at the top of this program # -# $Id: swish.cgi,v 1.2 2002/02/04 09:19:39 stas Exp $ +# $Id: swish.cgi,v 1.3 2002/03/03 11:27:22 stas Exp $ # #################################################################################### # This is written this way so the script can be used as a CGI script or a mod_perl # module without any code changes. - # use CGI (); # might not be needed if using Apache::Request #================================================================================= @@ -59,50 +61,11 @@ } -#================================================================================= -# mod_perl entry point -# -# As an example, you might use a PerlSetVar to point to paths to different -# config files, and then cache the different configurations by path. -# -#================================================================================= - -my %cached_configs; - -sub handler { - my $r = shift; - - if ( my $config_path = $r->dir_config( 'Swish_Conf_File' ) ) { - - # Already cached? - if ( $cached_configs{ $config_path } ) { - process_request( $cached_configs{ $config_path } ); - return Apache::Constants::OK(); - } - - # Else, load config - my $config = default_config(); - $config->{config_file} = $config_path; - - # Merge with disk config file. 
- $cached_configs{ $config_path } = merge_read_config( $config ); - - process_request( $cached_configs{ $config_path } ); - return Apache::Constants::OK(); - } - - - # Otherwise, use hard-coded config - process_request( default_config() ); - - return Apache::Constants::OK(); - -} - #================================================================================== -# This sets the default configuration +# This sets the default configuration parameters +# # Any configuration read from disk is merged with these settings. # # Only a few settings are actually required. Some reasonable defaults are used @@ -140,18 +103,13 @@ sub default_config { - # make the search of the swish-e executable more flexible. First - # search in the PATH, then in the current dir. - my $exec = `which swish-e`; -#warn "found exec: $exec"; - chomp $exec; - $exec ||= './swish-e'; - die "Cannot find swish-e" unless -x $exec; + ##### Configuration Parameters ######### #---- This lists all the options, with many commented out --- # By default, this config is used -- see the process_request() call below. + # You should adjust for your site, and how your swish index was created. ##>> @@ -159,11 +117,24 @@ ##>> ##>> Send a small example, without all the comments. - # Items beginning with an "x" or "#" are commented out - + #====================================================================== + # NOTE: Items beginning with an "x" or "#" are commented out + # the "x" form simply renames (hides) that setting. It's used + # to make it easy to disable a mult-line configuation setting. + # + # If you do not understand a setting then best to leave the default. + # + # Please follow the documentation (perldoc swish.cgi) and set up + # a test using the defaults before making changes. It's much easier + # to modify a working example than to try to get a modified example to work ;) + # + # Again, this is a Perl hash structure. Commas are important. 
+ #====================================================================== + return { - title => 'Search our site', # Title of your choice. - swish_binary => $exec, # Location of swish-e binary + title => 'Search our site', # Title of your choice. Displays on the search page + swish_binary => './swish-e', # Location of swish-e binary + # By default, this script tries to read a config file. You should probably # comment this out if not used save a disk stat @@ -175,7 +146,7 @@ # If you have more than one index to search then specify an array # reference. e.g. swish_index =>[ qw/ index1 index2 index3 /], - swish_index => 'index.swish-e', # Location of your index file + swish_index => 'index.swish-e', # Location of your index file # See "select_indexes" below for how to # select more than one index. @@ -188,6 +159,8 @@ # But you can specify any PropertyName defined in your document. # By default, swish will return the pathname for documents that do not # have a title. + # In other words, this is used for the text of the links of the search results. + # <a href="prepend_path/swishdocpath">title_property</a> title_property => 'swishtitle', @@ -283,6 +256,9 @@ timeout => 10, # limit time used by swish when fetching results - DoS protection. + max_query_length => 100, # limit length of query string. Swish also has a limit (default is 40) + # You might want to set swish-e's limit higher, and use this to get a + # somewhat more friendly message. 
# These settings will use some crude highlighting code to highlight search terms in the @@ -337,7 +313,7 @@ #swish_index => [ qw/ index.swish-e index.other index2.other index3.other index4.other / ], Xselect_indexes => { - #method => 'radio_group', # pico radio_group, popup_menu, or checkbox_group + #method => 'radio_group', # pick radio_group, popup_menu, or checkbox_group method => 'checkbox_group', #method => 'popup_menu', columns => 3, @@ -357,7 +333,7 @@ Xselect_by_meta => { - #method => 'radio_group', # pico radio_group, popup_menu, or checkbox_group + #method => 'radio_group', # pick: radio_group, popup_menu, or checkbox_group method => 'checkbox_group', #method => 'popup_menu', columns => 3, @@ -409,12 +385,42 @@ }, + + # The "on_intranet" setting is just a flag that can be used to say you do + # not have an external internet connection. It's here because the default + # page generation includes links to images on swish-e.org and on www.w3.org. + # If this is set to one then those images will not be shown. + # (This only affects the default output module TemplateDefault) + + on_intranet => 0, + + + + # Here you can hard-code debugging options. They will help you find + # where you made your mistake ;) + # Using all at once will generate a lot of messages to STDERR + # Please see the documentation before using these. + # Typically, you will set these from the command line instead of in the configuration. + + # debug_options => 'basic, command, headers, output, summary, dump', + + + # This defines the package object for reading CGI parameters # Defaults to CGI. Might be useful with mod_perl. # request_package => 'CGI', # request_package => 'Apache::Request', + + # Minor adjustment to page display. The page navigation normally looks like: + # Page: 1 5 6 7 8 9 24 + # where the first page and last page are always displayed. 
These can be disabled by + # by setting to true values ( 1 ) + + no_first_page_navigation => 0, + no_last_page_navigation => 0, + @@ -458,6 +464,52 @@ } +#^^^^^^^^^^^^^^^^^^^^^^^^^ end of user config ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +#======================================================================================== + + + +#================================================================================= +# mod_perl entry point +# +# As an example, you might use a PerlSetVar to point to paths to different +# config files, and then cache the different configurations by path. +# +#================================================================================= + +my %cached_configs; + +sub handler { + my $r = shift; + + if ( my $config_path = $r->dir_config( 'Swish_Conf_File' ) ) { + + # Already cached? + if ( $cached_configs{ $config_path } ) { + process_request( $cached_configs{ $config_path } ); + return Apache::Constants::OK(); + } + + # Else, load config + my $config = default_config(); + $config->{config_file} = $config_path; + + # Merge with disk config file. 
+ $cached_configs{ $config_path } = merge_read_config( $config ); + + process_request( $cached_configs{ $config_path } ); + return Apache::Constants::OK(); + } + + + # Otherwise, use hard-coded config + process_request( default_config() ); + + return Apache::Constants::OK(); + +} + + #============================================================================ # Read config settings from disk, and merge # Note, all errors are ignored since by default this script looks for a @@ -467,16 +519,82 @@ sub merge_read_config { my $config = shift; + set_default_debug_flags(); + + set_debug($config); # get from config or from %ENV + return $config unless $config->{config_file}; my $return = do $config->{config_file}; return $config unless ref $return eq 'HASH'; + if ( $config->{debug} || $return->{debug} ) { + require Data::Dumper; + print STDERR "\n---------- Read config parameters from '$config->{config_file}' ------\n", + Data::Dumper::Dumper($return), + "-------------------------\n"; + } + + set_debug( $return ); + + # Merge settings return { %$config, %$return }; } +#-------------------------------------------------------------------------------------------------- +sub set_default_debug_flags { + # Debug flags defined + + $SwishSearch::DEBUG_BASIC = 1; # Show command used to run swish + $SwishSearch::DEBUG_COMMAND = 2; # Show command used to run swish + $SwishSearch::DEBUG_HEADERS = 4; # Swish output headers + $SwishSearch::DEBUG_OUTPUT = 8; # Swish output besides headers + $SwishSearch::DEBUG_SUMMARY = 16; # Summary of results parsed + $SwishSearch::DEBUG_DUMP_DATA = 32; # dump data that is sent to templating modules +} + + + + +#--------------------------------------------------------------------------------------------------- +sub set_debug { + my $conf = shift; + + unless ( $ENV{SWISH_DEBUG} ||$conf->{debug_options} ) { + $conf->{debug} = 0; + return; + } + + my %debug = ( + basic => [$SwishSearch::DEBUG_BASIC, 'Basic debugging'], + command => 
[$SwishSearch::DEBUG_COMMAND, 'Show command used to run swish'], + headers => [$SwishSearch::DEBUG_HEADERS, 'Show headers returned from swish'], + output => [$SwishSearch::DEBUG_OUTPUT, 'Show output from swish'], + summary => [$SwishSearch::DEBUG_SUMMARY, 'Show summary of results'], + dump => [$SwishSearch::DEBUG_DUMP_DATA, 'Show all data available to templates'], + ); + + + $conf->{debug} = 1; + + for ( split /\s*,\s*/, $ENV{SWISH_DEBUG} ) { + if ( exists $debug{ lc $_ } ) { + $conf->{debug} |= $debug{ lc $_ }->[0]; + next; + } + + print STDERR "Unknown debug option '$_'. Must be one of:\n", + join( "\n", map { sprintf(' %10s: %10s', $_, $debug{$_}->[1]) } sort { $debug{$a}->[0] <=> $debug{$b}->[0] }keys %debug), + "\n\n"; + exit; + } + + print STDERR "Debug level set to: $conf->{debug}\n"; +} + + #============================================================================ # # This is the main entry point, where a config hash is passed in. @@ -491,17 +609,64 @@ $request_package =~ s[::][/]g; require "$request_package.pm"; + my $request_object = $conf->{request_package} ? $conf->{request_package}->new : CGI->new; + + if ( $conf->{debug} ) { + print STDERR 'Enter a query [all]: '; + my $query = <STDIN>; + $query =~ tr/\r//d; + chomp $query; + unless ( $query ) { + print STDERR "Using 'not asdfghjklzxcv' to match all records\n"; + $query = 'not asdfghjklzxcv'; + } + + $request_object->param('query', $query ); + + print STDERR 'Enter max results to display [1]: '; + my $max = <STDIN>; + chomp $max; + $max = 1 unless $max && $max =~/^\d+$/; + + $conf->{page_size} = $max; + } + + # create search object my $search = SwishQuery->new( config => $conf, - request => ($conf->{request_package} ? 
$conf->{request_package}->new : CGI->new), + request => $request_object, ); # run the query my $results = $search->run_query; # currently, results is the just the $search object + if ( $conf->{debug} ) { + if ( $conf->{debug} & $SwishSearch::DEBUG_DUMP_DATA ) { + require Data::Dumper; + print STDERR "\n------------- Results structure passed to template ------------\n", + Data::Dumper::Dumper( $results ), + "--------------------------\n"; + } elsif ( $conf->{debug} & $SwishSearch::DEBUG_SUMMARY ) { + print STDERR "\n------------- Results Summary ------------\n"; + if ( $results->{hits} ) { + require Data::Dumper; + print STDERR "Showing $results->{navigation}{showing} of $results->{navigation}{hits}\n", + Data::Dumper::Dumper( $results->{_results} ); + } else { + print STDERR "** NO RESULTS **\n"; + } + + print STDERR "--------------------------\n"; + } else { + print STDERR ( ($results->{hits} ? "Found $results->{hits} results\n" : "Failed to find any results\n" . $results->errstr . "\n" ),"\n" ); + } + } + + + my $template = $conf->{template} || { package => 'TemplateDefault' }; my $package = $template->{package}; @@ -509,7 +674,21 @@ my $file = "$package.pm"; $file =~ s[::][/]g; - require $file; + eval { require $file }; + if ( $@ ) { + warn "$0 [EMAIL PROTECTED]"; + print <<EOF; +Content-Type: text/html + +<html> +<head><title>Software Error</title></head> +<body><h2>Software Error<h2><p>Please check error log</p></body> +</html> +EOF + + exit; +} + $package->show_template( $template, $results ); } @@ -522,6 +701,10 @@ #================================================================================================== use Carp; +# Or use this instead -- PLEASE see perldoc CGI::Carp for details +# <opinion>CGI::Carp doesn't help that much</opinion> +#use CGI::Carp; # qw(fatalsToBrowser); + #-------------------------------------------------------------------------------- # new() doesn't do much, just create the object @@ -626,7 +809,6 @@ my $conf = $self->{config}; 
- # Sets the query string, and any -L limits. return $self unless $self->build_query; @@ -656,24 +838,20 @@ - # Trap the call - not portable. - - my $timeout = $self->config('timeout'); - - if ( $timeout ) { - eval { - local $SIG{ALRM} = sub { die "Timed out\n" }; - alarm ( $self->config('timeout') || 5 ); - $self->run_swish; - alarm 0; - }; + my $timeout = $self->config('timeout') || 0; - if ( $@ ) { - $self->errstr( $@ ); - return $self; - } - } else { + eval { + local $SIG{ALRM} = sub { die "Timed out\n" }; + alarm $timeout if $timeout && $^O !~ /Win32/i; $self->run_swish; + alarm 0 unless $^O =~ /Win32/i; + waitpid $self->{pid}, 0 if $self->{pid}; # for IPC::Open2 + }; + + if ( $@ ) { + warn "$0 $@"; # if $conf->{debug}; + $self->errstr( "Service currently unavailable" ); + return $self; } @@ -764,7 +942,9 @@ $self->errstr('Please enter a query string') if $q->param('submit'); return; } - if ( length( $query ) > 100 ) { + + + if ( length( $query ) > $self->{config}{max_query_length} ) { $self->errstr('Please enter a shorter query'); return; } @@ -871,9 +1051,13 @@ eval { require DateRanges }; if ( $@ ) { - $self->errstr( $@ ); + print STDERR "\n------ Can't use DateRanges feature ------------\n", + "\nScript will run, but you can't use the date range feature\n", + $@, + "\n--------------\n" if $conf->{debug}; + delete $conf->{date_ranges}; - return; + return 1; } my $q = $self->{q}; @@ -931,15 +1115,17 @@ # Now set sort option - if a valid option submitted (or you could let swish-e return the error). my %sorts = map { $_, 1 } @$sorts_array; - if ( $q->param('sort') && $sorts{ $q->param('sort') } ) { + my $sortby = $q->param('sort') || 'swishrank'; + + if ( $sortby && $sorts{ $sortby } ) { - my $direction = $q->param('sort') eq 'swishrank' + my $direction = $sortby eq 'swishrank' ? $q->param('reverse') ? 'asc' : 'desc' : $q->param('reverse') ? 
'desc' : 'asc'; - $self->swish_command( '-s', $q->param('sort'), $direction ); + $self->swish_command( '-s', $sortby, $direction ); - if ( $conf->{secondary_sort} && $q->param('sort') ne $conf->{secondary_sort}[0] ) { + if ( $conf->{secondary_sort} && $sortby ne $conf->{secondary_sort}[0] ) { $self->swish_command(ref $conf->{secondary_sort} ? @{ $conf->{secondary_sort} } : $conf->{secondary_sort} ); } @@ -1017,8 +1203,8 @@ } @pages = $current_page..$current_page + $max_pages - 1; - unshift @pages, 0 if $current_page; - push @pages, $pages unless $current_page + $max_pages - 1 == $pages; + unshift @pages, 0 if $current_page && !$self->{config}{no_first_page_navigation}; + push @pages, $pages unless $current_page + $max_pages - 1 == $pages || $self->{config}{no_last_page_navigation} } @@ -1080,7 +1266,6 @@ # or possibly a scalar with an error message. # -use Symbol; sub run_swish { @@ -1091,8 +1276,6 @@ my $conf = $self->{config}; my $q = $self->{q}; - - my @properties; my %seen; @@ -1116,15 +1299,10 @@ $self->swish_command( -x => join( '\t', map { "<$_>" } @properties ) . '\n' ); $self->swish_command( -H => 9 ); - # Run swish - my $fh = gensym; - my $pid = open( $fh, '-|' ); + my $fh = $^O =~ /Win32/i + ? windows_fork( $conf, $self ) + : real_fork( $conf, $self ); - die "Failed to fork: $!\n" unless defined $pid; - - if ( !$pid ) { # in child - exec $self->{prog}, $self->swish_command or die "Failed to exec '$self->{prog}' Error:$!"; - } $self->{COMMAND} = join ' ', $self->{prog}, $self->swish_command; @@ -1142,13 +1320,20 @@ # Loop through values returned from swish. my %stops_removed; - + + my $unknown_output = ''; + + while (<$fh>) { chomp; + tr/\r//d; # This will not work correctly with multiple indexes when different values are used. 
if ( /^# ([^:]+):\s+(.+)$/ ) { + + print STDERR "$_\n" if $conf->{debug} & $SwishSearch::DEBUG_HEADERS; + my $h = lc $1; my $value = $2; $self->{_headers}{$h} = $value; @@ -1156,12 +1341,18 @@ push @{$self->{_headers}{'removed stopwords'}}, $value if $h eq 'removed stopword' && !$stops_removed{$value}++; next; + } elsif ( $conf->{debug} & $SwishSearch::DEBUG_OUTPUT ) { + print STDERR "$_\n"; } + - # return errors as text + # return swish errors as a mesage to the script $self->errstr($1), return if /^err:\s*(.+)/; + # Or, if you want to log the errors and just say "Service Unavailable" use this: + #die "$1\n" if /^err:\s*(.+)/; + # Found a result if ( /^\d/ ) { @@ -1189,7 +1380,8 @@ eval { require "$package.pm" }; if ( $@ ) { - $self->errstr( $@ ); + $self->errstr( "Failed to load Highlighting Module - check error log" ); + warn "$0: $@"; $highlight = ''; next; } else { @@ -1216,19 +1408,93 @@ $h{$trim_prop} = substr( $h{$trim_prop}, 0, $max) . ' <b>...</b>'; } } + + next; + } elsif ( /^\.$/ ) { + last; + + } else { + next if /^#/; } - # Might check for "\n." for end of results. + $unknown_output .= "'$_'\n"; + + } + die "Swish returned unknown output: $unknown_output\n" if $unknown_output; + $self->{hits} = @results; $self->{_results} = [EMAIL PROTECTED] if @results; } +#================================================================== +# Run swish-e by forking +# + +use Symbol; + +sub real_fork { + my ( $conf, $self ) = @_; + + + # Run swish + my $fh = gensym; + my $pid = open( $fh, '-|' ); + + die "Failed to fork: $!\n" unless defined $pid; + + + + if ( !$pid ) { # in child + if ( $conf->{debug} & $SwishSearch::DEBUG_COMMAND ) { + print STDERR "---- Running swish with the following command and parameters ----\n"; + print STDERR join( " \\\n", map { /[^\/.\-\w\d]/ ? 
qq['$_'] : $_ } $self->{prog}, $self->swish_command ); + print STDERR "\n-----------------------------------------------\n"; + } + + + unless ( exec $self->{prog}, $self->swish_command ) { + warn "Child process Failed to exec '$self->{prog}' Error: $!"; + print "Failed to exec Swish"; # send this message to parent. + exit; + } + } + + return $fh; +} + + +#===================================================================================== +# Windows work around +# from perldoc perlfok -- na, that doesn't work. Try IPC::Open2 +# +sub windows_fork { + my ( $conf, $self ) = @_; + + if ( $conf->{debug} & $SwishSearch::DEBUG_COMMAND ) { + print STDERR "---- Running swish with the following command and parameters ----\n"; + print STDERR join( ' ', map { /[^.\-\w\d]/ ? qq["$_"] : $_ } map { s/"/\\"/g; $_ } $self->{prog}, $self->swish_command ); + print STDERR "\n-----------------------------------------------\n"; + } + + + require IPC::Open2; + my ( $rdrfh, $wtrfh ); + + # Ok, I'll say it. Windows sucks. + my @command = map { s/"/\\"/g; $_ } $self->{prog}, $self->swish_command; + my $pid = IPC::Open2::open2($rdrfh, $wtrfh, @command ); + + + $self->{pid} = $pid; + + return $rdrfh; +} #===================================================================================== # This method parses out the query from the "Parsed words" returned by swish @@ -1347,138 +1613,309 @@ =head1 DESCRIPTION -C<swish.cgi> is an example CGI script for searching with the SWISH-E search engine version 2.1-dev and above. +C<swish.cgi> is a CGI script for searching with the SWISH-E search engine version 2.1-dev and above. It returns results a page at a time, with matching words from the source document highlighted, showing a few words of content on either side of the highlighted word. -The standard configuration should work with most swish index files. 
Customization of the parameters will be +The script is highly configurable; you can search multiple (or selectable) indexes, limit searches to +part of the index, allow sorting by a number of different properties, limit results to a date range, and so on. + +The standard configuration (i.e. not using a config file) should work with most swish index files. +Customization of the parameters will be needed if you are indexing special meta data and want to search and/or display the meta data. The configuration can be modified by editing this script directly, or by using a configuration file (.swishcgi.conf by default). +You are strongly encouraged to get the default configuration working before making changes. Most problems +using this script are the result of configuration modifications. + The script is modular in design. Both the highlighting code and output generation is handled by modules, which are included in the F<example/modules> directory. This allows for easy customization of the output without changing the main CGI script. A module exists to generate standard HTML output. There's also modules and -template examples to use with the popular templating systems HTML::Template and Template-Toolkit. This allows +template examples to use with the popular Perl templating systems HTML::Template and Template-Toolkit. This allows you to tightly integrate this script with the look of an existing template-driven web site. +HTML::Template and Template-Toolkit are available from the CPAN (http://search.cpan.org). This scipt can also run basically unmodified as a mod_perl handler, providing much better performance than running as a CGI script. -Due to the forking nature of this program and its use of signals, -this script probably will not run under Windows without some modifications. -There's plan to change this soon. +Please read the rest of the documentation. There's a C<DEBUGGING> section, and a C<FAQ> section. 
+ +This script should work on Windows, but security may be an issue. + +=head1 REQUIREMENTS + +You should be running a reasonably current version of Perl. 5.00503 or above is recommended (anything older +will not be supported). + +If you wish to use the date range feature you will need to install the Date::Calc module. This is available +from http://search.cpan.org. =head1 INSTALLATION -Installing a CGI application is dependent on your specific web server's configuration. -For this discussion we will assume you are using Apache in a typical configuration. For example, -a common location for the DocumentRoot is C</usr/local/apache/htdocs>. If you are installing this -on your shell account, your DocumentRoot might be C<~yourname/public_html>. +Here's an example installation session. Please get a simple installation working before modifying the +configuration file. Most problems reported for using this script have been due to improper configuration. -For the sake of this example we will assume the following: +The script's default settings are setup for initial testing. By default the settings expect to find +most files and the swish-e binary in the same directory as the script. - /usr/local/apache/htdocs - Document root - /usr/local/apache/cgi-bin - CGI directory +For I<security> reasons, once you have tested the script you will want to change settings to limit access +to some of these files by the web server +(either by moving them out of web space, or using access control such as F<.htaccess>). +An example of using F<.htaccess> on Apache is given below. -=head2 Move the files to their locations +It's expected that you have already unpacked the swish-e distribution +and built the swish-e binary (if using a source distribution). + +Below is a (unix) session where we create a directory, move required files into this directory, adjust +permissions, index some documents, and symlink into the web server. 
=over 4 -=item Copy the swish.cgi file to your CGI directory +=item 1 Move required files into their own directory. -Most web servers have a directory where CGI programs are kept. -Copy the C<swish.cgi> perl script into that directory if this is the case on your -server. You will need to provide read -and execute permisssions to the file. Exactly what permissions are needed again depends on -your specific configuration. For example, under Unix: +This assumes that swish-e was unpacked and built in the ~/swish-e directory. - chmod 0755 swish.cgi + ~ >mkdir swishdir + ~ >cd swishdir + ~/swishdir >cp ~/swish-e/example/swish.cgi . + ~/swishdir >cp -rp ~/swish-e/example/modules . + ~/swishdir >cp ~/swish-e/src/swish-e . + ~/swishdir >chmod 755 swish.cgi + ~/swishdir >chmod 644 modules/* -This gives the file owner (that's you) write access, and everyone read and execute access. -Note that you are not required to use a cgi-bin directory with Apache. You may place the -CGI script in any directory accessible via the web server and -enable it as a CGI script with something like the following -(place either in httpd.conf or in .htaccess): +=item 2 Create an index - <Files swish.cgi> - Allow from all - SetHandler cgi-script - Options +ExecCGI - </Files> +In this step you will create a simple configuration file. In this example the Apache documentation +is indexed. Last we run a simple query to test swish. -Using this method you don't even need to use the C<.cgi> extension. For example, rename -the script to "search" and then use that in the C<Files> directive. Take to your web -administrator for further information. - -=item Copy the modules directory - -Copying the modules directory is optional, but the script needs to find additional modules so you will -need to edit the script to point to the modules directory. Unlike CPAN modules that need to -be uncompressed, built, and installed, all you need to do is make sure the modules are some place where -the web server can read them. 
You may decide to leave them where you uncompressed the swish-e distribution, -or you may wish to move them to your perl library. + ~/swishdir >cat swish.conf + IndexDir /usr/local/apache/htdocs + IndexOnly .html .htm + DefaultContents HTML + StoreDescription HTML <body> 200000 + MetaNames swishdocpath swishtitle -=head1 CONFIGURATION + ~/swishdir >./swish-e -c swish.conf + Indexing Data Source: "File-System" + Indexing "/usr/local/apache/htdocs" + Removing very common words... + no words removed. + Writing main index... + Sorting words ... + Sorting 7005 words alphabetically + Writing header ... + Writing index entries ... + Writing word text: Complete + Writing word hash: Complete + Writing word data: Complete + 7005 unique words indexed. + 5 properties sorted. + 124 files indexed. 1485844 total bytes. 171704 total words. + Elapsed time: 00:00:02 CPU time: 00:00:02 + Indexing done! + +Now, verify that the index can be searched: + + ~/swishdir >./swish-e -w install -m 1 + # SWISH format: 2.1-dev-25 + # Search words: install + # Number of hits: 14 + # Search time: 0.001 seconds + # Run time: 0.040 seconds + 1000 /usr/local/apache/htdocs/manual/dso.html "Apache 1.3 Dynamic Shared Object (DSO) support" 17341 + . + +Let's see what files we have in our directory now: + + ~/swishdir >ls -1 -F + index.swish-e + index.swish-e.prop + modules/ + swish-e* + swish.cgi* + swish.conf -=head2 Configure the swish.cgi program +=item 3 Test the CGI script -Use a text editor and open the C<swish.cgi> program. +This is a simple step, but often overlooked. You should test from the command line instead of jumping +ahead and testing with the web server. See the C<DEBUGGING> section below for more information. 
-=over 4 + ~/swishdir >./swish.cgi | head + Content-Type: text/html; charset=ISO-8859-1 + + <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> + <html> + <head> + <title> + Search our site + </title> + </head> + <body> + +The above shows that the script can be run directly, and generates a correct HTTP header and HTML. -=item 1 Check the C<shebang> line +If you run the above and see something like this: -The first line of the program must point to the location of your perl program. Typical -examples are: + ~/swishdir >./swish.cgi + bash: ./swish.cgi: No such file or directory +then you probably need to edit the script to point to the correct location of your perl program. +Here's one way to find out where perl is located (again, on unix): + + ~/swishdir >which perl + /usr/local/bin/perl + + ~/swishdir >/usr/local/bin/perl -v + This is perl, v5.6.0 built for i586-linux + ... + +Good! We are using a reasonably current version of perl. You should be running +at least perl 5.005 (5.00503 really). You may have problems otherwise. + +Now that we know perl is at F</usr/local/bin/perl> we can adjust the "shebang" line +in the perl script (e.g. the first line of the script): + + ~/swishdir >pico swish.cgi + (edit the #! line) + ~/swishdir >head -1 swish.cgi #!/usr/local/bin/perl -w - #!/usr/bin/perl -w - #!/opt/perl/bin/perl -w -=item 2 Set the perl library path +=item 4 Test with your web server -The script must find the modules that the script is distributed with. These modules handle -the highlighting of the search terms, and the output generation. Again, where you place the -modules is up to you, and the only requirement is that the web server can access those files. +How you do this is completely dependent on your web server, and you may need to talk to your web +server admin to get this working. Often files with the .cgi extension are automatically set up to +run as CGI scripts, but not always. 
In other words, this step is really up to you to figure out! -You tell perl the location of the modules with the "use lib" directive. The default for this script is: +First, I create a symlink in Apache's document root to point to my test directory "swishdir". This will work +because I know my Apache server is configured to follow symbolic links. - use lib qw( modules ); + ~/swishdir >su -c 'ln -s /home/bill/swishdir /usr/local/apache/htdocs/swishdir' + Password: ********* -This says to look for the modules in the F<modules> directory of the current directory. +If your account is on an ISP and your web directory is F<~/public_html> then you might just move the entire +directory: -For example, say you want to leave the modules where you unpacked the swish-e distribution. If -you unpacked in your home directory of F</home/yourname/swish-e> then you must add this to the -script: + mv ~/swishdir ~/public_html - use lib qw( /home/yourname/swish-e/example/modules ); +Now, let's make a real HTTP request. I happen to have Apache setup on a local port: - + ~/swishdir >GET http://localhost:8000/swishdir/swish.cgi | head -3 + #!/usr/local/bin/perl -w + package SwishSearch; + use strict; -=item 3 Set the configuration parameters +Oh, darn. It looks like Apache is not running the script and instead returning it as a +static page. I need to tell Apache that swish.cgi is a CGI script. -To make things somewhat simple, the configuration parameters are included at the top of the program. -The parameters are all part of a perl C<hash> structure, and the comments at the top of the program should -get you going. 
+In my case F<.htaccess> comes to the rescue: + + ~/swishdir >cat .htaccess + + # Deny everything by default + Deny From All + + # But allow just CGI script + <files swish.cgi> + Options ExecCGI + Allow From All + SetHandler cgi-script + </files> + +Let's try the request one more time: + + ~/swishdir >GET http://localhost:8000/swishdir/swish.cgi | head + <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> + <html> + <head> + <title> + Search our site + </title> + </head> + <body> + <h2> + <a href="http://swish-e.org"> + +That looks better! Now use your web browser to test. + +Make sure you look at your web server's error log file while testing the script. + +BTW - "GET" is a program included with Perl's LWP library. If you do not have this you might +try something like: + + wget -O - http://localhost:8000/swishdir/swish.cgi | head + +and if nothing else, you can always telnet to the web server and make a basic request. + + ~/swishtest > telnet localhost 8000 + Trying 127.0.0.1... + Connected to localhost. + Escape character is '^]'. + GET /swishtest/swish.cgi http/1.0 + + HTTP/1.1 200 OK + Date: Wed, 13 Feb 2002 20:14:31 GMT + Server: Apache/1.3.20 (Unix) mod_perl/1.25_01 + Connection: close + Content-Type: text/html; charset=ISO-8859-1 + + <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> + <html> + <head> + <title> + Search our site + </title> + </head> + <body> + +This may seem like a lot of work compared to using a browser, but browsers +are a poor tool for basic CGI debugging. + + +=back + +If you have problems check the C<DEBUGGING> section below. -You will probably need to specify at least the location of the swish-e binary, your index file or files, -and a title. +=head1 CONFIGURATION + +If you want to change the location of the swish-e binary or the index file, use multiple indexes, add additional metanames and properties, +change the default highlighting behavior, etc., you will need to adjust the script's configuration settings. 
+ +Please get a test setup working with the default parameters before making changes to any configuration settings. +Better to debug one thing at a time... + +In general, you will need to adjust the script's settings to match the index file you are searching. For example, +if you are indexing a hypermail list archive you may want to make the script +use metanames/properties of Subject, Author, and, Email address. Or you may wish to provide a way to limit +searches to parts of your index file (e.g. parts of your directory tree). + +To make things somewhat "simple", the configuration parameters are included near the top of the swish.cgi program. +That is the only place that the individual parameters are defined and explained, so you will need to open up +the swish.cgi script in an editor to view the options. Further questions about individual settings should +be referred to the swish-e discussion list. -You have two options for changing the configuration settings from their default: +The parameters are all part of a perl C<hash> structure, and the comments at the top of the program should +get you going. The perl hash structure may seem a bit confusing, but it makes it easy to create nested and complex +parameters. + +You have two options for changing the configuration settings from their default values: you may edit the script directly, or you may use a configuration file. In either case, the configuration settings are a basic perl hash reference. -Using a configuration file is described below. +Using a configuration file is described below, but contains the same hash structure. -The configuration settings might look like: +There are many configuration settings, and some of them are commented out either by using +a "#" symbol, or by simply renaming the configuration directive (e.g. by adding an "x" to the parameter +name). + +A very basic configuration setup might look like: return { title => 'Search the Swish-e list', # Title of your choice. 
swish_binary => './swish-e', # Location of swish-e binary - swish_index => '../index.swish-e', # Location of your index file + swish_index => 'index.swish-e', # Location of your index file }; Or if searching more than one index: @@ -1486,31 +1923,25 @@ return { title => 'Search the Swish-e list', swish_binary => './swish-e', - swish_index => ['../index.swish-e', '../index2'], + swish_index => ['index.swish-e', 'index2'], }; -Both of these examples return a reference to a perl hash ( C<return {...}> ). Again, this same format is -used either at the top of this program, or in a configuration file. - -The examples above place the swish index file(s) -in the directory above the C<swish.cgi> CGI script. If using the example paths above -of C</usr/local/apache/cgi-bin> for the CGI bin directory, that means that the index file -is in C</usr/local/apache>. That places the index out of web space (e.g. cannot be accessed -via the web server), yet relative to where the C<swish.cgi> script is located. +Both of these examples return a reference to a perl hash ( C<return {...}> ). In the second example, +the multiple index files are set as an array reference. -(If running under mod_perl you will most likely specify absolute paths for your index files.) +Note that in the example above the swish-e binary file is relative to the current directory. +If running under mod_perl you will typically need to use absolute paths. -There's more than one way to do it, of course. -One option is to place the index in the same directory as the <swish.cgi> script, but -then be sure to use your web server's configuration to prohibit access to the index directly. +B<Using A Configuration File> -Another common option is to maintain a separate directory of the all your swish index files. This decision is -up to you. 
- -As mentioned above, you can either edit this script directly and modify the configuration settings, or +As mentioned above, you can either edit the F<swish.cgi> script directly and modify the configuration settings, or use an external configuration file. The settings in the configuration file are merged with (override) the settings defined in the script. +The advantage of using a configuration script is that you are not editing the swish.cgi script directly, and +downloading a new version won't mean re-editing the cgi script. Also, if running under mod_perl you can use the same +script loaded into Apache to manage many different search pages. + By default, the script will attempt to read from the file F<.swishcgi.conf>. For example, you might only wish to change the title used in the script. Simply create a file called F<.swishcgi.conf> in the same directory as the CGI script: @@ -1521,62 +1952,19 @@ title => 'Search Our Mailing List Archive', }; -Look at the default configuration settings at the top of this program for information on the available settings. - -=item 4 Create your index - -You must index your web site before you can begin to use the C<swish.cgi> script. -Create a configuration file called C<swish.conf> in the directory where you will store -the index file. - -This next example uses the file system to index your web documents. -In general, you will probably wish to I<spider> your web site if your web pages do not -map exactly to your file system, and to only index files available from links on you web -site. - -See B<Spidering> below for more information. 
- -Example C<swish.conf> file: - - # Define what to index - IndexDir /usr/local/apache/htdocs - IndexOnly .html .htm - - # Tell swish how to parse .html and .html documents - IndexContents HTML .html .htm - # And just in case we have files without an extension - DefaultContents HTML - - # Replace the path name with a URL - ReplaceRules replace /usr/local/apache/htdocs/ http://www.myserver.name/ - - # Allow limiting search to titles and URLs. - MetaNames swishdocpath swishtitle - - # Optionally use stemming for "fuzzy" searches - #UseStemming yes - -Now to index you simply run: - - swish-e -c swish.conf - -The default index file C<index.swish-e> will be placed in the current directory. - -Note that the above swish-e configuration defines two MetaNames "swishdocpath" and "swishtitle". -This allows searching just the document path or the title instead of the document's content. +The settings you use will depend on the index you create with swish. Here's a basic configuration: -Here's an expanded C<swish.cgi> configuration to make use of the above settings used while indexing: - - return { + return { title => 'Search the Apache documentation', swish_binary => './swish-e', swish_index => 'index.swish-e', metanames => [qw/swishdefault swishdocpath swishtitle/], - display_props => [qw/swishlastmodified swishdocsize swishdocpath/], - title_property => 'swishtitle', # Not required, but recommended + display_props => [qw/swishtitle swishlastmodified swishdocsize swishdocpath/], + title_property => 'swishdocpath', + prepend_path => 'http://myhost/apachedocs', name_labels => { - swishdefault => 'Body & Title', + swishdefault => 'Search All', swishtitle => 'Title', swishrank => 'Rank', swishlastmodified => 'Last Modified Date', @@ -1595,56 +1983,365 @@ The parameter "name_labels" is a hash (reference) that is used to give friendly names to the metanames. 
-Swish-e can store part of all of the contents of the documents as they are indexed, and this -"document description" can be returned with search results. +Here's another example. Say you want to search either (or both) the Apache 1.3 documentation or the +Apache 2.0 documentation: + + return { + title => 'Search the Apache Documentation', + date_ranges => 0, + swish_index => [ qw/ index.apache index.apache2 / ], + select_indexes => { + method => 'checkbox_group', + labels => [ '1.3.23 docs', '2.0 docs' ], # Must match up one-to-one to swish_index + description => 'Select: ', + }, + + }; + +Now you can select either or both sets of documentation while searching. + + +Please refer to the default configuration settings near the top of the script for details on +the available settings. + +=head1 DEBUGGING + +Most problems with using this script have been a result of improper configuration. Please +get the script working with default settings before adjusting the configuration settings. + +The key to debugging CGI scripts is to run them from the command line, not with a browser. + +First, make sure the program compiles correctly: + + > perl -c swish.cgi + swish.cgi syntax OK + +Next, simply try running the program: + + > ./swish.cgi | head + Content-Type: text/html; charset=ISO-8859-1 + + <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> + <html> + <head> + <title> + Search our site + </title> + </head> + <body> + +Now, you know that the program compiles and will run from the command line. +Next, try accessing the script from a web browser. + +If you see the contents of the CGI script instead of its output then your web server is +not configured to run the script. You will need to look at settings like ScriptAlias, SetHandler, +and Options. + +If an error is reported (such as Internal Server Error or Forbidden) +you need to locate your web server's error_log file +and carefully read what the problem is. Contact your web administrator for help. 
+ +If you don't have access to the web server's error_log file, you can modify the script to report +errors to the browser screen. Open the script and search for "CGI::Carp". (Author's suggestion is +to debug from the command line -- adding the browser and web server into the equation only complicates +debugging.) + +The script does offer some basic debugging options that allow debugging from the command line. +The debugging options are enabled by setting +an environment variable "SWISH_DEBUG". How that is set depends on your operating system and the +shell you are using. These examples are using the "bash" shell syntax. + +Note: You can also use the "debug_options" configuration setting, but the recommended method +is to set the environment variable. + +You can list the available debugging options like this: + + >SWISH_DEBUG=help ./swish.cgi >outfile + Unknown debug option 'help'. Must be one of: + basic: Basic debugging + command: Show command used to run swish + headers: Show headers returned from swish + output: Show output from swish + summary: Show summary of results + dump: Show all data available to templates + +As you work your way down the list you will get more detailed output. You can combine +options like: + + >SWISH_DEBUG=command,headers,summary ./swish.cgi >outfile + +You will be asked for an input query and the max number of results to return. You can use the defaults +in most cases. It's a good idea to redirect output to a file. Any error messages are sent to stderr, so +those will still be displayed (unless you redirect stderr, too). 
+ +Here are some examples: + + ~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile + Debug level set to: 1 + Enter a query [all]: + Using 'not asdfghjklzxcv' to match all records + Enter max results to display [1]: + + ------ Can't use DateRanges feature ------------ + + Script will run, but you can't use the date range feature + Can't locate Date/Calc.pm in @INC (@INC contains: modules /usr/local/lib/perl5/5.6.0/i586-linux /usr/local/lib/perl5/5.6.0 /usr/local/lib/perl5/site_perl/5.6.0/i586-linux /usr/local/lib/perl5/site_perl/5.6.0 /usr/local/lib/perl5/site_perl/5.005/i586-linux /usr/local/lib/perl5/site_perl/5.005 /usr/local/lib/perl5/site_perl .) at modules/DateRanges.pm line 107, <STDIN> line 2. + BEGIN failed--compilation aborted at modules/DateRanges.pm line 107, <STDIN> line 2. + Compilation failed in require at ./swish.cgi line 971, <STDIN> line 2. + + -------------- + Can't exec "./swish-e": No such file or directory at ./swish.cgi line 1245, <STDIN> line 2. + Child process Failed to exec './swish-e' Error: No such file or directory at ./swish.cgi line 1246, <STDIN> line 2. + Failed to find any results + +The above told me about two problems. First, it's telling me that the Date::Calc module is not installed. +The Date::Calc module is needed to use the date limiting feature of the script. + +The second problem is a bit more serious. It's saying that the script can't find the swish-e binary file. +I simply forgot to copy it. + + ~/swishtest >cp ~/swish-e/src/swish-e . + ~/swishtest >cat .swishcgi.conf + return { + title => 'Search the Apache Documentation', + date_ranges => 0, + }; + +Now, let's try again: - # Store the text of the documents within the swish index file - StoreDescription HTML <body> 100000 + ~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile + Debug level set to: 1 -Adding the above to your C<swish.conf> file tells swish-e to store up to 100,000 characters from the body of each document within the -swish-e index. 
To display this information in search results, highlighting search terms, -use the follow configuration in C<swish.cgi>: + ---------- Read config parameters from '.swishcgi.conf' ------ + $VAR1 = { + 'date_ranges' => 0, + 'title' => 'Search the Apache Documentation' + }; + ------------------------- + Enter a query [all]: + Using 'not asdfghjklzxcv' to match all records + Enter max results to display [1]: + Found 1 results + + Can't locate TemplateDefault.pm in @INC (@INC contains: modules /usr/local/lib/perl5/5.6.0/i586-linux /usr/local/lib/perl5/5.6.0 /usr/local/lib/perl5/site_perl/5.6.0/i586-linux /usr/local/lib/perl5/site_perl/5.6.0 /usr/local/lib/perl5/site_perl/5.005/i586-linux /usr/local/lib/perl5/site_perl/5.005 /usr/local/lib/perl5/site_perl .) at ./swish.cgi line 608. + +Bother. I fixed the first two problems, but now there's this new error. Oh, I somehow forgot to +copy the modules directory. The obvious way to fix that is to copy the directory. But, there may +be times where you want to put the module directory in another location. 
So, let's modify the +F<.swishcgi.conf> file and add a "use lib" setting: + + ~/swishtest >cat .swishcgi.conf + use lib '/home/bill/swish-e/example/modules'; return { - title => 'Search the Apache documentation', - swish_binary => './swish-e', - swish_index => 'index.swish-e', - metanames => [qw/swishdefault swishdocpath swishtitle/], - display_props => [qw/swishlastmodified swishdocsize swishdocpath/], - title_property => 'swishtitle', # Not required, but recommended - description_prop=> 'swishdescription', + title => 'Search the Apache Documentation', + date_ranges => 0, + }; - name_labels => { - swishdefault => 'Body & Title', - swishtitle => 'Title', - swishrank => 'Rank', - swishlastmodified => 'Last Modified Date', - swishdocpath => 'Document Path', - swishdocsize => 'Document Size', + ~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile + Debug level set to: 1 + + ---------- Read config parameters from '.swishcgi.conf' ------ + $VAR1 = { + 'date_ranges' => 0, + 'title' => 'Search the Apache Documentation' + }; + ------------------------- + Enter a query [all]: + Using 'not asdfghjklzxcv' to match all records + Enter max results to display [1]: + Found 1 results + +Now we're talking. + +Here's a common problem. Everything checks out, but when you run the script you see the message: + + Swish returned unknown output + +Ok, let's find out what output it is returning: + + ~/swishtest >SWISH_DEBUG=headers,output ./swish.cgi >outfile + Debug level set to: 13 + + ---------- Read config parameters from '.swishcgi.conf' ------ + $VAR1 = { + 'swish_binary' => '/usr/local/bin/swish-e', + 'date_ranges' => 0, + 'title' => 'Search the Apache Documentation' + }; + ------------------------- + Enter a query [all]: + Using 'not asdfghjklzxcv' to match all records + Enter max results to display [1]: + usage: swish [-i dir file ... ] [-S system] [-c file] [-f file] [-l] [-v (num)] + ... 
+ version: 2.0 + docs: http://sunsite.berkeley.edu/SWISH-E/ + + *** 9872 Failed to run swish: 'Swish returned unknown output' *** + Failed to find any results + +Oh, looks like /usr/local/bin/swish-e is version 2.0 of swish. We need 2.1-dev and above! + +=head1 Frequently Asked Questions + +Here are some common questions and answers. + +=head2 How do I change the way the output looks? + +The script uses a module to generate output. By default it uses the TemplateDefault.pm module. +The module used can be selected in the configuration file. + +If you want to make simple changes you can edit the TemplateDefault.pm module directly. If you want to +copy a module, you must also change the "package" statement at the top of the module. For example: + + cp TemplateDefault.pm MyTemplateDefault.pm + +Then at the top of the module adjust the "package" line to: + + package MyTemplateDefault; + +To use this module you need to adjust the configuration settings (either at the top of F<swish.cgi> or in +a configuration file): + + + template => { + package => 'MyTemplateDefault', }, - highlight => { - package => 'PhraseHighlight', - meta_to_prop_map => { # this maps search metatags to display properties - swishdefault => [ qw/swishtitle swishdescription/ ], - swishtitle => [ qw/swishtitle/ ], - swishdocpath => [ qw/swishdocpath/ ], + + +=head2 How do I use a templating system with swish.cgi? + +In addition to the TemplateDefault.pm module, the swish-e distribution includes two other Perl modules for +generating output using the templating systems HTML::Template and Template-Toolkit. + +Templating systems use template files to generate the HTML, and make maintaining the look of a large (or small) site +much easier. HTML::Template and Template-Toolkit are separate packages and can be downloaded from the CPAN. +See http://search.cpan.org. + +Two basic templates are provided as examples for generating output using these templating systems. 
+The example templates are located in the F<example> directory. +The module F<TemplateHTMLTemplate.pm> uses the file F<swish.tmpl> to generate its output, while the +module F<TemplateToolkit.pm> uses the F<search.tt> file. + +To use either of these modules you will need to adjust the "template" configuration setting. Examples for +both templating systems are provided in the configuration settings near the top of the F<swish.cgi> program. + +Use of these modules is an advanced usage of F<swish.cgi>, and they are provided as examples only. + +All of the output generation modules are passed a hash with the results from the search, plus other data used to create the +output page. You can see this hash by using the debugging option "dump" or by using the TemplateDumper.pm +module: + + ~/swishtest >cat .swishcgi.conf + return { + title => 'Search the Apache Documentation', + template => { + package => 'TemplateDumper', }, - } + }; - }; +And run a query. For example: + http://localhost:8000/swishtest/swish.cgi?query=install -Other C<swish.cgi> configuration settings are available, and are listed at the top of the F<swish.cgi> -script. +=head2 Why are there three different highlighting modules? +There are three highlighting modules included with the swish-e distribution. +Each is a trade-off of speed vs. accuracy: -=back + DefaultHighlight.pm - reasonably fast, but does not highlight phrases + PhraseHighlight.pm - reasonably slow, but is reasonably accurate + SimpleHighlight.pm - fast, some phrases, but least accurate + +Eh, the default is actually "PhraseHighlight.pm". Oh well. + +Optimizations to these modules are welcome! + +=head2 My ISP doesn't provide access to the web server logs + +There are a number of options. One way is to use the CGI::Carp module. 
Search in the +swish.cgi script for: + + use Carp; + # Or use this instead -- PLEASE see perldoc CGI::Carp for details + # use CGI::Carp qw(fatalsToBrowser warningsToBrowser); + +And change it to look like: + + #use Carp; + # Or use this instead -- PLEASE see perldoc CGI::Carp for details + use CGI::Carp qw(fatalsToBrowser warningsToBrowser); -You should now be ready to run your search engine. Point your browser to: +This should be only for debugging purposes, as if used in production you may end up sending +quite ugly and confusing messages to your browsers. - http://www.myserver.name/cgi-bin/swish.cgi +=head2 Why does the output show (NULL)? + +The most common reason is that you did not use StoreDescription in your config file while indexing. + + StoreDescription HTML <body> 200000 + +That tells swish to store the first 200,000 characters of text extracted from the body of each document parsed +by the HTML parser. The text is stored as property "swishdescription". Running: + + ~/swishtest > ./swish-e -T index_metanames + +will display the properties defined in your index file. + +This can happen with other properties, too. +For example, this will happen when you are asking for a property to display that is not defined in swish. + + ~/swishtest > ./swish-e -w install -m 1 -p foo + # SWISH format: 2.1-dev-25 + # Search words: install + err: Unknown Display property name "foo" + . + + ~/swishtest > ./swish-e -w install -m 1 -x 'Property foo=<foo>\n' + # SWISH format: 2.1-dev-25 + # Search words: install + # Number of hits: 14 + # Search time: 0.000 seconds + # Run time: 0.038 seconds + Property foo=(NULL) + . + +To check that a property exists in your index you can run: + + ~/swishtest > ./swish-e -w not dkdk -T index_metanames | grep foo + foo : id=10 type=70 META_PROP:STRING(case:ignore) *presorted* + +Ok, in this case we see that "foo" is really defined as a property. 
Now let's make sure F<swish.cgi> +is asking for "foo" (sorry for the long lines): + + ~/swishtest > SWISH_DEBUG=command ./swish.cgi > /dev/null + Debug level set to: 3 + Enter a query [all]: + Using 'not asdfghjklzxcv' to match all records + Enter max results to display [1]: + ---- Running swish with the following command and parameters ---- + ./swish-e \ + -w \ + 'swishdefault=(not asdfghjklzxcv)' \ + -b \ + 1 \ + -m \ + 1 \ + -f \ + index.swish-e \ + -s \ + swishrank \ + desc \ + swishlastmodified \ + desc \ + -x \ + '<swishreccount>\t<swishtitle>\t<swishdescription>\t<swishlastmodified>\t<swishdocsize>\t<swishdocpath>\t<fos>\t<swishrank>\t<swishdocpath>\n' \ + -H \ + 9 + +If you look carefully you will see that the -x parameter has "fos" instead of "foo", so there's our problem. -adjusting the server and URL to match your system, of course. =head1 MOD_PERL @@ -1684,41 +2381,6 @@ Please post to the swish-e discussion list if you have any questions about running this script under mod_perl. - -=head1 DEBUGGING - -The key to debugging CGI scripts is to run them from the command line, not with a browser. - -First, make sure the program compiles correctly: - - > perl -c swish.cgi - swish.cgi syntax OK - -Next, simply try running the program: - - > ./swish.cgi | head - Content-Type: text/html; charset=ISO-8859-1 - - <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> - <html> - <head> - <title> - Search our site - </title> - </head> - <body> - -Now, you know that the program compiles and will run from the command line. -Next, try accessing the script from a web browser. - -If you see the contents of the CGI script instead of its output then your web server is -not configured to run the script. You will need to look at settings like ScriptAlias, SetHandler, -and Options. - -If an error is reported (such as Internal Server Error or Forbidden) -you need to locate your web server's error_log file -and carefully read what the problem is. 
Contact your web administrator for help. - =head1 Spidering @@ -1835,6 +2497,8 @@ See http://www.w3.org/Security/Faq/www-security-faq.html +Security on Windows is questionable. + =head1 SUPPORT The SWISH-E discussion list is the place to ask for any help regarding SWISH-E or this example @@ -1844,11 +2508,11 @@ http://swish-e.org/2.2/docs/INSTALL.html#When_posting_please_provide_the_ -Please do not contact the author directly. +Please do not contact the author or any of the swish-e developers directly. =head1 LICENSE -swish.cgi $Revision: 1.2 $ Copyright (C) 2001 Bill Moseley [EMAIL PROTECTED] +swish.cgi $Revision: 1.3 $ Copyright (C) 2001 Bill Moseley [EMAIL PROTECTED] Example CGI program for searching with SWISH-E 1.4 +16 -2 modperl-docs/src/search/swish.conf Index: swish.conf =================================================================== RCS file: /home/cvs/modperl-docs/src/search/swish.conf,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- swish.conf 4 Feb 2002 09:22:27 -0000 1.3 +++ swish.conf 3 Mar 2002 11:27:22 -0000 1.4 @@ -1,5 +1,19 @@ IndexDir ./spider.pl DefaultContents HTML2 StoreDescription HTML2 <body> 100000 -MetaNames swishtitle swishdocpath -SwishProgParameters default http://localhost/modperl-site/ +MetaNames swishtitle swishdocpath section + +# This is to make the URLs shorter in the display. +ReplaceRules remove http://perl.apache.org + +# For example, on my test setup I might do something like: +# Need ".." since search is one level down + +ReplaceRules replace http://mardy:40994/dst_html .. + + +UndefinedMetaTags ignore + +#BuzzWords in highlighting -- +#How about counting highlighted terms individually in the highlight module +#so every term is highlighted at least once, with a total of say five.
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]