stas        02/02/04 01:22:27

  Modified:    src/search swish.conf
  Added:       src/search README SwishSpiderConfig.pl
  Log:
  - Add README with search instructions (how to index + search)
  - add Bill Moseley's sections parser
  - tidy up

  Revision  Changes    Path
  1.3       +1 -0      modperl-docs/src/search/swish.conf

Index: swish.conf
===================================================================
RCS file: /home/cvs/modperl-docs/src/search/swish.conf,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- swish.conf	31 Jan 2002 01:51:50 -0000	1.2
+++ swish.conf	4 Feb 2002 09:22:27 -0000	1.3
@@ -2,3 +2,4 @@
 DefaultContents HTML2
 StoreDescription HTML2 <body> 100000
 MetaNames swishtitle swishdocpath
+SwishProgParameters default http://localhost/modperl-site/

  1.1       modperl-docs/src/search/README

Index: README
===================================================================
This document explains how to set up swish-e, and how to index and
search the perl.apache.org site.

Setting up swish-e:
-------------------

- Install the dev version of swish-e. Currently we use SWISH-E
  2.1-dev-25.

- Make sure that swish-e is in the PATH, so the apps will be able to
  find it.

Indexing:
---------

1. Build the site as usual:

   % bin/build -f

   (-d to build PDFs) which among other things creates the dir:
   dst_html/search

2. Check that swish.conf points to the right base URL, e.g.:

   SwishProgParameters default http://localhost/modperl-site/

3. Index the site:

   % cd dst_html/search
   % swish-e -S prog -c swish.conf

   You should see something like:

   Indexing Data Source: "External-Program"
   Indexing "./spider.pl"
   ./spider.pl: Reading parameters from 'default'
   Summary for: http://localhost/modperl-site/
   Duplicates:        5,357  (281.9/sec)
   Off-site links:    1,851  (97.4/sec)
   Total Bytes:   8,107,112  (426690.1/sec)
   Total Docs:          351  (18.5/sec)
   Unique URLs:         419  (22.1/sec)

   Removing very common words...
   no words removed.
   Writing main index...
   Sorting words ...
   Sorting 10599 words alphabetically
   Writing header ...
   Writing index entries ...
     Writing word text: Complete
     Writing word hash: Complete
     Writing word data: Complete
   10599 unique words indexed.
   5 properties sorted.
   351 files indexed.  8107112 total bytes.  307356 total words.
   Elapsed time: 00:00:20 CPU time: 00:00:02
   Indexing done!

   Now you can search...

Searching:
----------

1. Go to the search page: ..../search/search.html

2. Search.

If something doesn't work, check the error_log file on the server
where swish.cgi is running. The most common error is that the swish-e
binary cannot be found by the swish.cgi script. Remember that the CGI
script may be running under a different username and therefore may
not have the same PATH env variable (see the PATH sketch at the end
of this README).

Swish-e related adjustments to the template:
--------------------------------------------

- Since we want to index only the real content, we use:

  <!-- Swishcommand index -->
  only content here will be indexed
  <!-- Swishcommand noindex -->
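For example, a page template might wrap the searchable content like
this (the <div> markup below is only an illustration, not the actual
template):

  <!-- Swishcommand index -->
  <div class="body">
    ... the real document content ...
  </div>
  <!-- Swishcommand noindex -->
  <div class="navbar">
    ... menus, footers and other boilerplate ...
  </div>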
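A note on the PATH problem mentioned in the Searching section: one
way to make sure swish.cgi can find the binary is to set PATH
explicitly near the top of the script (the directories below are just
an example -- adjust to wherever swish-e is installed):

  $ENV{PATH} = '/usr/local/bin:/usr/bin:/bin';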
  1.1       modperl-docs/src/search/SwishSpiderConfig.pl

Index: SwishSpiderConfig.pl
===================================================================
# This is the modified default spider config file that comes with swish-e.
#
# A few custom callbacks are located after the @servers definition section.

@servers = (
    {
        base_url => 'http://mardy:40994/dst_html/index.html',

        # Debugging -- see perldoc spider.pl
        #base_url  => 'http://mardy.hank.org:40994/dst_html/docs/guide/index.html',
        #max_depth => 1,
        #debug     => DEBUG_HEADERS,
        #debug     => DEBUG_URL|DEBUG_SKIPPED|DEBUG_INFO,
        #debug     => DEBUG_LINKS,

        keep_alive => 1,          # enable keep-alive requests
        email      => '[EMAIL PROTECTED]',
        use_md5    => 1,          # catch duplicates ( / and /index.html )
        delay_min  => .0001,

        # Ignore image files
        test_url => sub { $_[0]->path !~ /\.(?:gif|jpe?g|png)$/i },

        # Only index text/html
        test_response => sub { return $_[2]->content_type =~ m[text/html] },

        # split content - comment out to disable splitting
        filter_content => \&split_page,

        # optionally validate external links
        validate_links => 1,
    },
);

use HTML::TreeBuilder;
use HTML::Element;

# filter_content callback: split each page into its "index_section"
# divs and feed every section to the indexer as a separate document.
sub split_page {
    my %params;
    @params{ qw/ uri server response content / } = @_;

    $params{found} = 0;

    my $tree = HTML::TreeBuilder->new;
    $tree->parse( ${$params{content}} );  # Why not allow a scalar ref?
    $tree->eof;

    my $head = $tree->look_down( '_tag', 'head' );

    for my $section ( $tree->look_down( '_tag', 'div', 'class', 'index_section' ) ) {
        create_page( $head->clone, $section->clone, \%params );
    }

    $tree->delete;

    return !$params{found};  # tell spider.pl to not index the page itself
}

# Build a standalone HTML document from one section and hand it to the
# indexer, using the section's anchor name as the URL fragment.
sub create_page {
    my ( $head, $section, $params ) = @_;

    my $uri = $params->{uri};

    my $section_name = 'Unknown_Section';
    my $name = $section->look_down( '_tag', 'a',
                                    sub { defined( $_[0]->attr('name') ) } );
    if ( $name ) {
        $section_name = $name->attr('name');
        $uri->fragment( $section_name );
    }

    my $text_title = $section_name;
    $text_title =~ tr/_/ /s;

    my $title = $head->look_down( '_tag', 'title' );
    if ( $title ) {
        $title->push_content( ": $text_title" );
    }
    else {
        my $title = HTML::Element->new('title');
        $title->push_content( $text_title );  # no base title to append to
        $head->push_content( $title );
    }

    my $body = HTML::Element->new('body');
    my $doc  = HTML::Element->new('html');

    $body->push_content( $section );
    $doc->push_content( $head, $body );

    my $new_content = $doc->as_HTML( undef, "\t" );

    output_content( $params->{server}, \$new_content,
                    $uri, $params->{response} );

    $uri->fragment( undef );
    $params->{found}++;   # set flag

    $doc->delete;
}
1;
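# For reference, split_page() expects each logical section of a page to
# be marked up roughly like the sketch below (the class name comes from
# the code above; the anchor name "Description" is only an example):
#
#   <div class="index_section">
#     <a name="Description"></a>
#     <h2>Description</h2>
#     ... section content ...
#   </div>
#
# Each such div is indexed as its own document, with the anchor name
# appended to the page title and added to the URL as a fragment.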