stas        02/02/04 01:22:27

  Modified:    src/search swish.conf
  Added:       src/search README SwishSpiderConfig.pl
  Log:
  - Add README with search instructions (how to index + search)
  - add Bill Moseley's sections parser
  - tidy up
  
  Revision  Changes    Path
  1.3       +1 -0      modperl-docs/src/search/swish.conf
  
  Index: swish.conf
  ===================================================================
  RCS file: /home/cvs/modperl-docs/src/search/swish.conf,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- swish.conf        31 Jan 2002 01:51:50 -0000      1.2
  +++ swish.conf        4 Feb 2002 09:22:27 -0000       1.3
  @@ -2,3 +2,4 @@
   DefaultContents HTML2
   StoreDescription HTML2 <body> 100000
   MetaNames swishtitle swishdocpath
  +SwishProgParameters default http://localhost/modperl-site/
  
  
  
  1.1                  modperl-docs/src/search/README
  
  Index: README
  ===================================================================
  This document explains how to setup swish-e, index and search the
  perl.apache.org site.
  
  Setting up swish-e:
  -------------------
  
  - Install the dev version of swish-e.  Currently we use SWISH-E 2.1-dev-25.
  
  - make sure that swish-e is in the PATH, so the apps will be able to
     find it
  
  Indexing:
  ---------
  
  1. Build the site normally:
  
    % bin/build -f (-d to build pdfs)
  
  which among other things creates the dir: dst_html/search
  
  2. check that swish.conf points to the right base URL, e.g.:
  
    SwishProgParameters default http://localhost/modperl-site/
  
  3. Index the site
  
    % cd dst_html/search
    % swish-e -S prog -c swish.conf
  
  You should see something like:
  
    Indexing Data Source: "External-Program"
    Indexing "./spider.pl"
    ./spider.pl: Reading parameters from 'default'
    
    Summary for: http://localhost/modperl-site/
        Duplicates:     5,357  (281.9/sec)
    Off-site links:     1,851  (97.4/sec)
       Total Bytes: 8,107,112  (426690.1/sec)
        Total Docs:       351  (18.5/sec)
       Unique URLs:       419  (22.1/sec)
    Removing very common words...
    no words removed.
    Writing main index...
    Sorting words ...
    Sorting 10599 words alphabetically
    Writing header ...
    Writing index entries ...
      Writing word text: Complete
      Writing word hash: Complete
      Writing word data: Complete
    10599 unique words indexed.
    5 properties sorted.                                              
    351 files indexed.  8107112 total bytes.  307356 total words.
    Elapsed time: 00:00:20 CPU time: 00:00:02
    Indexing done!
  
  Now you can search...
  
  Searching:
  ----------
  
  1. Go to the search page: ..../search/search.html
  
  2. Search
  
  If something doesn't work check the error_log file on the server the
  swish.cgi is running on. The most common error is that the swish-e
  binary cannot be found by the swish.cgi script. Remember that CGI may
  be running under a different username and therefore may not have the
  same PATH env variable.
  
  
  Swish-e related adjustments to the template:
  --------------------------------------------
  
  - since we want to index only the real content, we use:
    <!-- Swishcommand index -->,
       only content here will be indexed
    <!-- Swishcommand noindex -->,
  
  
  
  
  
  
  1.1                  modperl-docs/src/search/SwishSpiderConfig.pl
  
  Index: SwishSpiderConfig.pl
  ===================================================================
  # this is the modified default spider config file that comes with swish-e.
  #
  # a few custom callbacks are located after the @servers definition section.
  
  # Spider configuration consumed by swish-e's spider.pl.
  @servers = (
      {
          base_url        => 'http://mardy:40994/dst_html/index.html',

          # Debugging -- see perldoc spider.pl

          #base_url  => 'http://mardy.hank.org:40994/dst_html/docs/guide/index.html',
          #max_depth => 1,
          #debug => DEBUG_HEADERS,
          #debug => DEBUG_URL|DEBUG_SKIPPED|DEBUG_INFO,
          #debug => DEBUG_LINKS,

          keep_alive      => 1,         # enable keep-alive requests
          email           => '[EMAIL PROTECTED]',

          use_md5         => 1,    # catch duplicates ( / and /index.html )

          delay_min       => .0001,

          # Ignore image files.  Note: no dot inside the alternation --
          # the previous /\.(?:gif|jpe?g|.png)$/ let "." match any char,
          # so it also skipped URLs merely ending in "png" (e.g. foo-png).
          test_url        => sub { $_[0]->path !~ /\.(?:gif|jpe?g|png)$/i },

          # Only index text/html
          test_response   => sub { return $_[2]->content_type =~ m[text/html] },

          # split content - comment out to disable splitting
          filter_content  => \&split_page,

          # optionally validate external links
          validate_links => 1,
      },

  );
  
  use HTML::TreeBuilder;
  use HTML::Element;
  
  # filter_content callback for spider.pl: split a fetched page into its
  # <div class="index_section"> pieces and emit each as a separate
  # "document" (via create_page), so search hits land on the section
  # anchor instead of the whole page.
  #
  # Args (supplied by spider.pl): URI object, server config hashref,
  # HTTP::Response, scalar ref to the page content.
  # Returns true to let spider.pl index the page itself, false when the
  # page was emitted as individual sections instead.
  sub split_page {

      my %params;
      @params{ qw/ uri server response content / } = @_;
      $params{found} = 0;    # number of sections emitted by create_page()


      my $tree = HTML::TreeBuilder->new;
      $tree->parse( ${$params{content}} );  # Why not allow a scalar ref?
      $tree->eof;

      my $head = $tree->look_down( '_tag', 'head' );

      # A page without a <head> cannot be split (create_page clones the
      # head for every section) -- index it whole instead of dying on
      # an undef method call.
      unless ( $head ) {
          $tree->delete;
          return 1;
      }

      for my $section ( $tree->look_down( '_tag', 'div',
                                          'class', 'index_section' ) ) {
          create_page( $head->clone, $section->clone, \%params );
      }

      $tree->delete;

      return !$params{found};  # tell spider.pl to not index the page
  }
  
  # Build a standalone HTML document for one index_section <div> and hand
  # it to spider.pl's output_content() so swish-e indexes it as its own
  # document.  The section's first named anchor becomes the URI fragment
  # and is appended to the page <title>.
  sub create_page {
      my ( $head, $section, $params ) = @_;

      my $uri = $params->{uri};

      # Find the first <a name="..."> inside the section; it names the
      # section and provides the in-page anchor for search results.
      my $section_name = 'Unknown_Section';
      my $name = $section->look_down( '_tag', 'a',
                                      sub { defined($_[0]->attr('name')) } );

      if ( $name ) {
          $section_name = $name->attr('name');
          $uri->fragment( $section_name );
      }

      # Anchor names use underscores for spaces; squash runs into one.
      my $text_title = $section_name;
      $text_title =~ tr/_/ /s;

      my $title = $head->look_down('_tag', 'title');

      if ( $title ) {
          $title->push_content(": $text_title");
      }
      else {
          # No <title> in the cloned head: create one.  No ": " prefix
          # here -- there is no existing title text to separate from
          # (the old code produced titles beginning with a bare ": ").
          $title = HTML::Element->new('title');
          $title->push_content($text_title);
          $head->push_content( $title );
      }

      my $body = HTML::Element->new('body');
      my $doc  = HTML::Element->new('html');

      $body->push_content( $section );
      $doc->push_content( $head, $body );

      my $new_content = $doc->as_HTML(undef,"\t");
      output_content( $params->{server}, \$new_content,
                      $uri, $params->{response} );

      # Clear the fragment so the next section starts from a clean URI.
      $uri->fragment(undef);

      $params->{found}++;  # tell split_page that a section was emitted

      $doc->delete;
  }
  
  
  1;
  
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to