On 5/9/07, Ralph Schindler <[EMAIL PROTECTED]> wrote:
Does anyone have a good reference for creating or using a spider to
create an index of ones site for use with Zend_Search_Lucene?  I see
where I can create an indexer in php, but I suppose I am looking for
something that will do auto link discovery and be able to run via cron
periodically.

There's the Limb3 WEB_SPIDER package, which allows you to accomplish that.
Here's an example of a working cron script that we (at my company) use in
one of our projects:

index.php(usage "index.php http://mysite/ /path/to/index"):

<?php
// Crawls a site starting from the given URI and hands the fetched pages to
// MyZendSearchIndexer, which stores them in a Zend_Search_Lucene index.
//
// Usage: php index.php <start-uri> <index-path>
//   $argv[1] - URI the spider starts crawling from
//   $argv[2] - path of the search index to write
//
// On a missing argument the message goes to STDERR and the script exits
// with status 1, so that cron (or any calling shell) can detect the failure.
if(!isset($argv[1]))
{
 fwrite(STDERR, "Index starting uri not specified!\n");
 exit(1);
}

if(!isset($argv[2]))
{
 fwrite(STDERR, "Index file not specified!\n");
 exit(1);
}

// A full crawl can take a long time and a lot of memory: lift the execution
// time limit and raise the memory ceiling.
set_time_limit(0);
ini_set('memory_limit', '512M');

require_once('limb/net/src/lmbUri.class.php');
require_once('limb/web_spider/src/lmbWebSpider.class.php');
require_once('limb/web_spider/src/lmbUriFilter.class.php');
require_once('limb/web_spider/src/lmbContentTypeFilter.class.php');
require_once('limb/web_spider/src/lmbSearchIndexingObserver.class.php');
require_once('limb/search/src/indexer/lmbSearchTextNormalizer.class.php');
require_once('limb/web_spider/src/lmbUriNormalizer.class.php');

// The custom indexer lives in its own file for readability; its code is
// listed separately below.
require_once('MyZendSearchIndexer.class.php');

$uri = new lmbUri($argv[1]);

// The indexer writes to the index path given on the command line; NOINDEX
// handling is switched on so marked regions of a page are left out.
$indexer = new MyZendSearchIndexer($argv[2], new lmbSearchTextNormalizer());
$indexer->useNOINDEX();

// The observer is what connects the spider to the indexer.
$observer = new lmbSearchIndexingObserver($indexer);

// Only text/html is allowed through the content type filter.
$content_type_filter = new lmbContentTypeFilter();
$content_type_filter->allowContentType('text/html');

// Allow only the start URI's host, the http protocol, and any path.
// NOTE(review): 'http' is hard-coded; an https start URI presumably would
// not pass this filter - confirm against lmbUriFilter.
$uri_filter = new lmbUriFilter();
$uri_filter->allowHost($uri->getHost());
$uri_filter->allowProtocol('http');
$uri_filter->allowPathRegex('~.*~');

// Strip the PHP session id query item from URIs; presumably this keeps the
// same page from being visited again under a different session id.
$normalizer = new lmbUriNormalizer();
$normalizer->stripQueryItem('PHPSESSID');

$spider = new lmbWebSpider();
$spider->setContentTypeFilter($content_type_filter);
$spider->setUriFilter($uri_filter);
$spider->setUriNormalizer($normalizer);
$spider->registerObserver($observer);

$spider->crawl($uri);
?>

MyZendSearchIndexer.class.php

<?php
require_once('Zend/Search/Lucene.php');
/**
 * Indexer that stores crawled pages in a Zend_Search_Lucene index.
 *
 * For every page it records three text fields: 'uri', 'title' (boosted) and
 * 'content'. When NOINDEX handling is enabled, everything between the
 * "no index start" and "no index end" HTML comments is removed from the
 * content before it is indexed.
 */
class MyZendSearchIndexer
{
 // Text normalizer; only its process($content) method is used here.
 protected $normalizer = null;

 // Markers delimiting page regions that must not be indexed.
 protected $left_bound = '<!-- no index start -->';
 protected $right_bound = '<!-- no index end -->';
 // Whether the regions between the markers are stripped before indexing.
 protected $use_noindex = false;

 // Lazily created index object, see _getIndex().
 protected $index;
 // Path handed to Zend_Search_Lucene::create().
 protected $index_file;

 /**
  * @param string $index_file path of the index to create
  * @param object $normalizer object exposing process($content)
  */
 function __construct($index_file, $normalizer)
 {
   $this->index_file = $index_file;
   $this->normalizer = $normalizer;
 }

 /**
  * Enables or disables stripping of the NOINDEX-marked regions.
  *
  * @param bool $status true to strip the regions, false to index everything
  */
 function useNOINDEX($status = true)
 {
   $this->use_noindex = $status;
 }

 /**
  * Adds one page to the index.
  *
  * @param object $uri     page URI; must provide toString()
  * @param string $content raw page content
  */
 function index($uri, $content)
 {
   // The title is taken from the raw page, before NOINDEX regions are
   // stripped and before the text is normalized.
   $title = $this->_extractTitle($content);
   $content = $this->_getIndexedContent($content);

   $content = $this->normalizer->process($content);

   $doc = new Zend_Search_Lucene_Document();

   $doc->addField(Zend_Search_Lucene_Field::Text('uri', $uri->toString()));

   // Matches in the title weigh more than matches in the body.
   $field = Zend_Search_Lucene_Field::Text('title', $title);
   $field->boost = 1.5;
   $doc->addField($field);

   $doc->addField(Zend_Search_Lucene_Field::Text('content', $content));

   $index = $this->_getIndex();
   // Errors raised while adding the document are deliberately suppressed,
   // so one problematic page does not produce noise for the whole crawl.
   // NOTE(review): consider logging them instead of silencing them.
   @$index->addDocument($doc);
 }

 /**
  * Returns the index object, creating it on first use.
  *
  * NOTE(review): create() rather than open() is used, so each run
  * presumably starts a fresh index at this path - confirm this is intended.
  *
  * @return object the Zend_Search_Lucene index
  */
 function _getIndex()
 {
   if(!$this->index)
     $this->index = Zend_Search_Lucene::create($this->index_file);
   return $this->index;
 }

 /**
  * Returns the part of the page that should be indexed.
  *
  * @param string $content raw page content
  * @return string the content unchanged when NOINDEX handling is off,
  *                otherwise the content with every marked region (markers
  *                included) replaced by a single space
  */
 function _getIndexedContent($content)
 {
   if(!$this->use_noindex)
     return $content;

   // The '~' delimiter is passed to preg_quote() so that a marker
   // containing '~' cannot break the pattern. The 's' modifier lets a
   // region span several lines; '.*?' keeps each match to one region.
   $regex = '~' .
            preg_quote($this->left_bound, '~') .
            '(.*?)' .
            preg_quote($this->right_bound, '~') .
            '~s';

   return preg_replace($regex, ' ', $content);
 }

 /**
  * Extracts the text of the first <title> element.
  *
  * The tag name is matched case-insensitively and may carry attributes.
  *
  * @param string $content raw page content
  * @return string the title text, or '' when no title element is found
  */
 function _extractTitle($content)
 {
   $regex = '~<title[^>]*>([^<]*)</title>~i';
   if(preg_match($regex, $content, $matches))
     return $matches[1];

   return '';
 }
}
?>

Sorry for such a long mail, hope you find it useful.

Actually, there's a much more detailed tutorial on how to integrate
Limb3 with ZF Search
(http://wiki.limb-project.com/doku.php?id=limb3_2007_2:ru:tutorials:zend_search),
but it's in Russian...



Thanks,
Ralph



--
Best regards, Pavel

Reply via email to