On 5/9/07, Ralph Schindler <[EMAIL PROTECTED]> wrote:
Does anyone have a good reference for creating or using a spider to build an index of one's site for use with Zend_Search_Lucene? I see how I can create an indexer in PHP, but I am looking for something that does automatic link discovery and can be run periodically via cron.
There's Limb3 WEB_SPIDER package which allows you to accomplish that. Here's an example of working cron script we(in my company) use in one of our projects: index.php(usage "index.php http://mysite/ /path/to/index"):
<?php
/*
 * Cron entry point: crawls a site with the Limb3 web spider and hands every
 * accepted page to MyZendSearchIndexer.
 *
 * Arguments:
 *   $argv[1]  URI the crawl starts from
 *   $argv[2]  path of the Zend_Search_Lucene index to write
 */

// Both arguments are mandatory; stop with a message if either is missing.
if (!isset($argv[1])) {
    die("Index starting uri not specified!\n");
}
if (!isset($argv[2])) {
    die("Index file not specified!\n");
}

// A full crawl can run for a long time and hold many pages in memory, so
// lift the execution time limit and raise the memory ceiling.
set_time_limit(0);
ini_set('memory_limit', '512M');

require_once('limb/net/src/lmbUri.class.php');
require_once('limb/web_spider/src/lmbWebSpider.class.php');
require_once('limb/web_spider/src/lmbUriFilter.class.php');
require_once('limb/web_spider/src/lmbContentTypeFilter.class.php');
require_once('limb/web_spider/src/lmbSearchIndexingObserver.class.php');
require_once('limb/search/src/indexer/lmbSearchTextNormalizer.class.php');
require_once('limb/web_spider/src/lmbUriNormalizer.class.php');
// Custom indexer, kept in its own file for readability (listed below).
require_once('MyZendSearchIndexer.class.php');

$uri = new lmbUri($argv[1]);

// The indexer is wrapped in an observer that is registered with the spider.
$indexer = new MyZendSearchIndexer($argv[2], new lmbSearchTextNormalizer());
$indexer->useNOINDEX();
$observer = new lmbSearchIndexingObserver($indexer);

// Only text/html responses are allowed through.
$content_type_filter = new lmbContentTypeFilter();
$content_type_filter->allowContentType('text/html');

// Restrict the crawl to the start URI's host over http; the path regex
// matches any path.
$uri_filter = new lmbUriFilter();
$uri_filter->allowHost($uri->getHost());
$uri_filter->allowProtocol('http');
$uri_filter->allowPathRegex('~.*~');

// NOTE(review): presumably drops the session id from discovered links so the
// same page is not visited once per session - confirm against lmbUriNormalizer.
$normalizer = new lmbUriNormalizer();
$normalizer->stripQueryItem('PHPSESSID');

$spider = new lmbWebSpider();
$spider->setContentTypeFilter($content_type_filter);
$spider->setUriFilter($uri_filter);
$spider->setUriNormalizer($normalizer);
$spider->registerObserver($observer);
$spider->crawl($uri);
?>
MyZendSearchIndexer.class.php
<?php
require_once('Zend/Search/Lucene.php');

/**
 * Turns crawled pages into Zend_Search_Lucene documents.
 *
 * Each page becomes a document with three text fields: "uri", "title"
 * (boosted) and "content". Regions of the page wrapped in the no-index
 * marker comments can optionally be removed before indexing.
 */
class MyZendSearchIndexer
{
    /** Text normalizer; only its process($text) method is called here. */
    protected $normalizer = null;

    /** Marker that opens a region excluded from the index. */
    protected $left_bound = '<!-- no index start -->';

    /** Marker that closes a region excluded from the index. */
    protected $right_bound = '<!-- no index end -->';

    /** Whether regions between the two markers are stripped before indexing. */
    protected $use_noindex = false;

    /** Lazily created index handle, see _getIndex(). */
    protected $index;

    /** Path handed to Zend_Search_Lucene::create(). */
    protected $index_file;

    /**
     * @param string $index_file path of the index to write
     * @param object $normalizer object exposing process($text)
     */
    function __construct($index_file, $normalizer)
    {
        $this->index_file = $index_file;
        $this->normalizer = $normalizer;
    }

    /**
     * Enables (default) or disables stripping of the no-index regions.
     *
     * @param bool $status
     */
    function useNOINDEX($status = true)
    {
        $this->use_noindex = $status;
    }

    /**
     * Adds one page to the index.
     *
     * @param object $uri     page address; must expose toString()
     * @param string $content raw page markup
     */
    function index($uri, $content)
    {
        // The title is taken from the raw markup, before no-index regions are
        // removed and before normalization.
        $title = $this->_extractTitle($content);

        $content = $this->_getIndexedContent($content);
        $content = $this->normalizer->process($content);

        $doc = new Zend_Search_Lucene_Document();
        $doc->addField(Zend_Search_Lucene_Field::Text('uri', $uri->toString()));

        // Boost the title field relative to the other fields.
        $field = Zend_Search_Lucene_Field::Text('title', $title);
        $field->boost = 1.5;
        $doc->addField($field);

        // Fixed: the original call was missing its closing parenthesis, which
        // made the whole file a parse error.
        $doc->addField(Zend_Search_Lucene_Field::Text('content', $content));

        $index = $this->_getIndex();
        // Errors are suppressed on purpose (kept from the original), so a
        // single problematic page does not abort the crawl.
        @$index->addDocument($doc);
    }

    /**
     * Returns the index handle, creating it on first use.
     *
     * NOTE(review): create() presumably starts a fresh index at the given
     * path on every run - confirm this is the intended cron behaviour.
     */
    function _getIndex()
    {
        if (!$this->index) {
            $this->index = Zend_Search_Lucene::create($this->index_file);
        }

        return $this->index;
    }

    /**
     * Returns the content with every no-index region replaced by one space,
     * or the content unchanged when no-index handling is disabled.
     *
     * @param string $content
     * @return string
     */
    function _getIndexedContent($content)
    {
        if (!$this->use_noindex) {
            return $content;
        }

        // Quote the markers for the "~" delimiter actually used, so a marker
        // containing "~" cannot break the pattern. The "s" modifier lets a
        // region span several lines; ".*?" stops at the nearest end marker.
        $regex = '~' . preg_quote($this->left_bound, '~')
               . '(.*?)'
               . preg_quote($this->right_bound, '~') . '~s';

        return preg_replace($regex, ' ', $content);
    }

    /**
     * Returns the text of the first <title> element, or '' when none is found.
     *
     * @param string $content
     * @return string
     */
    function _extractTitle($content)
    {
        // Case-insensitive so that <TITLE>...</TITLE> is recognised as well.
        $regex = '~<title>([^<]*)</title>~i';

        if (preg_match($regex, $content, $matches)) {
            return $matches[1];
        }

        return '';
    }
}
?>
Sorry for such a long mail, hope you find it useful. Actually there's a much more verbose tutorial on how to integrate Limb3 with ZF Search(http://wiki.limb-project.com/doku.php?id=limb3_2007_2:ru:tutorials:zend_search) but it's in Russian...
Thanks, Ralph
-- Best regards, Pavel
