I am working from the crawler example code found here:
http://www.google.com/url?sa=t&ct=res&cd=1&url=http%3A%2F%2Fwww.zend.com%2Ftopics%2FImprove-your-PHP-Applications-Search-Capabilities-with-Lucene.pdf&ei=y2qgSIiJHpHovAWBn9z-BQ&usg=AFQjCNHBU8vre59KVlgn0fV2O3h6B6bKMw&sig2=QTi5S9NULQDB4t30admvFQ
For some reason the contents from the page aren't being indexed. In my log
files everything stops at: "Before add document"
None of the log messages after are showing up.
Is there something that I am missing:
<code>
public function crawlerAction() {
$this->_helper->viewRenderer->setNoRender();
$this->_logger->info("Crawler initialized");
/**
* Setup Zend_Http_Client
*/
$client = new Zend_Http_Client();
$client->setConfig(array('timeout' => 30));
$indexpath = $this->_globalConfig->lucene->index;
try {
$this->_index = Zend_Search_Lucene::open($indexpath);
$this->_logger->info("Opened existing index in
{$indexpath}");
} catch (Zend_Search_Lucene_Exception $e) {
try {
$index = Zend_Search_Lucene::create($indexpath);
$this->_logger->info("Created new index in {$indexpath}
");
/**
* If both fail,give up and show errormessage
*/
} catch (Zend_Search_Lucene_Exception $e) {
$this->_logger->err("Failed opening or creating index in
{$indexpath}");
$this->_logger->err($e->getMessage());
print "Unable to open or create
index:{$e->getMessage()}";
exit(1);
}
}
/**
* Setup the targets array
*/
$targets = array($this->_globalConfig->lucene->url);
$this->_logger->info("Target count: ". count($targets));
/**
* Start iterating
*/
for ($i = 0; $i < count($targets); $i++) {
/**
* Fetch content with HTTPClient
*/
$client->setUri($targets[$i]);
$response = $client->request();
if ($response->isSuccessful()) {
$body = $response->getBody();
$this->_logger->info("Fetched ".strlen($body)." bytes from
{$targets[$i]}");
$body_checksum = md5($body);
$this->_logger->info("Body checksum {$body_checksum}");
$this->_logger->info("Index: ".Zend_Debug::dump($this->_index, "",
false));
$hits = $this->_index->find('url:'.$targets[$i]);
$this->_logger->info("Hits: ".count($hits));
$matched = false;
foreach ($hits as $hit){
$this->_logger->info("Hit md5 {$hit->md5} : checksum
{$body_checksum}");
if ($hit->md5 == $body_checksum) {
if ($matched == true) {
$index->delete($hit->id);
$matched = true;
}
} else {
$this->_logger->info("{$targets[$i]} is out of date and
needs
reindexing");
$index->delete($hit);
}
if ($matched){
$this->_logger->info($targets[$i]." is uptodate,
skipping");
continue;
}
}
/**
* Create document
*/
$doc = Zend_Search_Lucene_Document_Html::loadHTML($body);
$this->_logger->info("Url {$targets[$i]}");
$doc->addField(Zend_Search_Lucene_Field::UnIndexed('url',
$targets[$i]));
$doc->addField(Zend_Search_Lucene_Field::UnIndexed('md5',
$body_checksum));
$this->_logger->info("Before add document");
/**
* Index
*/
$this->_index->addDocument($doc);
$this->_logger->info("After added the doc");
$this->_logger->info("Indexed {$targets[$i]}");
/**
* Fetch new links
*/
$links = $doc->getLinks();
$this->_logger->info("Get links".Zend_Debug::dump($links));
foreach($links as $link){
if (strpos($link,
$this->_globalConfig->lucene->index)&&(!in_array($link, $targets))) {
$targets[] = $link;
}
}
} else {
$this->_logger->warn("Requesting {$url} returned HTTP
{$response->getStatus()}");
}
}
$this->_logger->info("Iterated over ".count($targets)."documents");
$this->_logger->info("Optimizing index...");
$index->optimize();
$this->_logger->info("Done. Index now
contains".$index->numDocs()."documents");
$this->_logger->info("Crawling completed");
}
</code>
--
View this message in context:
http://www.nabble.com/Lucene-Crawler-Not-Adding-to-Index-tp18929433p18929433.html
Sent from the Zend Framework mailing list archive at Nabble.com.