|
Hi, all.
I have done some quick work integrating the
websphinx web crawler with lucene. I found that it was fairly trivial, and
have enclosed some example source code in this mail. You will need the
websphinx jar file, which you can download from the WebSPHINX project site.
Obviously, this example could be improved quite a
bit, but it is a good starting point for someone wanting to develop a
spider.
Hope this helps,
Scott
********************************************
import websphinx.*;
import com.lucene.index.*; import com.lucene.analysis.*; import java.net.*; import java.io.*; public class Index {
public static void main(String[] args) { try { IndexWriter writer = new IndexWriter("index", new StopAnalyzer(), true); writer.mergeFactor = 20; IndexingCrawler c = new IndexingCrawler(writer, "http://www.yahoo.com"); c.run(); writer.optimize(); writer.close(); } catch (MalformedURLException e) { e.printStackTrace(System.out); } catch (IOException e) { e.printStackTrace(System.out); } } } **********************************************
The actual crawler is below
**********************************************
import websphinx.*;
import com.lucene.document.*; import com.lucene.analysis.*; import com.lucene.index.*; import java.io.*; import java.net.*; public class IndexingCrawler extends Crawler
{
private IndexWriter
writer;
public
IndexingCrawler(IndexWriter writer, String docroot)
{
super(); try { this.setRoot(new Link(docroot)); } catch (MalformedURLException e) { this.setRoot(null); } this.writer = writer; this.setSynchronous(true); this.setDomain(Crawler.SERVER); } public void visit(Page p)
{
boolean index = false; System.out.println("Visiting [" + p.getURL() + "]"); index(p); System.out.println(" Done."); } public void index(Page p)
{
StringBuffer contents = new StringBuffer(); Document doc = new Document(); doc.add(Field.Text("path", p.getURL().toString())); doc.add(Field.Keyword("modified", DateField.timeToString(p.getLastModified()))); if
(p.getTitle() != null)
{
doc.add(Field.Text("title", p.getTitle())); }
System.out.println("
Indexing...");
System.out.println(" depth [" + p.getDepth() + "]"); System.out.println(" title [" + p.getTitle() + "]"); System.out.println(" modified [" + p.getLastModified() + "]"); Element[] elements = p.getElements(); for (int i = 0; i < elements.length; i++) { if (elements[i].getTagName().equalsIgnoreCase("meta")) { String name = elements[i].getHTMLAttribute("name", ""); String content = elements[i].getHTMLAttribute("content", ""); if (!name.equals("")) { doc.add(Field.Text(name, content)); System.out.println(" meta [" + name + ":" + content + "]"); } } } Text[] texts = p.getWords(); for (int i = 0; i < texts.length; i++) { contents.append(texts[i].toText()); contents.append(" "); } doc.add(Field.Text("contents", contents.toString())); try { writer.addDocument(doc); } catch (IOException e) { throw new RuntimeException(e.toString()); } } public void noindex(Page p)
{
System.out.println(" Skipping..."); } } *****************************
|
