|
Hi, all.
I have done some quick work integrating the
websphinx web crawler with lucene. I found that it was fairly trivial, and
have enclosed some example source code in this mail. You will need the
websphinx jar file, which you can download from the WebSPHINX project site.
Obviously, this example could be improved quite a
bit, but it is a good starting point for someone wanting to develop a
spider.
Hope this helps,
Scott
********************************************
import websphinx.*;
import com.lucene.index.*; import com.lucene.analysis.*; import java.net.*; import java.io.*; public class Index {
public static void main(String[] args) { try { IndexWriter writer = new IndexWriter("index", new StopAnalyzer(), true); writer.mergeFactor = 20; IndexingCrawler c = new IndexingCrawler(writer, "http://www.yahoo.com"); c.run(); writer.optimize(); writer.close(); } catch (MalformedURLException e) { e.printStackTrace(System.out); } catch (IOException e) { e.printStackTrace(System.out); } } } **********************************************
The actual crawler is below
**********************************************
import websphinx.*;
import com.lucene.document.*; import com.lucene.analysis.*; import com.lucene.index.*; import java.io.*; import java.net.*; public class IndexingCrawler extends Crawler
{
private IndexWriter
writer;
public
IndexingCrawler(IndexWriter writer, String docroot)
{
super(); try { this.setRoot(new Link(docroot)); } catch (MalformedURLException e) { this.setRoot(null); } this.writer = writer; this.setSynchronous(true); this.setDomain(Crawler.SERVER); } public void visit(Page p)
{
boolean index = false; System.out.println("Visiting [" + p.getURL() + "]"); index(p); System.out.println(" Done."); } public void index(Page p)
{
StringBuffer contents = new StringBuffer(); Document doc = new Document(); doc.add(Field.Text("path", p.getURL().toString())); doc.add(Field.Keyword("modified", DateField.timeToString(p.getLastModified()))); if
(p.getTitle() != null)
{
doc.add(Field.Text("title", p.getTitle())); }
System.out.println("
Indexing...");
System.out.println(" depth [" + p.getDepth() + "]"); System.out.println(" title [" + p.getTitle() + "]"); System.out.println(" modified [" + p.getLastModified() + "]"); Element[] elements = p.getElements(); for (int i = 0; i < elements.length; i++) { if (elements[i].getTagName().equalsIgnoreCase("meta")) { String name = elements[i].getHTMLAttribute("name", ""); String content = elements[i].getHTMLAttribute("content", ""); if (!name.equals("")) { doc.add(Field.Text(name, content)); System.out.println(" meta [" + name + ":" + content + "]"); } } } Text[] texts = p.getWords(); for (int i = 0; i < texts.length; i++) { contents.append(texts[i].toText()); contents.append(" "); } doc.add(Field.Text("contents", contents.toString())); try { writer.addDocument(doc); } catch (IOException e) { throw new RuntimeException(e.toString()); } } public void noindex(Page p)
{
System.out.println(" Skipping..."); } } *****************************
|
