Lucene ,highlighting and NullPointerException

morofiler Fri, 04 Mar 2011 09:07:44 -0800

Hello Everybody ,

I am trying to highlight some results . I index the body (the text) of my
documents in the field "contents"and when I try to highilight using
highlighter.getBestFragment(...) I get a NullPointerException .


But when,for exemple I try to highlight the fileName it works properly.
I know since I "store" Everything in one field with the fileReader or
(ParsingReader) my text is tokenized which is different from a file name .

Here's my code ,please help me .


package xxxxxx;

import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.parser.ParsingReader;

public class Indexer {

    static long start = 0;

    public static void main(String[] args) throws Exception {
        System.out.println("l'index se trouve Ã  " + args[0]);
        System.out.println("le dossier ou s'effectue l'indexation est :" +
args[1]);
        if (args.length != 2) {
            throw new IllegalArgumentException("Usage: java " +
Indexer.class.getName()
                    + "  ");
        }

        String indexDir = args[0];
        String dataDir = args[1];


        start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());


        } finally {

            indexer.close();
        }

        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }
    private IndexWriter writer;

    public Indexer(String indexDir) throws IOException, InterruptedException
{
        Directory dir = FSDirectory.open(new File(indexDir));

        writer = new IndexWriter(dir, new
StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.setUseCompoundFile(true);
    }

    public void close() throws IOException {
        writer.optimize();
        writer.close();
    }

    public int index(String dataDir, FileFilter filter) throws Exception {

        File[] files = new File(dataDir).listFiles();

        for (File f : files) {

            if (!f.isDirectory() && !f.isHidden() && f.exists() &&
f.canRead() && (filter == null || filter.accept(f))) {

                if (!(f.getCanonicalPath().endsWith("~"))) {
                    indexFile(f);
                }
            } else {
                index(f.toString(), filter);
            }
        }
        return writer.numDocs();
    }

    private static class TextFilesFilter implements FileFilter {

        public boolean accept(File path) {
            return true;
        }
    }

    protected Document getDocument(File f) throws Exception {
       // FileReader frf = new FileReader(f);
        Document doc = new Document();
        Reader reader = new ParsingReader(f);

        doc.add(new Field("contents", reader,
Field.TermVector.WITH_POSITIONS_OFFSETS));
        doc.add(new Field("filename", f.getName(), Field.Store.YES,
Field.Index.ANALYZED ));
        doc.add(new Field("fullpath", f.getCanonicalPath(),Field.Store.YES,
Field.Index.NOT_ANALYZED_NO_NORMS));
        return doc;
    }

    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc);
        System.out.println(System.currentTimeMillis() - start);
    }
}

---------------------------------------------------------------------------------------

Lucene ,highlighting and NullPointerException

Hello Everybody ,

I am trying to highlight some results . I index the body (the text) of my
documents in the field "contents"and when I try to highilight using
highlighter.getBestFragment(...) I get a NullPointerException .

But when,for exemple I try to highlight the fileName it works properly.
I know since I "store" Everything in one field with the fileReader or
(ParsingReader) my text is tokenized which is different from a file name .

Here's my code ,please help me .




package xxxxxxxxxxxxxxxxxxxx;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Searcher {

    public static void main(String[] args) throws IllegalArgumentException,
            IOException, ParseException, InvalidTokenOffsetsException {
        System.out.println("endroit ou se situe l'index " + args[0]);
        System.out.println(args[1]);
        if (args.length != 2) {
            throw new IllegalArgumentException("Usage: java "
                    + Searcher.class.getName()
                    + "  ");
        }

        String indexDir = args[0];
        String q = args[1];
        search(indexDir, q);
    }
    

    public static void search(String indexDir, String q) throws IOException,
ParseException, InvalidTokenOffsetsException {
        Directory dir = FSDirectory.open(new File(indexDir));
        IndexSearcher indexSearcher = new IndexSearcher(dir);
        QueryParser parserC = new QueryParser(Version.LUCENE_30, "contents",
new StandardAnalyzer(Version.LUCENE_30));
  //      QueryParser parserN = new QueryParser(Version.LUCENE_30,
"filename", new StandardAnalyzer(Version.LUCENE_30));
        QueryParser parserP = new QueryParser(Version.LUCENE_30, "fullpath",
new StandardAnalyzer(Version.LUCENE_30));
        parserC.setDefaultOperator(QueryParser.Operator.OR);
    //    parserN.setDefaultOperator(QueryParser.Operator.OR);
        parserC.setPhraseSlop(10);
      //  parserN.setPhraseSlop(10);
        DisjunctionMaxQuery dmq = new DisjunctionMaxQuery(6);

        Query query = new MultiFieldQueryParser(Version.LUCENE_30, new
String[]{"contents", "filename"},
                new CustomAnalyzer()).parse(q);

        Query queryC = parserC.parse(q);
        //Query queryN = parserN.parse(q);
        dmq.add(queryC);
        //dmq.add(queryN);
        //     dmq.add(query)      ;
        QueryScorer scorer = new QueryScorer(dmq, "contents");
        Highlighter highlighter = new Highlighter(scorer);
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));


        System.out.println(query.toString());
        long start = System.currentTimeMillis();
        TopDocs hits = indexSearcher.search(dmq, 15);
        System.out.println(hits.totalHits);
        long end = System.currentTimeMillis();
        System.err.println("Found " + hits.totalHits
                + " document(s) (in " + (end - start)
                + " milliseconds) that matched query '"
                + q + "':");

        for (ScoreDoc scoreDoc : hits.scoreDocs) {

            Document doc = indexSearcher.doc(scoreDoc.doc);
            System.out.print(scoreDoc.score);
            System.out.println(doc.get("fullpath"));

 String contents = doc.get("contents"); // I am pretty sure the mistake is
here , contents is always Null
 //But what can I do to make this thing work ?
            TokenStream stream =
                   
TokenSources.getAnyTokenStream(indexSearcher.getIndexReader(),
                    scoreDoc.doc,
                    "contents",
                    doc,
                    new StandardAnalyzer(Version.LUCENE_30));
            String fragment =
                    highlighter.getBestFragment(stream, contents);
            System.out.println(fragment);
        }
        indexSearcher.close();
    }
}
-----------------------------------------------------------------------------------------
run:
endroit ou se situe l'index /home/ghart/index
le humus
(contents:le filename:le) (contents:humus filename:humus)
6
Found 6 document(s) (in 21 milliseconds) that matched query 'le humus':
2.0974472/home/ghart/test/hook
Exception in thread "main" java.lang.NullPointerException
        at
org.apache.lucene.search.highlight.SimpleSpanFragmenter.start(SimpleSpanFragmenter.java:103)
        at
org.apache.lucene.search.highlight.Highlighter.getBestTextFragments(Highlighter.java:216)
        at
org.apache.lucene.search.highlight.Highlighter.getBestFragments(Highlighter.java:158)
        at
org.apache.lucene.search.highlight.Highlighter.getBestFragment(Highlighter.java:104)
        at indexer.Searcher.search(Searcher.java:98)
        at indexer.Searcher.main(Searcher.java:45)
Java Result: 1
GÃ‰NÃ‰RATION TERMINÃ‰E (durÃ©e totaleÂ  1 seconde)


--
View this message in context: 
http://lucene.472066.n3.nabble.com/Lucene-highlighting-and-NullPointerException-tp2634525p2634525.html
Sent from the Lucene - General mailing list archive at Nabble.com.

Lucene ,highlighting and NullPointerException

Reply via email to