Custom Analyzer Not Called When Indexing

Yann-Erwan Perio Sun, 09 Sep 2012 02:56:20 -0700

Hello,

This is my first time writing to the list. I am a Java developer,
writing a personal project using Lucene, and so far have been very
happy with the library (v4BETA). However, I have recently decided to
build and use a custom analyzer, and could not make it work with
IndexWriter. I must be missing something obvious, but all my searches
on the web and my unit tests were to no avail. I would be thankful if
you could point me out to the right direction. You will find below
some test classes demonstrating my problem.


Basically, I have created a custom analyzer, which tokenizes its input
to every two chars, then lower cases them. For instance, the string
"Hello World" would be tokenized as "he", "ll", and so forth. When I
try to index some document with this analyzer, it seems that Lucene
does not take it into account at all. To make sure of that, I have
simply added a log entry inside my "createComponents" method, to check
when this method is called - and the method is never called during the
indexing process. Interestingly, when I use a query parser with my
custom analyzer, everything is fine - the createComponents method is
called as expected. To me, this means that I have somehow failed to
properly attach my analyzer to the index writer.

The following classes demonstrate my issue:
- IndexUtil: has methods to create the analyzer, open a
FSDirectory-based directory writer, and the corresponding reader,
- TwoLettersAnalyzer: my custom analyzer, which puts together my
custom tokenizer with a LowerCase filter,
- TwoLettersTokenizer: my custom tokenizer,
- TwoLettersTest: the main test class, with the standard "main()" method.

The console output generated by the program is as followed:
    Calling writeDocs()    <= I expected a call to createComponents()
right after this line
    Calling checkDocs()
    Calling TwoLettersAnalyzer.createComponents()
    Calling TwoLettersTokenizer.incrementToken()
    Calling TwoLettersTokenizer.incrementToken()
    Have we found our doc? false


Thank you very much for your time.

Kind regards,
Yep.




=================================================================
IndexUtil.java
=================================================================

package experiments;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;

public final class IndexUtil {

    private IndexUtil() {

    }

    private static final File indexDirectory = new
File("C:\\Users\\Elegie\\Documents\\_Programmation\\Tests\\LuceneWriterTest\\index");

    public static Analyzer createAnalyzer() {
        return new TwoLettersAnalyzer();
    }

    public static IndexWriter openWriter() throws IOException {
        IndexWriterConfig config = new
IndexWriterConfig(Version.LUCENE_40, createAnalyzer());
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        return new IndexWriter(FSDirectory.open(indexDirectory), config);
    }

    public static IndexReader openReader() throws IOException {
        return DirectoryReader.open(FSDirectory.open(indexDirectory));
    }

}


=================================================================
TwoLettersAnalyzer.java
=================================================================

package experiments;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.util.Version;

import java.io.Reader;

public class TwoLettersAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String s, Reader reader) {
        System.out.println("Calling TwoLettersAnalyzer.createComponents()");
        Tokenizer source = new TwoLettersTokenizer(reader);
        TokenStream filter = new LowerCaseFilter(Version.LUCENE_40, source);
        return new TokenStreamComponents(source, filter);
    }

}


=================================================================
TwoLettersTokenizer.java
=================================================================

package experiments;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.io.Reader;

public class TwoLettersTokenizer extends Tokenizer {

    private static final int TWO_LETTERS = 2;

    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    protected TwoLettersTokenizer(Reader input) {
        super(input);
    }

    @Override
    public final boolean incrementToken() throws IOException {
        System.out.println("Calling TwoLettersTokenizer.incrementToken()");
        StringBuilder builder = new StringBuilder();
        boolean success = readAhead(builder, TWO_LETTERS);
        if (success) {
            termAtt.setEmpty();
            termAtt.append(builder);
        }
        return success;
    }

    private boolean readAhead(StringBuilder builder, int ahead) throws
IOException {
        if (ahead <= 0) {
            return true;
        }
        int data = input.read();
        if (data != -1) {
            builder.append((char) data);
            return readAhead(builder, ahead - 1);
        }
        return false;
    }

}

=================================================================
TwoLettersTest.java
=================================================================

package experiments;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.Version;

import java.io.IOException;

public class TwoLettersTest {

    private static final String FIELD_NAME = "content";
    private static final String TEST_CONTENT = "Hello, World!";
    private static final String TEST_SEARCH = "he"; //llo World!
    private static final int EXPECTED_MATCHED_DOCS = 1;

    public static void main(String[] args) throws IOException, ParseException {
        TwoLettersTest test = new TwoLettersTest();

        System.out.println("Calling writeDocs()");
        test.writeDocs();

        System.out.println("Calling checkDocs()");
        test.checkDocs();
    }

    public void writeDocs() throws IOException {
        IndexWriter writer = IndexUtil.openWriter();
        writer.addDocument(createDoc(TEST_CONTENT));
        writer.close();
    }

    public void checkDocs() throws ParseException, IOException {
        Query query = createQuery();
        IndexReader reader = IndexUtil.openReader();
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs results = searcher.search(query, EXPECTED_MATCHED_DOCS);
        System.out.println("Have we found our doc? " +
(results.scoreDocs.length == EXPECTED_MATCHED_DOCS));
        reader.close();
    }

    private Document createDoc(String content) {
        Document doc = new Document();
        StringField field = new StringField(FIELD_NAME, content,
Field.Store.YES);
        doc.add(field);
        return doc;
    }

    private Query createQuery() throws ParseException {
        String search = FIELD_NAME + ":" + TEST_SEARCH;
        QueryParser parser = new QueryParser(Version.LUCENE_40,
FIELD_NAME, IndexUtil.createAnalyzer());
        Query query = parser.parse(search);
        return query;
    }

}

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
For additional commands, e-mail: java-user-h...@lucene.apache.org

Custom Analyzer Not Called When Indexing

Reply via email to