Hi Amin,

I see a couple of issues with your program below, and one that is the cause of the problem of not finding "amin" as a query term.

When you construct your IndexWriter, you are doing:
IndexWriter indexWriter = new IndexWriter(getDirectory(),getAnalyzer(),new IndexWriter.MaxFieldLength(2));

The MaxFieldLength parameter specifies the maximum number of tokens allowed in a Field. Everything else after that is dropped. See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength) and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html

Also,
TopDocs topDocs = multiSearcher.search(query, BooleanQuery.getMaxClauseCount());

strikes me as really odd. Why are you passing in the max clause count as the number of results you want returned?

Cheers,
Grant



Begin forwarded message:

From: "ami...@gmail.com" <ami...@gmail.com>
Date: January 3, 2009 3:24:52 PM EST
To: gsing...@apache.org
Subject: Search Test file

I've shared a document with you called "Search Test file":
http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj

It's not an attachment -- it's stored online at Google Docs. To open this document, just click the link above.
---

Hi

I have uploaded the test file at google docs. It is currently a txt file but if you change the extension to .java it should work.

package com.amin.app.lucene.search.impl;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNotSame;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.ant.DocumentHandler;
import org.apache.lucene.ant.DocumentHandlerException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searchable;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import com.amin.app.lucene.util.WorkItem.IndexerType;

public class SearchTest {

private File rtfFile = null;
private static final String RTF_FILE_NAME = "rtfDocumentToIndex.rtf";

@Before
public void setUp() throws Exception {
InputStream inputStream = this.getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
rtfFile = new File(RTF_FILE_NAME);
convertInputStreamToFile(inputStream, rtfFile);
}



@Test
public void testCanCreateLuceneDocumentForRTFDocument() throws Exception {
JavaBuiltInRTFHandler builtInRTFHandler = new JavaBuiltInRTFHandler();
Document document = builtInRTFHandler.getDocument(rtfFile);
assertNotNull(document);
String value = document.get(FieldNameEnum.BODY.getDescription());
assertNotNull(value);
assertNotSame("", value);
assertTrue(value.contains("Amin Mohammed-Coleman"));
assertTrue(value.contains("This is a test rtf document that will be indexed."));
String path = document.get(FieldNameEnum.PATH.getDescription());
assertNotNull(path);
assertTrue(path.contains(".rtf"));
String fileName = document.get(FieldNameEnum.NAME.getDescription());
assertNotNull(fileName);
assertEquals(RTF_FILE_NAME, fileName);
assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(), document.get(FieldNameEnum.TYPE.getDescription()));

}



@Test
public void testCanSearchRtfDocument() throws Exception {
JavaBuiltInRTFHandler builtInRTFHandler = new JavaBuiltInRTFHandler();
Document document = builtInRTFHandler.getDocument(rtfFile);
IndexWriter indexWriter = new IndexWriter(getDirectory(),getAnalyzer(),new IndexWriter.MaxFieldLength(2));
try {
indexWriter.addDocument(document);
commitAndCloseWriter(indexWriter);
} catch (CorruptIndexException e) {
throw new IllegalStateException(e);
} catch (IOException e) {
throw new IllegalStateException(e);
}

//I plan to use other searchers later
IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
MultiSearcher multiSearcher = new MultiSearcher(new Searchable[] {indexSearcher}); QueryParser queryParser = new MultiFieldQueryParser(new String[] {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
Query query = queryParser.parse("amin");
TopDocs topDocs = multiSearcher.search(query, BooleanQuery.getMaxClauseCount());
assertNotNull(topDocs);
assertEquals(1, topDocs.totalHits);
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
for (ScoreDoc scoreDoc : scoreDocs) {
Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
assertNotNull(documentFromSearch);
String bodyText = documentFromSearch.get(FieldNameEnum.BODY.getDescription());
assertNotNull(bodyText);
assertNotSame("", bodyText);
assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
assertTrue(bodyText.contains("This is a test rtf document that will be indexed."));

}
multiSearcher.close();

}

@After
public void tearDown() throws Exception {
rtfFile.delete();
if (getDirectory().list() != null && getDirectory().list().length > 0) {
IndexReader reader = IndexReader.open(getDirectory());
for(int i = 0; i < reader.maxDoc();i++) {
reader.deleteDocument(i);
}
reader.close();
}
}

private void commitAndCloseWriter(IndexWriter indexWriter) throws CorruptIndexException,IOException {
indexWriter.commit();
indexWriter.close();
}


public Directory getDirectory() throws IOException {
return FSDirectory.getDirectory("/tmp/lucene/rtf");
}

public Analyzer getAnalyzer() {
return new StandardAnalyzer();
}
private static void convertInputStreamToFile(InputStream inputStream, File file) {
try
    {
    OutputStream out=new FileOutputStream(file);
    byte buf[]=new byte[1024];
    int len;
    while((len=inputStream.read(buf))>0)
    out.write(buf,0,len);
    out.close();
    inputStream.close();

    }catch (IOException e){
    throw new IllegalStateException(e);
    }
}
private static class JavaBuiltInRTFHandler implements DocumentHandler{

public Document getDocument(File file) throws DocumentHandlerException {
String bodyText = null;
DefaultStyledDocument styledDoc = new DefaultStyledDocument();
try {
InputStream inputStream = new FileInputStream(file);
new RTFEditorKit().read(inputStream, styledDoc, 0);
bodyText = styledDoc.getText(0, styledDoc.getLength());
} catch (IOException ioex) {
throw new IllegalStateException(ioex);
} catch (BadLocationException e) {
throw new IllegalArgumentException(e);
}
//create Document object using body
if (bodyText != null) {
Document document = new Document();
String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
Field field = new Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText, Field.Store.YES, Field.Index.ANALYZED);
document.add(field);

String pathToFile = file.getPath();
Field pathToFileField = new Field(FieldNameEnum.PATH.getDescription(),pathToFile, Field.Store.YES, Field.Index.NOT_ANALYZED);
document.add(pathToFileField);

String fileName = file.getName();
Field fileNameField = new Field(FieldNameEnum.NAME.getDescription(),fileName, Field.Store.YES, Field.Index.NOT_ANALYZED);
document.add(fileNameField);

Field typeField = new Field (FieldNameEnum.TYPE.getDescription(),IndexerType.RTF_INDEXER.name(), Field.Store.YES, Field.Index.NOT_ANALYZED);
document.add(typeField);

String summary = bodyText.substring(0, 10);

Field summaryField = new Field(FieldNameEnum.SUMMARY.getDescription(),summary, Field.Store.YES, Field.Index.NOT_ANALYZED);
document.add(summaryField);

return document;
}
return null;
}
}

private static class WorkItem {

public enum WorkItemEvent {
ADD,
UPDATE,
DELETE;
}

public enum IndexerType {
RTF_INDEXER,
PDF_INDEXER,
XML_INDEXER,
PLAIN_TEXT_INDEXER,
MS_WORD_INDEXER,
MS_EXCEL_INDEXER,
MS_POWERPOINT_INDEXER;
}


private final Document workLoad;

private final WorkItemEvent workItemEvent;

private final IndexerType indexerType;


public WorkItem(final Document workLoad, final WorkItemEvent workItemEvent) {
this.workLoad = workLoad;
this.workItemEvent = workItemEvent;
String type = this.workLoad.get("type");
this.indexerType = IndexerType.valueOf(type);
}

public IndexerType getIndexerType() {
return indexerType;
}

public Document getWorkLoad() {
return workLoad;
}

public WorkItemEvent getWorkItemEvent() {
return workItemEvent;
}
}

private enum FieldNameEnum {

AUTHOR("author"),
BODY("body"),
TITLE("title"),
SUBJECT("subject"),
KEYWORDS("keywords"),
PATH("path"), NAME ("name"),
TYPE("type"),
ID ("id"),
SUMMARY ("summary");

private final String description;

private FieldNameEnum(final String description) {
this.description = description;
}

public String getDescription() {
return this.description;
}
}
}

--------------------------
Grant Ingersoll

Lucene Helpful Hints:
http://wiki.apache.org/lucene-java/BasicsOfPerformance
http://wiki.apache.org/lucene-java/LuceneFAQ










Reply via email to