Fwd: Search Test file

Grant Ingersoll Sat, 03 Jan 2009 17:19:50 -0800

Hi Amin,

I see a couple of issues with your program below, and one that is thecause of the problem of not finding "amin" as a query term.


When you construct your IndexWriter, you are doing:

IndexWriter indexWriter = newIndexWriter(getDirectory(),getAnalyzer(),newIndexWriter.MaxFieldLength(2));

The MaxFieldLength parameter specifies the maximum number of tokensallowed in a Field. Everything else after that is dropped. See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength)and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html


Also,

TopDocs topDocs = multiSearcher.search(query,BooleanQuery.getMaxClauseCount());

strikes me as really odd. Why are you passing in the max clause countas the number of results you want returned?


Cheers,
Grant



Begin forwarded message:

From: "ami...@gmail.com" <ami...@gmail.com>
Date: January 3, 2009 3:24:52 PM EST
To: gsing...@apache.org
Subject: Search Test file

I've shared a document with you called "Search Test file":
http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj

It's not an attachment -- it's stored online at Google Docs. To openthis document, just click the link above.

---

Hi

I have uploaded the test file at google docs. It is currently a txtfile but if you change the extension to .java it should work.


package com.amin.app.lucene.search.impl;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNotSame;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.ant.DocumentHandler;
import org.apache.lucene.ant.DocumentHandlerException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searchable;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import com.amin.app.lucene.util.WorkItem.IndexerType;

public class SearchTest {

private File rtfFile = null;
private static final String RTF_FILE_NAME = "rtfDocumentToIndex.rtf";

@Before
public void setUp() throws Exception {

InputStream inputStream =this.getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);

rtfFile = new File(RTF_FILE_NAME);
convertInputStreamToFile(inputStream, rtfFile);
}



@Test

public void testCanCreateLuceneDocumentForRTFDocument() throwsException {

JavaBuiltInRTFHandler builtInRTFHandler = new JavaBuiltInRTFHandler();
Document document = builtInRTFHandler.getDocument(rtfFile);
assertNotNull(document);
String value = document.get(FieldNameEnum.BODY.getDescription());
assertNotNull(value);
assertNotSame("", value);
assertTrue(value.contains("Amin Mohammed-Coleman"));

assertTrue(value.contains("This is a test rtf document that will beindexed."));

String path = document.get(FieldNameEnum.PATH.getDescription());
assertNotNull(path);
assertTrue(path.contains(".rtf"));
String fileName = document.get(FieldNameEnum.NAME.getDescription());
assertNotNull(fileName);
assertEquals(RTF_FILE_NAME, fileName);

assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),document.get(FieldNameEnum.TYPE.getDescription()));


}



@Test
public void testCanSearchRtfDocument() throws Exception {
JavaBuiltInRTFHandler builtInRTFHandler = new JavaBuiltInRTFHandler();
Document document = builtInRTFHandler.getDocument(rtfFile);

IndexWriter indexWriter = newIndexWriter(getDirectory(),getAnalyzer(),newIndexWriter.MaxFieldLength(2));

try {
indexWriter.addDocument(document);
commitAndCloseWriter(indexWriter);
} catch (CorruptIndexException e) {
throw new IllegalStateException(e);
} catch (IOException e) {
throw new IllegalStateException(e);
}

//I plan to use other searchers later
IndexSearcher indexSearcher = new IndexSearcher(getDirectory());

MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]{indexSearcher});QueryParser queryParser = new MultiFieldQueryParser(new String[]{FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());

Query query = queryParser.parse("amin");

TopDocs topDocs = multiSearcher.search(query,BooleanQuery.getMaxClauseCount());

assertNotNull(topDocs);
assertEquals(1, topDocs.totalHits);
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
for (ScoreDoc scoreDoc : scoreDocs) {
Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
assertNotNull(documentFromSearch);

String bodyText =documentFromSearch.get(FieldNameEnum.BODY.getDescription());

assertNotNull(bodyText);
assertNotSame("", bodyText);
assertTrue(bodyText.contains("Amin Mohammed-Coleman"));

assertTrue(bodyText.contains("This is a test rtf document that willbe indexed."));


}
multiSearcher.close();

}

@After
public void tearDown() throws Exception {
rtfFile.delete();

if (getDirectory().list() != null && getDirectory().list().length >0) {

IndexReader reader = IndexReader.open(getDirectory());
for(int i = 0; i < reader.maxDoc();i++) {
reader.deleteDocument(i);
}
reader.close();
}
}

private void commitAndCloseWriter(IndexWriter indexWriter) throwsCorruptIndexException,IOException {

indexWriter.commit();
indexWriter.close();
}


public Directory getDirectory() throws IOException {
return FSDirectory.getDirectory("/tmp/lucene/rtf");
}

public Analyzer getAnalyzer() {
return new StandardAnalyzer();
}

private static void convertInputStreamToFile(InputStreaminputStream, File file) {

try
    {
    OutputStream out=new FileOutputStream(file);
    byte buf[]=new byte[1024];
    int len;
    while((len=inputStream.read(buf))>0)
    out.write(buf,0,len);
    out.close();
    inputStream.close();

    }catch (IOException e){
    throw new IllegalStateException(e);
    }
}
private static class JavaBuiltInRTFHandler implements DocumentHandler{

public Document getDocument(File file) throwsDocumentHandlerException {

String bodyText = null;
DefaultStyledDocument styledDoc = new DefaultStyledDocument();
try {
InputStream inputStream = new FileInputStream(file);
new RTFEditorKit().read(inputStream, styledDoc, 0);
bodyText = styledDoc.getText(0, styledDoc.getLength());
} catch (IOException ioex) {
throw new IllegalStateException(ioex);
} catch (BadLocationException e) {
throw new IllegalArgumentException(e);
}
//create Document object using body
if (bodyText != null) {
Document document = new Document();
String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
trimmedBodyText = trimmedBodyText.replaceAll("\n", "");

Field field = newField(FieldNameEnum.BODY.getDescription(),trimmedBodyText,Field.Store.YES, Field.Index.ANALYZED);

document.add(field);

String pathToFile = file.getPath();

Field pathToFileField = newField(FieldNameEnum.PATH.getDescription(),pathToFile,Field.Store.YES, Field.Index.NOT_ANALYZED);

document.add(pathToFileField);

String fileName = file.getName();

Field fileNameField = newField(FieldNameEnum.NAME.getDescription(),fileName, Field.Store.YES,Field.Index.NOT_ANALYZED);

document.add(fileNameField);

Field typeField = newField(FieldNameEnum.TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),Field.Store.YES, Field.Index.NOT_ANALYZED);

document.add(typeField);

String summary = bodyText.substring(0, 10);

Field summaryField = newField(FieldNameEnum.SUMMARY.getDescription(),summary,Field.Store.YES, Field.Index.NOT_ANALYZED);

document.add(summaryField);

return document;
}
return null;
}
}

private static class WorkItem {

public enum WorkItemEvent {
ADD,
UPDATE,
DELETE;
}

public enum IndexerType {
RTF_INDEXER,
PDF_INDEXER,
XML_INDEXER,
PLAIN_TEXT_INDEXER,
MS_WORD_INDEXER,
MS_EXCEL_INDEXER,
MS_POWERPOINT_INDEXER;
}


private final Document workLoad;

private final WorkItemEvent workItemEvent;

private final IndexerType indexerType;

public WorkItem(final Document workLoad, final WorkItemEventworkItemEvent) {

this.workLoad = workLoad;
this.workItemEvent = workItemEvent;
String type = this.workLoad.get("type");
this.indexerType = IndexerType.valueOf(type);
}

public IndexerType getIndexerType() {
return indexerType;
}

public Document getWorkLoad() {
return workLoad;
}

public WorkItemEvent getWorkItemEvent() {
return workItemEvent;
}
}

private enum FieldNameEnum {

AUTHOR("author"),
BODY("body"),
TITLE("title"),
SUBJECT("subject"),
KEYWORDS("keywords"),
PATH("path"), NAME ("name"),
TYPE("type"),
ID ("id"),
SUMMARY ("summary");

private final String description;

private FieldNameEnum(final String description) {
this.description = description;
}

public String getDescription() {
return this.description;
}
}
}


--------------------------
Grant Ingersoll

Lucene Helpful Hints:
http://wiki.apache.org/lucene-java/BasicsOfPerformance
http://wiki.apache.org/lucene-java/LuceneFAQ

Fwd: Search Test file

Reply via email to