From: "ami...@gmail.com" <ami...@gmail.com>
Date: January 3, 2009 3:24:52 PM EST
To: gsing...@apache.org
Subject: Search Test file
I've shared a document with you called "Search Test file":
http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
It's not an attachment -- it's stored online at Google Docs. To open
this document, just click the link above.
---
Hi
I have uploaded the test file to Google Docs. It is currently a .txt
file, but if you change the extension to .java it should work.
package com.amin.app.lucene.search.impl;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNotSame;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.ant.DocumentHandler;
import org.apache.lucene.ant.DocumentHandlerException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searchable;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import com.amin.app.lucene.util.WorkItem.IndexerType;
public class SearchTest {
private File rtfFile = null;
private static final String RTF_FILE_NAME = "rtfDocumentToIndex.rtf";
/**
 * Copies the RTF fixture from the classpath into a working file named
 * {@code RTF_FILE_NAME} so that each test can pass a real {@link File} to the handler.
 *
 * @throws IllegalStateException if the fixture is not on the classpath or cannot be copied
 */
@Before
public void setUp() throws Exception {
    // Assign the field first so tearDown always has a File to delete, even if copying fails.
    rtfFile = new File(RTF_FILE_NAME);
    InputStream inputStream =
            this.getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
    // getResourceAsStream reports a missing resource by returning null, not by throwing.
    if (inputStream == null) {
        throw new IllegalStateException(
                "Test resource not found on classpath: " + RTF_FILE_NAME);
    }
    convertInputStreamToFile(inputStream, rtfFile);
}
@Test
public void testCanCreateLuceneDocumentForRTFDocument() throws
Exception {
JavaBuiltInRTFHandler builtInRTFHandler = new JavaBuiltInRTFHandler();
Document document = builtInRTFHandler.getDocument(rtfFile);
assertNotNull(document);
String value = document.get(FieldNameEnum.BODY.getDescription());
assertNotNull(value);
assertNotSame("", value);
assertTrue(value.contains("Amin Mohammed-Coleman"));
assertTrue(value.contains("This is a test rtf document that will be
indexed."));
String path = document.get(FieldNameEnum.PATH.getDescription());
assertNotNull(path);
assertTrue(path.contains(".rtf"));
String fileName = document.get(FieldNameEnum.NAME.getDescription());
assertNotNull(fileName);
assertEquals(RTF_FILE_NAME, fileName);
assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),
document.get(FieldNameEnum.TYPE.getDescription()));
}
@Test
public void testCanSearchRtfDocument() throws Exception {
JavaBuiltInRTFHandler builtInRTFHandler = new JavaBuiltInRTFHandler();
Document document = builtInRTFHandler.getDocument(rtfFile);
IndexWriter indexWriter = new
IndexWriter(getDirectory(),getAnalyzer(),new
IndexWriter.MaxFieldLength(2));
try {
indexWriter.addDocument(document);
commitAndCloseWriter(indexWriter);
} catch (CorruptIndexException e) {
throw new IllegalStateException(e);
} catch (IOException e) {
throw new IllegalStateException(e);
}
//I plan to use other searchers later
IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]
{indexSearcher});
QueryParser queryParser = new MultiFieldQueryParser(new String[]
{FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
Query query = queryParser.parse("amin");
TopDocs topDocs = multiSearcher.search(query,
BooleanQuery.getMaxClauseCount());
assertNotNull(topDocs);
assertEquals(1, topDocs.totalHits);
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
for (ScoreDoc scoreDoc : scoreDocs) {
Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
assertNotNull(documentFromSearch);
String bodyText =
documentFromSearch.get(FieldNameEnum.BODY.getDescription());
assertNotNull(bodyText);
assertNotSame("", bodyText);
assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
assertTrue(bodyText.contains("This is a test rtf document that will
be indexed."));
}
multiSearcher.close();
}
/**
 * Removes the copied RTF file and marks every document in the test index as
 * deleted so the next test starts without leftover hits.
 */
@After
public void tearDown() throws Exception {
    // rtfFile is still null if setUp did not get as far as assigning it.
    if (rtfFile != null) {
        rtfFile.delete();
    }

    // Look the directory up once instead of once per use.
    Directory directory = getDirectory();
    try {
        String[] indexFiles = directory.list();
        if (indexFiles != null && indexFiles.length > 0) {
            IndexReader reader = IndexReader.open(directory);
            try {
                for (int i = 0; i < reader.maxDoc(); i++) {
                    reader.deleteDocument(i);
                }
            } finally {
                // Close the reader even if a delete fails.
                reader.close();
            }
        }
    } finally {
        // Release the directory handle obtained above.
        directory.close();
    }
}
/**
 * Commits pending changes and closes the writer.
 *
 * @param indexWriter the writer to commit and close
 * @throws CorruptIndexException if the writer reports a corrupt index
 * @throws IOException if committing or closing fails
 */
private void commitAndCloseWriter(IndexWriter indexWriter)
        throws CorruptIndexException, IOException {
    try {
        indexWriter.commit();
    } finally {
        // Close even when commit() throws, so the writer is not left open.
        indexWriter.close();
    }
}
/**
 * Returns the file-system directory that holds the test index.
 *
 * The location is derived from the {@code java.io.tmpdir} system property rather
 * than a hard-coded {@code /tmp}, so the test does not depend on a Unix-style
 * file system layout.
 *
 * @return the Lucene directory for {@code <tmpdir>/lucene/rtf}
 * @throws IOException if the directory cannot be obtained
 */
public Directory getDirectory() throws IOException {
    File tmpDir = new File(System.getProperty("java.io.tmpdir"));
    File indexDir = new File(new File(tmpDir, "lucene"), "rtf");
    return FSDirectory.getDirectory(indexDir);
}
/**
 * Creates the analyzer used by this test.
 *
 * @return a new {@link StandardAnalyzer} on every call
 */
public Analyzer getAnalyzer() {
    final Analyzer standardAnalyzer = new StandardAnalyzer();
    return standardAnalyzer;
}
/**
 * Copies everything from {@code inputStream} into {@code file} and closes both
 * streams, whether or not the copy succeeds.
 *
 * @param inputStream source of the bytes; closed by this method
 * @param file destination file; created or overwritten
 * @throws IllegalArgumentException if {@code inputStream} is null
 * @throws IllegalStateException if reading, writing or closing fails
 */
private static void convertInputStreamToFile(InputStream inputStream, File file) {
    // Reject a null source before the destination is opened, so no empty file is created.
    if (inputStream == null) {
        throw new IllegalArgumentException("inputStream must not be null");
    }
    try {
        try {
            OutputStream out = new FileOutputStream(file);
            try {
                byte[] buf = new byte[1024];
                int len;
                // -1 is the end-of-stream marker; a zero-length read must not end the copy.
                while ((len = inputStream.read(buf)) != -1) {
                    out.write(buf, 0, len);
                }
            } finally {
                out.close();
            }
        } finally {
            inputStream.close();
        }
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
/**
 * {@link DocumentHandler} that reads an RTF file with the JDK's
 * {@link RTFEditorKit} and builds a Lucene {@link Document} from its text.
 */
private static class JavaBuiltInRTFHandler implements DocumentHandler {

    /** Maximum number of characters copied into the summary field. */
    private static final int SUMMARY_LENGTH = 10;

    /**
     * Extracts the text of {@code file} and stores it, together with the file's
     * path, name, indexer type and a short summary, in a new document.
     *
     * @param file the RTF file to read
     * @return the populated document, or {@code null} if no body text was obtained
     * @throws IllegalStateException if the file cannot be read
     * @throws IllegalArgumentException if the parsed text cannot be extracted
     */
    public Document getDocument(File file) throws DocumentHandlerException {
        String bodyText = null;
        DefaultStyledDocument styledDoc = new DefaultStyledDocument();
        try {
            InputStream inputStream = new FileInputStream(file);
            try {
                new RTFEditorKit().read(inputStream, styledDoc, 0);
            } finally {
                // Always release the file handle, including when parsing fails.
                inputStream.close();
            }
            bodyText = styledDoc.getText(0, styledDoc.getLength());
        } catch (IOException ioex) {
            throw new IllegalStateException(ioex);
        } catch (BadLocationException e) {
            throw new IllegalArgumentException(e);
        }

        if (bodyText == null) {
            return null;
        }

        //create Document object using body
        Document document = new Document();

        String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
        // Line feeds are removed without inserting a space, so text on adjacent
        // lines is joined directly.
        trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
        Field field = new Field(FieldNameEnum.BODY.getDescription(), trimmedBodyText,
                Field.Store.YES, Field.Index.ANALYZED);
        document.add(field);

        String pathToFile = file.getPath();
        Field pathToFileField = new Field(FieldNameEnum.PATH.getDescription(), pathToFile,
                Field.Store.YES, Field.Index.NOT_ANALYZED);
        document.add(pathToFileField);

        String fileName = file.getName();
        Field fileNameField = new Field(FieldNameEnum.NAME.getDescription(), fileName,
                Field.Store.YES, Field.Index.NOT_ANALYZED);
        document.add(fileNameField);

        Field typeField = new Field(FieldNameEnum.TYPE.getDescription(),
                IndexerType.RTF_INDEXER.name(),
                Field.Store.YES, Field.Index.NOT_ANALYZED);
        document.add(typeField);

        // Clamp to the text length: substring(0, 10) throws for bodies shorter than 10 chars.
        String summary = bodyText.substring(0, Math.min(SUMMARY_LENGTH, bodyText.length()));
        Field summaryField = new Field(FieldNameEnum.SUMMARY.getDescription(), summary,
                Field.Store.YES, Field.Index.NOT_ANALYZED);
        document.add(summaryField);

        return document;
    }
}
/**
 * Immutable pairing of a Lucene {@link Document} with the event requested for it
 * and the indexer type named by the document's "type" field.
 */
private static class WorkItem {

    /** Event attached to a work item. */
    public enum WorkItemEvent {
        ADD,
        UPDATE,
        DELETE;
    }

    /** Indexer types; the constant names are the values expected in the "type" field. */
    public enum IndexerType {
        RTF_INDEXER,
        PDF_INDEXER,
        XML_INDEXER,
        PLAIN_TEXT_INDEXER,
        MS_WORD_INDEXER,
        MS_EXCEL_INDEXER,
        MS_POWERPOINT_INDEXER;
    }

    private final Document workLoad;
    private final WorkItemEvent workItemEvent;
    private final IndexerType indexerType;

    /**
     * Creates a work item and resolves its indexer type from the document.
     *
     * @param workLoad the document to process; must not be null and must carry a
     *        "type" field whose value is the name of an {@link IndexerType} constant
     * @param workItemEvent the event to associate with the document
     * @throws NullPointerException if {@code workLoad} is null
     * @throws IllegalArgumentException if the "type" field is missing or does not
     *         name an {@link IndexerType} constant
     */
    public WorkItem(final Document workLoad, final WorkItemEvent workItemEvent) {
        if (workLoad == null) {
            throw new NullPointerException("workLoad must not be null");
        }
        final String type = workLoad.get("type");
        // Fail with a descriptive message instead of the bare NullPointerException
        // that IndexerType.valueOf(null) would raise.
        if (type == null) {
            throw new IllegalArgumentException("workLoad has no \"type\" field");
        }
        this.workLoad = workLoad;
        this.workItemEvent = workItemEvent;
        this.indexerType = IndexerType.valueOf(type);
    }

    /** @return the indexer type resolved from the document's "type" field */
    public IndexerType getIndexerType() {
        return indexerType;
    }

    /** @return the document passed to the constructor */
    public Document getWorkLoad() {
        return workLoad;
    }

    /** @return the event passed to the constructor */
    public WorkItemEvent getWorkItemEvent() {
        return workItemEvent;
    }
}
/**
 * Field names used for the Lucene documents in this test. Each constant wraps the
 * string that is passed to Lucene as the field name.
 */
private enum FieldNameEnum {
    AUTHOR("author"),
    BODY("body"),
    TITLE("title"),
    SUBJECT("subject"),
    KEYWORDS("keywords"),
    PATH("path"),
    NAME("name"),
    TYPE("type"),
    ID("id"),
    SUMMARY("summary");

    /** The field name string associated with this constant. */
    private final String fieldName;

    // Enum constructors are implicitly private.
    FieldNameEnum(final String fieldName) {
        this.fieldName = fieldName;
    }

    /**
     * @return the field name string given to this constant
     */
    public String getDescription() {
        return fieldName;
    }
}
}