Sorry, if you mean the Java code, then it is as below:
import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
/**
 * Small driver that indexes the text of PDF files with Lucene, runs an OR
 * query made of two phrase queries against the "contents" field, and prints
 * the path and the scoring {@link Explanation} of every hit.
 */
public class Test {

    /**
     * Indexes the PDFs under {@code C:/data} into {@code c:/temp}, searches for
     * either of two phrases and prints each hit's path and explanation.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if indexing, query parsing or searching fails
     */
    public static void main(String[] args) throws Exception {
        doIndex("C:/data", "c:/temp");

        BooleanQuery mainQuery = new BooleanQuery();
        // NOTE(review): the index is written with StandardAnalyzer (see doIndex)
        // but queries are parsed with CustomAnalyzer, which is not defined in
        // this file. If the two tokenize differently, hits and explanations
        // may not be what you expect -- confirm they are compatible.
        QueryParser qp = new QueryParser("contents", new CustomAnalyzer());
        Query pQuery = qp.parse("\"text classification\"");
        mainQuery.add(pQuery, BooleanClause.Occur.SHOULD);
        pQuery = qp.parse("\"Information Extraction\"");
        mainQuery.add(pQuery, BooleanClause.Occur.SHOULD);

        IndexSearcher searcher = new IndexSearcher("c:/temp");
        try {
            Hits hits = searcher.search(mainQuery);
            if (hits.length() == 0) {
                System.out.println("No documents match ");
            } else {
                for (int i = 0; i < hits.length(); i++) {
                    String path = hits.doc(i).get("path");
                    Explanation ex = searcher.explain(mainQuery, hits.id(i));
                    System.out.println(path + "\n");
                    System.out.println(ex.toString());
                }
            }
        } finally {
            // Release the index files even if searching or explaining fails.
            searcher.close();
        }
    }

    /**
     * Creates a fresh index in {@code indexDir} from every PDF found
     * (recursively) under {@code docsDir}, then optimizes it.
     *
     * @param docsDir  directory containing the PDF files to index
     * @param indexDir directory in which the index is (re)created
     * @throws Exception if the index cannot be written
     */
    private static void doIndex(String docsDir, String indexDir) throws
            Exception {
        File indexDirectory = new File(indexDir);
        File docsDirectory = new File(docsDir);
        IndexWriter writer = new IndexWriter(indexDirectory,
                new StandardAnalyzer(), true);
        try {
            indexDirectory(writer, docsDirectory);
            writer.optimize();
        } finally {
            // Always close the writer so the index lock is released even
            // when indexing fails part-way through.
            writer.close();
        }
    }

    /**
     * Adds one PDF file to the index as a document with a tokenized, stored
     * "contents" field and an untokenized, stored "path" field. Files that
     * are hidden, missing, unreadable, or whose text cannot be extracted are
     * skipped.
     *
     * @param writer open index writer to add the document to
     * @param file   PDF file to index
     * @throws Exception if the document cannot be added to the index
     */
    private static void indexFile(IndexWriter writer, File file) throws
            Exception {
        if (file.isHidden() || !file.exists() || !file.canRead()) {
            return;
        }
        String fileContents = getTextFromPDF(file);
        if (fileContents == null) {
            // Extraction failed (already reported by getTextFromPDF). A Field
            // cannot be built from a null value, so skip this file instead of
            // aborting the whole indexing run.
            return;
        }
        Document doc = new Document();
        doc.add(new Field("contents", fileContents, Field.Store.YES,
                Field.Index.TOKENIZED));
        doc.add(new Field("path", file.getAbsolutePath(),
                Field.Store.YES, Field.Index.UN_TOKENIZED));
        writer.addDocument(doc);
    }

    /**
     * Recursively walks {@code docsDirectory} and indexes every regular file
     * whose name ends with ".pdf".
     *
     * @param writer        open index writer to add documents to
     * @param docsDirectory directory to scan
     * @throws Exception if a document cannot be added to the index
     */
    private static void indexDirectory(IndexWriter writer, File
            docsDirectory) throws Exception {
        File[] fileList = docsDirectory.listFiles();
        if (fileList == null) {
            // listFiles() returns null when the path is not a directory or an
            // I/O error occurs; report it and carry on with the rest.
            System.err.println("Cannot list directory: "
                    + docsDirectory.getAbsolutePath());
            return;
        }
        for (int i = 0; i < fileList.length; i++) {
            File file = fileList[i];
            if (file.isDirectory()) {
                indexDirectory(writer, file);
            } else if (file.isFile() && file.getName().endsWith(".pdf")) {
                indexFile(writer, file);
            }
        }
    }

    /**
     * Extracts the textual content of a PDF file.
     *
     * @param pdfFile PDF file to read
     * @return the extracted text, or {@code null} if the file could not be
     *         loaded or parsed (the failure is reported on standard error)
     */
    private static String getTextFromPDF(File pdfFile) {
        String docText = null;
        PDDocument pdDoc = null;
        try {
            pdDoc = PDDocument.load(pdfFile);
            PDFTextStripper stripper = new PDFTextStripper();
            docText = stripper.getText(pdDoc);
        } catch (Exception e) {
            // Best effort: report which file failed and why, then let the
            // caller skip it.
            System.err.println("Failed to extract text from "
                    + pdfFile.getAbsolutePath() + ": " + e);
        } finally {
            // Close the document on both the success and the failure path.
            if (pdDoc != null) {
                try {
                    pdDoc.close();
                } catch (Exception closeFailure) {
                    System.err.println("Failed to close "
                            + pdfFile.getAbsolutePath() + ": " + closeFailure);
                }
            }
        }
        return docText;
    }
}
On Nov 27, 2007 9:59 PM, Erick Erickson <[EMAIL PROTECTED]> wrote:
> Attachments often do not come through, at least they aren't visible to
> me using g-mail. So you might want to re-send them in-line.
>
> But another thing you can do is get a copy of luke and examine
> your index to see if the actual contents of doc1 and doc2 are what
> you expect. You can even run queries through it (but watch to ensure
> that you're using the correct analyzer) and see what is returned....
>
> Best
> Erick
>
> On Nov 27, 2007 3:54 PM, Ng Vinny <[EMAIL PROTECTED]> wrote:
>
> > Hi all,
> >
> > I am having a problem with Lucene 2.2.0 with regard to the contents of the
> > Explanation objects after a PhraseQuery search. I indexed two documents, doc1
> > and doc2, and then issued an OR Boolean query consisting of two PhraseQuery
> > objects, pq1 and pq2.
> >
> > Apparently, the details of the Explanation object for doc1 show that pq1
> > has a positive tf value even though it doesn't appear in doc1. The tf value is
> > exactly the same as the tf value for pq1 in doc2 (pq1 does appear in
> > doc2).
> >
> > The code is attached (the sample pdf files cannot be attached due to
> size
> > restriction on the list).
> >
> > Please help to shed some light on this.
> >
> > Thank you very much
> > Ng Vinny
> >
> >
> > ---------------------------------------------------------------------
> > To unsubscribe, e-mail: [EMAIL PROTECTED]
> > For additional commands, e-mail: [EMAIL PROTECTED]
> >
>