Sorry, if you mean the Java code, it is included below: import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.util.PDFTextStripper; public class Test { /** * @param args */ public static void main(String[] args) throws Exception{ doIndex("C:/data", "c:/temp"); BooleanQuery mainQuery= new BooleanQuery() ; QueryParser qp= new QueryParser("contents", new CustomAnalyzer()); Query pQuery= qp.parse("\"text classification\""); mainQuery.add(pQuery, BooleanClause.Occur.SHOULD); pQuery= qp.parse("\"Information Extraction\""); mainQuery.add(pQuery, BooleanClause.Occur.SHOULD); IndexSearcher searcher = new IndexSearcher("c:/temp"); Hits hits = searcher.search(mainQuery); if (hits.length()==0) { System.out.println("No documents match "); } else { for (int i=0;i<hits.length();i++){ String path = hits.doc(i).get("path"); Explanation ex = searcher.explain(mainQuery, hits.id(i)); System.out.println(path +"\n"); System.out.println(ex.toString()); } } } private static void doIndex(String docsDir, String indexDir) throws Exception { File indexDirectory= new File(indexDir); File docsDirectory= new File(docsDir); IndexWriter writer= new IndexWriter(indexDirectory, new StandardAnalyzer(),true); indexDirectory(writer, docsDirectory); writer.optimize(); writer.close(); } private static void indexFile(IndexWriter writer, File file) throws Exception { if (file.isHidden()|| !file.exists() || !file.canRead()) return; String fileContents= getTextFromPDF(file); Document doc= new Document(); Field field1= new Field("contents", fileContents, 
Field.Store.YES, Field.Index.TOKENIZED); doc.add(field1); Field field2= new Field ("path", file.getAbsolutePath(), Field.Store.YES, Field.Index.UN_TOKENIZED); doc.add(field2); writer.addDocument(doc); doc=null; } private static void indexDirectory(IndexWriter writer, File docsDirectory) throws Exception { File[] fileList = docsDirectory.listFiles(); for (int i=0; i< fileList.length;i++){ File file= fileList[i]; if (file.isDirectory()){ indexDirectory(writer, file); } else if (file.isFile() && file.getName().endsWith(".pdf")){ indexFile(writer, file); } } } private static String getTextFromPDF(File pdfFile) { // extract PDF document's textual content String docText = null; PDDocument pdDoc; try { pdDoc= PDDocument.load(pdfFile); PDFTextStripper stripper = new PDFTextStripper(); docText = stripper.getText(pdDoc); pdDoc.close(); } catch (Exception e) { System.out.println(pdfFile.getAbsolutePath()); } return docText; } } On Nov 27, 2007 9:59 PM, Erick Erickson <[EMAIL PROTECTED]> wrote: > Attachments often do not come through, at least they aren't visible to > me using g-mail. So you might want to re-send them in-line. > > But another thing you can do is get a copy of luke and examine > your index to see if the actual contents of doc1 and doc2 are what > you expect. You can even run queries through it (but watch to insure > that you're using the correct analyzer) and see what is returned.... > > Best > Erick > > On Nov 27, 2007 3:54 PM, Ng Vinny <[EMAIL PROTECTED]> wrote: > > > Hi all, > > > > I am having a problem with Lucene 2.2.0 with regard to the contents of > the > > Explanation objects after a PhraseQuery search. I indexed two documents > doc1 > > and doc2 and then issue an OR Boolean query consisting of two > PhraseQuery > > pq1 and pq2. > > > > Apparently, the details of the Explanation object for doc1 show that pq1 > > has positive tf value even though it doesn't appear in doc1. 
The tf > value is > > exactly the same as that of the tf value for pq1 in doc2 (pq1 does > appear in > > doc2). > > > > The code is attached (the sample pdf files cannot be attached due to > size > > restriction on the list). > > > > Please help to shed some light on this. > > > > Thank you very much > > Ng Vinny > > > > > > --------------------------------------------------------------------- > > To unsubscribe, e-mail: [EMAIL PROTECTED] > > For additional commands, e-mail: [EMAIL PROTECTED] > > >