Sorry -- if you mean the Java code, here it is inline:

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

/**
 * Small reproduction program for inspecting Lucene {@link Explanation} output.
 *
 * <p>It indexes every PDF file found under a documents directory, runs a
 * boolean OR query made of two phrase queries against the {@code contents}
 * field, and prints the stored path and the score explanation of each hit.
 */
public class Test {

    /** Directory that is scanned recursively for PDF files to index. */
    private static final String DOCS_DIR = "C:/data";

    /** Directory in which the Lucene index is (re)created and searched. */
    private static final String INDEX_DIR = "c:/temp";

    /**
     * Rebuilds the index, runs the two-phrase OR query and prints the results.
     *
     * @param args command-line arguments; not used
     * @throws Exception if indexing, query parsing or searching fails
     */
    public static void main(String[] args) throws Exception {

        doIndex(DOCS_DIR, INDEX_DIR);

        // NOTE(review): the index is built with StandardAnalyzer (see doIndex)
        // but the queries are parsed with CustomAnalyzer, which is defined
        // outside this file. If the two tokenize differently, phrase matches
        // and the resulting explanations may not be what you expect -- confirm
        // that this mismatch is intentional.
        QueryParser qp = new QueryParser("contents", new CustomAnalyzer());

        BooleanQuery mainQuery = new BooleanQuery();
        mainQuery.add(qp.parse("\"text classification\""), BooleanClause.Occur.SHOULD);
        mainQuery.add(qp.parse("\"Information Extraction\""), BooleanClause.Occur.SHOULD);

        IndexSearcher searcher = new IndexSearcher(INDEX_DIR);
        try {
            Hits hits = searcher.search(mainQuery);
            if (hits.length() == 0) {
                System.out.println("No documents match ");
                return;
            }
            for (int i = 0; i < hits.length(); i++) {
                String path = hits.doc(i).get("path");
                Explanation ex = searcher.explain(mainQuery, hits.id(i));
                System.out.println(path + "\n");
                System.out.println(ex.toString());
            }
        } finally {
            // Release the index files even when searching or explaining fails.
            searcher.close();
        }
    }

    /**
     * Creates a fresh index in {@code indexDir} from the PDF files found
     * under {@code docsDir}, then optimizes it.
     *
     * @param docsDir  path of the directory tree containing the PDF files
     * @param indexDir path of the directory in which the index is created;
     *                 an existing index there is overwritten
     * @throws Exception if the index cannot be written
     */
    private static void doIndex(String docsDir, String indexDir) throws Exception {
        File indexDirectory = new File(indexDir);
        File docsDirectory = new File(docsDir);

        IndexWriter writer = new IndexWriter(indexDirectory, new StandardAnalyzer(), true);
        try {
            indexDirectory(writer, docsDirectory);
            writer.optimize();
        } finally {
            // Always close the writer so a failure part-way through does not
            // leave it open.
            writer.close();
        }
    }

    /**
     * Adds one PDF file to the index as a document with a tokenized, stored
     * {@code contents} field and an untokenized, stored {@code path} field.
     *
     * <p>Hidden, missing or unreadable files are skipped, as are files whose
     * text could not be extracted.
     *
     * @param writer the open index writer to add the document to
     * @param file   the PDF file to index
     * @throws Exception if the document cannot be added to the index
     */
    private static void indexFile(IndexWriter writer, File file) throws Exception {
        if (file.isHidden() || !file.exists() || !file.canRead()) {
            return;
        }

        String fileContents = getTextFromPDF(file);
        if (fileContents == null) {
            // Extraction failed (already reported). A Field cannot be built
            // from a null value, so skip this file instead of aborting the
            // whole indexing run.
            return;
        }

        Document doc = new Document();
        doc.add(new Field("contents", fileContents, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                Field.Index.UN_TOKENIZED));
        writer.addDocument(doc);
    }

    /**
     * Recursively indexes every PDF file below {@code docsDirectory}.
     *
     * @param writer        the open index writer to add documents to
     * @param docsDirectory the directory to scan
     * @throws Exception if a document cannot be added to the index
     */
    private static void indexDirectory(IndexWriter writer, File docsDirectory) throws Exception {
        File[] fileList = docsDirectory.listFiles();
        if (fileList == null) {
            // listFiles() returns null when the path is not a directory or
            // cannot be read; report it rather than failing on fileList.length.
            System.err.println("Cannot list directory: " + docsDirectory.getAbsolutePath());
            return;
        }

        for (int i = 0; i < fileList.length; i++) {
            File file = fileList[i];
            if (file.isDirectory()) {
                indexDirectory(writer, file);
            } else if (file.isFile() && hasPdfExtension(file.getName())) {
                indexFile(writer, file);
            }
        }
    }

    /**
     * Tells whether a file name ends with {@code .pdf}, ignoring case so that
     * names such as {@code REPORT.PDF} are accepted as well.
     *
     * @param fileName the simple file name to test
     * @return {@code true} if the name ends with ".pdf" in any letter case
     */
    private static boolean hasPdfExtension(String fileName) {
        String suffix = ".pdf";
        // A negative offset (name shorter than the suffix) makes regionMatches
        // return false, so no explicit length check is needed.
        return fileName.regionMatches(true, fileName.length() - suffix.length(),
                suffix, 0, suffix.length());
    }

    /**
     * Extracts the textual content of a PDF file.
     *
     * <p>Extraction is best-effort: a failure is reported on standard error
     * and signalled to the caller by a {@code null} return value instead of
     * an exception.
     *
     * @param pdfFile the PDF file to read
     * @return the extracted text, or {@code null} if the file could not be
     *         loaded or its text could not be extracted
     */
    private static String getTextFromPDF(File pdfFile) {
        PDDocument pdDoc = null;
        try {
            pdDoc = PDDocument.load(pdfFile);
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(pdDoc);
        } catch (Exception e) {
            // Keep going with the remaining files, but say what went wrong.
            System.err.println("Could not extract text from "
                    + pdfFile.getAbsolutePath() + ": " + e);
            return null;
        } finally {
            if (pdDoc != null) {
                try {
                    pdDoc.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing fails; the result
                    // (or the reported failure) above still stands.
                }
            }
        }
    }

}


On Nov 27, 2007 9:59 PM, Erick Erickson <[EMAIL PROTECTED]> wrote:

> Attachments often do not come through, at least they aren't visible to
> me using g-mail. So you might want to re-send them in-line.
>
> But another thing you can do is get a copy of luke and examine
> your index to see if the actual contents of doc1 and doc2 are what
> you expect. You can even run queries through it (but watch to ensure
> that you're using the correct analyzer) and see what is returned....
>
> Best
> Erick
>
> On Nov 27, 2007 3:54 PM, Ng Vinny <[EMAIL PROTECTED]> wrote:
>
> > Hi all,
> >
> > I am having a problem with Lucene 2.2.0 with regard to the contents of the
> > Explanation objects after a PhraseQuery search. I indexed two documents,
> > doc1 and doc2, and then issued an OR Boolean query consisting of two
> > PhraseQuery objects, pq1 and pq2.
> >
> > Apparently, the details of the Explanation object for doc1 show that pq1
> > has a positive tf value even though it doesn't appear in doc1. The tf value
> > is exactly the same as the tf value for pq1 in doc2 (pq1 does appear in
> > doc2).
> >
> > The code is attached (the sample pdf files cannot be attached due to
> size
> > restriction on the list).
> >
> > Please help to shed some light on this.
> >
> > Thank you very much
> > Ng Vinny
> >
> >
> > ---------------------------------------------------------------------
> > To unsubscribe, e-mail: [EMAIL PROTECTED]
> > For additional commands, e-mail: [EMAIL PROTECTED]
> >
>

Reply via email to