Hello, this is the BM25 algorithm I implemented in Lucene.

It doesn't work: I compared my results with those of MG4J
(on the same document set) and they do not match.

I don't know whether my formula is wrong or whether there is another mistake.

Could you help me?

--------------------------------------------------------------------------------------------------------------------------------

public class BM25Scorer extends Scorer {

   private final static double EPSILON_SCORE = 1.000000082240371E-9;
   private final static double DEFAULT_K1 = 0.75d;
   private final static double DEFAULT_B = 0.95d;
   private double b = DEFAULT_B;
   private double k1 = DEFAULT_K1;

   private IndexReader reader;
   private Term term;
   private Hits hits;
   private int position;   // document position in hits
   private IndexSearcher searcher;

   private int cooc = 0;    // How many times a term appears in the
document
   private float idf;


   public float score() throws IOException {
       TermFreqVector tfv = reader.getTermFreqVector( hits.id(position),
term.field() );

       String[] terms = tfv.getTerms();
       int[] freqs = tfv.getTermFrequencies();
       for (int i = 0 ; i < terms.length ; i++) {
           if( terms[i].equalsIgnoreCase(term.text()) ){
               cooc = freqs[i];
           }
       }

       idf = searcher.getSimilarity().idf(term, searcher);

       Document document = (Document)hits.doc(position);
       String[] values = document.getValues("DOCUMENT_LENGTH");  //
document length is a field of my index

       long docLength = Long.valueOf(values[0]).longValue();  // document
lenght (number of words)
       long averageLength = 200;

       double loga =  Math.max( EPSILON_SCORE, new Float(idf
).doubleValue());
       double score = ( loga * (k1 + 1) * cooc ) / (cooc + k1*( (1-b) +
(b*docLength/averageLength) ) );

       return new Float(score).floatValue();
   }

Reply via email to