Hello, this is the BM25 algorithm I implemented in Lucene. It doesn't seem to work: I have compared my results with those of MG4J (using the same document set) and they do not match.
I don't know if I have a wrong formule or there are another mistake Could you help me ? -------------------------------------------------------------------------------------------------------------------------------- public class BM25Scorer extends Scorer { private final static double EPSILON_SCORE = 1.000000082240371E-9; private final static double DEFAULT_K1 = 0.75d; private final static double DEFAULT_B = 0.95d; private double b = DEFAULT_B; private double k1 = DEFAULT_K1; private IndexReader reader; private Term term; private Hits hits; private int position; // document position in hits private IndexSearcher searcher; private int cooc = 0; // How many times a term appears in the document private float idf; public float score() throws IOException { TermFreqVector tfv = reader.getTermFreqVector( hits.id(position), term.field() ); String[] terms = tfv.getTerms(); int[] freqs = tfv.getTermFrequencies(); for (int i = 0 ; i < terms.length ; i++) { if( terms[i].equalsIgnoreCase(term.text()) ){ cooc = freqs[i]; } } idf = searcher.getSimilarity().idf(term, searcher); Document document = (Document)hits.doc(position); String[] values = document.getValues("DOCUMENT_LENGTH"); // document length is a field of my index long docLength = Long.valueOf(values[0]).longValue(); // document lenght (number of words) long averageLength = 200; double loga = Math.max( EPSILON_SCORE, new Float(idf ).doubleValue()); double score = ( loga * (k1 + 1) * cooc ) / (cooc + k1*( (1-b) + (b*docLength/averageLength) ) ); return new Float(score).floatValue(); }