This is my program to calculate TF-IDF value for a document in a collection of documents. This is working fine, but takes lot of time when calculating the "IDF" values (finding the no of documents which contains particular term).
Is there a more efficient way of finding the no of documents which contains a particular term? freq = termsFreq.getTermFrequencies(); terms = termsFreq.getTerms(); int noOfTerms = terms.length; score = new float[noOfTerms]; DefaultSimilarity simi = new DefaultSimilarity(); for (i = 0; i < noOfTerms; i++) { int noofDocsContainTerm = noOfDocsContainTerm(terms[i]); float tf = simi.tf(freq[i]); float idf = simi.idf(noofDocsContainTerm, noOfDocs); score[i] = tf * idf ; } //// public int noOfDocsContainTerm(String querystr) throws CorruptIndexException, IOException, ParseException{ QueryParser qp=new QueryParser(Version.LUCENE_35, "docuemnt", new StandardAnalyzer(Version.LUCENE_35)); Query q=qp.parse(querystr); int hitsPerPage = docNames.length; //minumum number or search results IndexSearcher searcher = new IndexSearcher(ramMemDir, true); TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); searcher.search(q, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; return hits.length; } -- Regards Kasun Perera