Repository: mahout Updated Branches: refs/heads/master d5ea1f1be -> 9b169e7e7
Fixed incorrect MLlibTFIDF IDF calculation Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/9b169e7e Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/9b169e7e Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/9b169e7e Branch: refs/heads/master Commit: 9b169e7e710f1a4883a15b652ad8ece86a88960f Parents: d5ea1f1 Author: Andrew Palumbo <[email protected]> Authored: Fri Mar 13 18:47:31 2015 -0400 Committer: Andrew Palumbo <[email protected]> Committed: Fri Mar 13 18:47:31 2015 -0400 ---------------------------------------------------------------------- .../src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala | 8 ++++---- .../scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/9b169e7e/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala b/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala index 5b78e18..c75ff20 100644 --- a/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala +++ b/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala @@ -36,7 +36,7 @@ class TFIDF extends TermWeight { * * Lucene 4.6's DefaultSimilarity TF-IDF calculation uses the formula: * - * sqrt(termFreq) * log((numDocs / (docFreq + 1)) + 1.0) + * sqrt(termFreq) * (log(numDocs / (docFreq + 1)) + 1.0) * * Note: this is consistent with the MapReduce seq2sparse implementation of TF-IDF weights * and is slightly different from Spark MLlib's TF-IDF calculation which is implemented as: @@ -52,7 +52,7 @@ class TFIDF extends TermWeight { def calculate(tf: Int, df: Int, length: Int, numDocs: Int): Double = { // Lucene 4.6 DefaultSimilarity's 
TF-IDF is implemented as: - // sqrt(tf) * (log(numDocs / (df+1)) + 1) + // sqrt(tf) * (log(numDocs / (df + 1)) + 1) math.sqrt(tf) * (math.log(numDocs / (df + 1).toDouble) + 1.0) } } @@ -69,7 +69,7 @@ class MLlibTFIDF extends TermWeight { * Note: this is not consistent with the MapReduce seq2sparse implementation of TF-IDF weights * which is implemented using Lucene DefaultSimilarity's TF-IDF calculation: * - * sqrt(termFreq) * log((numDocs / (docFreq + 1)) + 1.0) + * sqrt(termFreq) * (log(numDocs / (docFreq + 1)) + 1.0) * * @param tf term freq * @param df doc freq @@ -81,7 +81,7 @@ class MLlibTFIDF extends TermWeight { // Spark MLlib's TF-IDF weight is implemented as: // termFreq * log((numDocs + 1.0) / (docFreq + 1.0)) - tf * (math.log((numDocs + 1.0) / (df + 1).toDouble) + 1.0) + tf * math.log((numDocs + 1.0) / (df + 1).toDouble) } } http://git-wip-us.apache.org/repos/asf/mahout/blob/9b169e7e/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala b/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala index a0dec26..3ec5ec1 100644 --- a/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala +++ b/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala @@ -176,9 +176,9 @@ trait TFIDFtestBase extends DistributedMahoutSuite with Matchers { // 11 -> 2, 8 -> 1, 4 -> 1) abs(vectorizedDocuments(0, 0) - 0.0) should be < epsilon - abs(vectorizedDocuments(0, 13) - 2.609437) should be < epsilon - abs(vectorizedDocuments(1, 3) - 4.197224) should be < epsilon - abs(vectorizedDocuments(3, 3) - 6.295836) should be < epsilon + abs(vectorizedDocuments(0, 13) - 1.609437) should be < epsilon + abs(vectorizedDocuments(1, 3) - 2.197224) should be < epsilon + abs(vectorizedDocuments(3, 3) - 3.295836) should be < epsilon } } \ No newline at end of file
