mahout git commit: NOJIRA: simple classes for Lucene DefaultSimilarity and MLlib TF-IDF calculations closes apache/mahout#75

apalumbo Mon, 09 Mar 2015 17:17:07 -0700

Repository: mahout
Updated Branches:
  refs/heads/master b0ee8e265 -> d5ea1f1be



NOJIRA: simple classes for Lucene DefaultSimilarity and MLlib TF-IDF 
calculations closes apache/mahout#75


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/d5ea1f1b
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/d5ea1f1b
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/d5ea1f1b

Branch: refs/heads/master
Commit: d5ea1f1be4d8ee6e3ceb7863322921c331a6c0a2
Parents: b0ee8e2
Author: Andrew Palumbo <[email protected]>
Authored: Mon Mar 9 20:15:00 2015 -0400
Committer: Andrew Palumbo <[email protected]>
Committed: Mon Mar 9 20:15:00 2015 -0400

----------------------------------------------------------------------
 .../org/apache/mahout/nlp/tfidf/TFIDF.scala     | 112 +++++++++++
 .../apache/mahout/nlp/tfidf/TFIDFtestBase.scala | 184 +++++++++++++++++++
 .../mahout/nlp/tfidf/TFIDFSparkTestSuite.scala  |  25 +++
 3 files changed, 321 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/d5ea1f1b/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala 
b/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
new file mode 100644
index 0000000..5b78e18
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.nlp.tfidf
+
+trait TermWeight {
+
+  /**
+   * @param tf term freq
+   * @param df doc freq
+   * @param length Length of the document
+   * @param numDocs the total number of docs
+   */
+  def calculate(tf: Int, df: Int, length: Int, numDocs: Int): Double
+}
+
+
+class TFIDF extends TermWeight {
+
+  /**
+   * Calculate TF-IDF weight.
+   *
+   * Lucene 4.6's DefaultSimilarity TF-IDF calculation uses the formula:
+   *
+   *   sqrt(termFreq) * log((numDocs / (docFreq + 1)) + 1.0)
+   *
+   * Note: this is consistent with the MapReduce seq2sparse implementation of 
TF-IDF weights
+   * and is slightly different from Spark MLlib's TD-IDF calculation which is 
implemented as:
+   *
+   *   termFreq * log((numDocs + 1.0) / (docFreq + 1.0))
+   *
+   * @param tf term freq
+   * @param df doc freq
+   * @param length Length of the document - UNUSED
+   * @param numDocs the total number of docs
+   * @return The TF-IDF weight as calculated by Lucene 4.6's DefaultSimilarity
+   */
+  def calculate(tf: Int, df: Int, length: Int, numDocs: Int): Double = {
+
+    // Lucene 4.6 DefaultSimilarity's TF-IDF is implemented as:
+    // sqrt(tf) * (log(numDocs / (df+1)) + 1)
+    math.sqrt(tf) * (math.log(numDocs / (df + 1).toDouble) + 1.0)
+  }
+}
+
+class MLlibTFIDF extends TermWeight {
+
+  /**
+   * Calculate TF-IDF weight with IDF formula used by Spark MLlib's IDF:
+   *
+   *   termFreq * log((numDocs + 1.0) / (docFreq + 1.0))
+   *
+   * Use this weight if working with MLLib vectorized documents.
+   *
+   * Note: this is not consistent with the MapReduce seq2sparse implementation 
of TF-IDF weights
+   * which is implemented using Lucene DefaultSimilarity's TF-IDF calculation:
+   *
+   *   sqrt(termFreq) * log((numDocs / (docFreq + 1)) + 1.0)
+   *
+   * @param tf term freq
+   * @param df doc freq
+   * @param length Length of the document - UNUSED
+   * @param numDocs the total number of docs
+   * @return The TF-IDF weight as calculated by Spark MLlib's IDF
+   */
+  def calculate(tf: Int, df: Int, length: Int, numDocs: Int): Double = {
+
+    // Spark MLLib's TF-IDF weight is implemented as:
+    // termFreq * log((numDocs + 1.0) / (docFreq + 1.0))
+    tf * (math.log((numDocs + 1.0) / (df + 1).toDouble) + 1.0)
+  }
+}
+
+class TF extends TermWeight {
+
+  /**
+   * For TF Weight simply return the absolute TF.
+   *
+   * Note: We do not use Lucene 4.6's DefaultSimilarity's TF calculation here
+   * which returns:
+   *
+   *   sqrt(tf)
+   *
+   * this is consistent with the MapReduce seq2sparse implementation of TF 
weights.
+   *
+   * @param tf term freq
+   * @param df doc freq - UNUSED
+   * @param length Length of the document - UNUSED
+   * @param numDocs the total number of docs - UNUSED
+   * based on term frequency only - UNUSED
+   * @return The weight = tf param
+   */
+  def calculate(tf: Int, df: Int = -Int.MaxValue, length: Int = -Int.MaxValue, 
numDocs: Int = -Int.MaxValue): Double = {
+    tf
+  }
+}
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/d5ea1f1b/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
----------------------------------------------------------------------
diff --git 
a/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala 
b/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
new file mode 100644
index 0000000..a0dec26
--- /dev/null
+++ b/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.nlp.tfidf
+
+import org.apache.mahout.math._
+import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.test.DistributedMahoutSuite
+import org.scalatest.{FunSuite, Matchers}
+import scala.collection._
+import RLikeOps._
+import scala.math._
+
+
+trait TFIDFtestBase extends DistributedMahoutSuite with Matchers {
+  this: FunSuite =>
+
+  val epsilon = 1E-6
+
+  val documents: List[(Int, String)] = List(
+    (1, "the first document contains 5 terms"),
+    (2, "document two document contains 4 terms"),
+    (3, "document three three terms"),
+    (4, "each document including this document contain the term document"))
+
+  def createDictionaryAndDfMaps(documents: List[(Int, String)]): (Map[String, 
Int], Map[Int, Int]) = {
+
+    // get a tf count for the entire dictionary
+    val dictMap = documents.unzip._2.mkString(" ").toLowerCase.split(" 
").groupBy(identity).mapValues(_.length)
+
+    // create a dictionary with an index for each term
+    val dictIndex = dictMap.zipWithIndex.map(x => x._1._1 -> x._2).toMap
+
+    val docFrequencyCount = new Array[Int](dictMap.size)
+
+    for (token <- dictMap) {
+      for (doc <- documents) {
+        // parse the string and get a word then increment the df count for 
that word
+        if (doc._2.toLowerCase.split(" ").contains(token._1)) {
+          docFrequencyCount(dictIndex(token._1)) += 1
+        }
+      }
+    }
+
+    val docFrequencyMap = docFrequencyCount.zipWithIndex.map(x => x._2 -> 
x._1).toMap
+
+    (dictIndex, docFrequencyMap)
+  }
+
+  def vectorizeDocument(document: String,
+                        dictionaryMap: Map[String, Int],
+                        dfMap: Map[Int, Int], weight: TermWeight = new TFIDF): 
Vector = {
+
+    val wordCounts = document.toLowerCase.split(" 
").groupBy(identity).mapValues(_.length)
+
+    val vec = new RandomAccessSparseVector(dictionaryMap.size)
+
+    val totalDFSize = dictionaryMap.size
+    val docSize = wordCounts.size
+
+    for (word <- wordCounts) {
+      val term = word._1
+      if (dictionaryMap.contains(term)) {
+        val termFreq = word._2
+        val dictIndex = dictionaryMap(term)
+        val docFreq = dfMap(dictIndex)
+        val currentWeight = weight.calculate(termFreq, docFreq.toInt, docSize, 
totalDFSize.toInt)
+        vec(dictIndex)= currentWeight
+      }
+    }
+    vec
+  }
+
+  test("TF test") {
+
+    val (dictionary, dfMap) = createDictionaryAndDfMaps(documents)
+
+    val tf: TermWeight = new TF()
+
+    val vectorizedDocuments: Matrix = new SparseMatrix(documents.size, 
dictionary.size)
+
+    for (doc <- documents) {
+      vectorizedDocuments(doc._1 - 1, ::) := vectorizeDocument(doc._2, 
dictionary, dfMap, tf)
+    }
+
+    // corpus:
+    //  (1, "the first document contains 5 terms"),
+    //  (2, "document two document contains 4 terms"),
+    //  (3, "document three three terms"),
+    //  (4, "each document including this document contain the term document")
+
+    // dictonary:
+    //  (this -> 0, 4 -> 1, three -> 2, document -> 3, two -> 4, term -> 5, 5 
-> 6, contain -> 7,
+    //   each -> 8, first -> 9, terms -> 10, contains -> 11, including -> 12, 
the -> 13)
+
+    // dfMap:
+    //  (0 -> 1, 5 -> 1, 10 -> 3, 1 -> 1, 6 -> 1, 9 -> 1, 13 -> 2, 2 -> 1, 12 
-> 1, 7 -> 1, 3 -> 4,
+    //   11 -> 2, 8 -> 1, 4 -> 1)
+
+    vectorizedDocuments(0, 0).toInt should be (0)
+    vectorizedDocuments(0, 13).toInt should be (1)
+    vectorizedDocuments(1, 3).toInt should be (2)
+    vectorizedDocuments(3, 3).toInt should be (3)
+
+  }
+
+
+  test("TFIDF test") {
+    val (dictionary, dfMap) = createDictionaryAndDfMaps(documents)
+
+    val tfidf: TermWeight = new TFIDF()
+
+    val vectorizedDocuments: Matrix = new SparseMatrix(documents.size, 
dictionary.size)
+
+    for (doc <- documents) {
+      vectorizedDocuments(doc._1 - 1, ::) := vectorizeDocument(doc._2, 
dictionary, dfMap, tfidf)
+    }
+
+    // corpus:
+    //  (1, "the first document contains 5 terms"),
+    //  (2, "document two document contains 4 terms"),
+    //  (3, "document three three terms"),
+    //  (4, "each document including this document contain the term document")
+
+    // dictonary:
+    //  (this -> 0, 4 -> 1, three -> 2, document -> 3, two -> 4, term -> 5, 5 
-> 6, contain -> 7,
+    //   each -> 8, first -> 9, terms -> 10, contains -> 11, including -> 12, 
the -> 13)
+
+    // dfMap:
+    //  (0 -> 1, 5 -> 1, 10 -> 3, 1 -> 1, 6 -> 1, 9 -> 1, 13 -> 2, 2 -> 1, 12 
-> 1, 7 -> 1, 3 -> 4,
+    //   11 -> 2, 8 -> 1, 4 -> 1)
+
+    abs(vectorizedDocuments(0, 0) -  0.0) should be < epsilon
+    abs(vectorizedDocuments(0, 13) - 2.540445) should be < epsilon
+    abs(vectorizedDocuments(1, 3) - 2.870315) should be < epsilon
+    abs(vectorizedDocuments(3, 3) - 3.515403) should be < epsilon
+  }
+
+  test("MLlib TFIDF test") {
+    val (dictionary, dfMap) = createDictionaryAndDfMaps(documents)
+
+    val tfidf: TermWeight = new MLlibTFIDF()
+
+    val vectorizedDocuments: Matrix = new SparseMatrix(documents.size, 
dictionary.size)
+
+    for (doc <- documents) {
+      vectorizedDocuments(doc._1 - 1, ::) := vectorizeDocument(doc._2, 
dictionary, dfMap, tfidf)
+    }
+
+    // corpus:
+    //  (1, "the first document contains 5 terms"),
+    //  (2, "document two document contains 4 terms"),
+    //  (3, "document three three terms"),
+    //  (4, "each document including this document contain the term document")
+
+    // dictonary:
+    //  (this -> 0, 4 -> 1, three -> 2, document -> 3, two -> 4, term -> 5, 5 
-> 6, contain -> 7,
+    //   each -> 8, first -> 9, terms -> 10, contains -> 11, including -> 12, 
the -> 13)
+
+    // dfMap:
+    //  (0 -> 1, 5 -> 1, 10 -> 3, 1 -> 1, 6 -> 1, 9 -> 1, 13 -> 2, 2 -> 1, 12 
-> 1, 7 -> 1, 3 -> 4,
+    //   11 -> 2, 8 -> 1, 4 -> 1)
+
+    abs(vectorizedDocuments(0, 0) -  0.0) should be < epsilon
+    abs(vectorizedDocuments(0, 13) - 2.609437) should be < epsilon
+    abs(vectorizedDocuments(1, 3) - 4.197224) should be < epsilon
+    abs(vectorizedDocuments(3, 3) - 6.295836) should be < epsilon
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/d5ea1f1b/spark/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFSparkTestSuite.scala
----------------------------------------------------------------------
diff --git 
a/spark/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFSparkTestSuite.scala 
b/spark/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFSparkTestSuite.scala
new file mode 100644
index 0000000..bd615da
--- /dev/null
+++ b/spark/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFSparkTestSuite.scala
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.nlp.tfidf
+
+
+import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
+import org.apache.mahout.test.MahoutSuite
+import org.scalatest.FunSuite
+
+class TFIDFSparkTestSuite extends FunSuite with MahoutSuite with 
DistributedSparkSuite with TFIDFtestBase
+

mahout git commit: NOJIRA: simple classes for Lucene DefaultSimilarity and MLlib TF-IDF calculations closes apache/mahout#75

Reply via email to