Repository: spark
Updated Branches:
  refs/heads/master 6cf6fdf3f -> 343db392b


Added setMinCount to Word2Vec.scala

Wanted to customize the private minCount variable in the Word2Vec class. Added
a method to do so.

Author: ganonp <[email protected]>

Closes #3693 from ganonp/my-custom-spark and squashes the following commits:

ad534f2 [ganonp] made norm method public
5110a6f [ganonp] Reorganized
854958b [ganonp] Fixed Indentation for setMinCount
12ed8f9 [ganonp] Update Word2Vec.scala
76bdf5a [ganonp] Update Word2Vec.scala
ffb88bb [ganonp] Update Word2Vec.scala
5eb9100 [ganonp] Added setMinCount to Word2Vec.scala


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/343db392
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/343db392
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/343db392

Branch: refs/heads/master
Commit: 343db392b58fb33a3e4bc6fda1da69aaf686b5a9
Parents: 6cf6fdf
Author: ganonp <[email protected]>
Authored: Mon Dec 29 15:31:19 2014 -0800
Committer: Xiangrui Meng <[email protected]>
Committed: Mon Dec 29 15:31:19 2014 -0800

----------------------------------------------------------------------
 .../org/apache/spark/mllib/feature/Word2Vec.scala    | 15 +++++++++++----
 .../org/apache/spark/mllib/linalg/Vectors.scala      |  2 +-
 2 files changed, 12 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/343db392/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 7960f3c..d25a7cd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -71,7 +71,8 @@ class Word2Vec extends Serializable with Logging {
   private var numPartitions = 1
   private var numIterations = 1
   private var seed = Utils.random.nextLong()
-
+  private var minCount = 5
+  
   /**
    * Sets vector size (default: 100).
    */
@@ -114,6 +115,15 @@ class Word2Vec extends Serializable with Logging {
     this
   }
 
+  /** 
+   * Sets minCount, the minimum number of times a token must appear to be 
included in the word2vec 
+   * model's vocabulary (default: 5).
+   */
+  def setMinCount(minCount: Int): this.type = {
+    this.minCount = minCount
+    this
+  }
+  
   private val EXP_TABLE_SIZE = 1000
   private val MAX_EXP = 6
   private val MAX_CODE_LENGTH = 40
@@ -122,9 +132,6 @@ class Word2Vec extends Serializable with Logging {
   /** context words from [-window, window] */
   private val window = 5
 
-  /** minimum frequency to consider a vocabulary word */
-  private val minCount = 5
-
   private var trainWordsCount = 0
   private var vocabSize = 0
   private var vocab: Array[VocabWord] = null

http://git-wip-us.apache.org/repos/asf/spark/blob/343db392/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 47d1a76..01f3f90 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -268,7 +268,7 @@ object Vectors {
    * @param p norm.
    * @return norm in L^p^ space.
    */
-  private[spark] def norm(vector: Vector, p: Double): Double = {
+  def norm(vector: Vector, p: Double): Double = {
     require(p >= 1.0)
     val values = vector match {
       case dv: DenseVector => dv.values


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to