Github user shubhamchopra commented on a diff in the pull request:
https://github.com/apache/spark/pull/17673#discussion_r143516595
--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
---
@@ -106,6 +106,45 @@ private[feature] trait Word2VecBase extends Params
/** @group getParam */
def getMaxSentenceLength: Int = $(maxSentenceLength)
+ /**
+ * Number of negative samples to use with CBOW based estimation.
+ * This parameter is ignored for SkipGram based estimation.
+ * Default: 15
+ * @group param
+ */
+ final val negativeSamples = new IntParam(this, "negativeSamples", "Number of negative samples " +
+ "to use with CBOW estimation", ParamValidators.gt(0))
+ setDefault(negativeSamples -> 15)
+
+ /** @group getParam */
+ def getNegativeSamples: Int = $(negativeSamples)
+
+ /**
+ * Unigram table size. The unigram table is used to generate negative samples.
+ * This parameter is ignored for SkipGram based estimation.
+ * Default: 100 million
+ * @group param
+ */
+ final val unigramTableSize = new IntParam(this, "unigramTableSize", "Max table size to " +
+ "use for generating negative samples", ParamValidators.gt(1))
+ setDefault(unigramTableSize -> 100*1000*1000)
+
+ /** @group getParam */
+ def getUnigramTableSize: Int = $(unigramTableSize)
+
+ /**
+ * Sample threshold (parameter t <a href="https://arxiv.org/pdf/1310.4546.pdf">here</a>)
+ * This parameter is ignored for SkipGram based estimation.
+ * Default: 0.0
+ * @group param
+ */
+ final val sample = new DoubleParam(this, "samplingThreshold", "Sampling threshold to reduce " +
--- End diff --
Done
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]