Github user hhbyyh commented on a diff in the pull request:
https://github.com/apache/spark/pull/17673#discussion_r148967559
--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
---
@@ -105,6 +106,56 @@ private[feature] trait Word2VecBase extends Params
/** @group getParam */
def getMaxSentenceLength: Int = $(maxSentenceLength)
+ /**
+ * Number of negative samples to use with CBOW based estimation.
+ * This parameter is ignored for SkipGram-Hierarchical Softmax based
estimation.
+ * Default: 15
+ * @group param
+ */
+ final val numNegativeSamples = new IntParam(this, "numNegativeSamples",
"Number of negative" +
+ " samples to use with CBOW estimation", ParamValidators.gt(0))
+ setDefault(numNegativeSamples -> 15)
+
+ /** @group getParam */
+ def getNumNegativeSamples: Int = $(numNegativeSamples)
+
+ /**
+ * Unigram table size. The unigram table is used to generate negative
samples.
+ * This parameter is ignored for SkipGram based estimation.
+ * Default: 100 million
+ * @group param
+ */
+ final val unigramTableSize = new IntParam(this, "unigramTableSize", "Max
table size to " +
+ "use for generating negative samples", ParamValidators.gt(1))
+ setDefault(unigramTableSize -> 100*1000*1000)
+
+ /** @group getParam */
+ def getUnigramTableSize: Int = $(unigramTableSize)
+
+ /**
+ * Sampling threshold (parameter t described <a
href="https://arxiv.org/pdf/1310.4546.pdf">here</a>).
+ * This parameter is ignored for SkipGram based estimation.
+ * Default: 0.0
+ * @group param
+ */
+ final val samplingThreshold = new DoubleParam(this, "samplingThreshold",
"Sampling threshold" +
+ " to reduce words with high frequencies", ParamValidators.gtEq(0.0))
+ setDefault(samplingThreshold -> 0.0)
+
+ /** @group getParam */
+ def getSamplingThreshold: Double = $(samplingThreshold)
+
+ /**
+ * Solver used for Word2Vec.
+ * Supported options are "sg-hs" and "cbow-ns"
--- End diff --
Please add more details describing each supported option, e.g.:
sg-hs: Skip-gram with hierarchical softmax
cbow-ns: CBOW (continuous bag-of-words) with negative sampling
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]