Repository: spark
Updated Branches:
  refs/heads/branch-2.0 71e8aaeaa -> beb4ea0b4


[SPARK-15587][ML] ML 2.0 QA: Scala APIs audit for ml.feature

## What changes were proposed in this pull request?
ML 2.0 QA: Scala APIs audit for ml.feature. The main changes are:
* Remove ```seed``` from ```QuantileDiscretizer```: the bins are produced by ```approxQuantile```, so ```seed``` had no effect (see the sketch below this list).
* Update the Scala API docs.
* Sync the Scala and Python API docs for these changes.
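
For reference, a minimal Scala sketch of the resulting API (the DataFrame ```df``` and the column names are invented for illustration, mirroring the Python doctest updated below):

```scala
import org.apache.spark.ml.feature.QuantileDiscretizer

// No setSeed anymore: the splits come from approxQuantile, so nothing is randomized.
val discretizer = new QuantileDiscretizer()
  .setInputCol("values")    // a Double column
  .setOutputCol("buckets")
  .setNumBuckets(2)
  .setRelativeError(0.01)

val bucketizer = discretizer.fit(df)   // fitting yields a Bucketizer model
val bucketed = bucketizer.transform(df)
```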

## How was this patch tested?
Existing tests.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #13410 from yanboliang/spark-15587.

(cherry picked from commit 07a98ca4ce4e715ce32b4be75010e28764da459b)
Signed-off-by: Nick Pentreath <ni...@za.ibm.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/beb4ea0b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/beb4ea0b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/beb4ea0b

Branch: refs/heads/branch-2.0
Commit: beb4ea0b46998fc3270829a517f12adf4a94bb98
Parents: 71e8aae
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Wed Jun 1 10:49:51 2016 -0700
Committer: Nick Pentreath <ni...@za.ibm.com>
Committed: Wed Jun 1 10:50:03 2016 -0700

----------------------------------------------------------------------
 .../apache/spark/ml/feature/Bucketizer.scala    |  6 ++--
 .../spark/ml/feature/CountVectorizer.scala      | 10 ++-----
 .../spark/ml/feature/QuantileDiscretizer.scala  |  7 ++---
 .../org/apache/spark/ml/feature/Word2Vec.scala  |  3 +-
 python/pyspark/ml/feature.py                    | 29 +++++++++-----------
 5 files changed, 23 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/beb4ea0b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index 10e622a..ff988cc 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -43,7 +43,7 @@ final class Bucketizer(override val uid: String)
   /**
    * Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets.
    * A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which
-   * also includes y. Splits should be strictly increasing.
+   * also includes y. Splits should be of length >= 3 and strictly increasing.
    * Values at -inf, inf must be explicitly provided to cover all Double values;
    * otherwise, values outside the splits specified will be treated as errors.
    * @group param
@@ -51,8 +51,8 @@ final class Bucketizer(override val uid: String)
   val splits: DoubleArrayParam = new DoubleArrayParam(this, "splits",
     "Split points for mapping continuous features into buckets. With n+1 splits, there are n " +
       "buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last " +
-      "bucket, which also includes y. The splits should be strictly increasing. " +
-      "Values at -inf, inf must be explicitly provided to cover all Double values; " +
+      "bucket, which also includes y. The splits should be of length >= 3 and strictly " +
+      "increasing. Values at -inf, inf must be explicitly provided to cover all Double values; " +
       "otherwise, values outside the splits specified will be treated as errors.",
     Bucketizer.checkSplits)
 

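To make the "length >= 3" wording above concrete: three splits is the smallest array that defines two buckets. A minimal sketch (column names invented for illustration):

```scala
import org.apache.spark.ml.feature.Bucketizer

// Three splits define two buckets: [-inf, 0.0) and [0.0, +inf] (the last bucket includes y).
// An array shorter than 3 cannot define two buckets and is rejected by checkSplits.
val bucketizer = new Bucketizer()
  .setInputCol("feature")
  .setOutputCol("bucketedFeature")
  .setSplits(Array(Double.NegativeInfinity, 0.0, Double.PositiveInfinity))
```
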
http://git-wip-us.apache.org/repos/asf/spark/blob/beb4ea0b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index fc4885b..272567d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -56,7 +56,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
    * If this is an integer >= 1, this specifies the number of documents the term must appear in;
    * if this is a double in [0,1), then this specifies the fraction of documents.
    *
-   * Default: 1
+   * Default: 1.0
    * @group param
    */
   val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the minimum number of" +
@@ -86,7 +86,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
    * Note that the parameter is only used in transform of [[CountVectorizerModel]] and does not
    * affect fitting.
    *
-   * Default: 1
+   * Default: 1.0
    * @group param
    */
   val minTF: DoubleParam = new DoubleParam(this, "minTF", "Filter to ignore rare words in" +
@@ -96,8 +96,6 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
     " of the document's token count). Note that the parameter is only used in transform of" +
     " CountVectorizerModel and does not affect fitting.", ParamValidators.gtEq(0.0))
 
-  setDefault(minTF -> 1)
-
   /** @group getParam */
   def getMinTF: Double = $(minTF)
 
@@ -114,7 +112,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
   /** @group getParam */
   def getBinary: Boolean = $(binary)
 
-  setDefault(binary -> false)
+  setDefault(vocabSize -> (1 << 18), minDF -> 1.0, minTF -> 1.0, binary -> false)
 }
 
 /**
@@ -145,8 +143,6 @@ class CountVectorizer(override val uid: String)
   /** @group setParam */
   def setBinary(value: Boolean): this.type = set(binary, value)
 
-  setDefault(vocabSize -> (1 << 18), minDF -> 1)
-
   @Since("2.0.0")
   override def fit(dataset: Dataset[_]): CountVectorizerModel = {
     transformSchema(dataset.schema, logging = true)

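The defaults are now consolidated into a single ```setDefault``` in the shared trait, with ```minDF``` and ```minTF``` given as ```Double``` literals to match their ```DoubleParam``` type. A sketch of reading them back (no data needed):

```scala
import org.apache.spark.ml.feature.CountVectorizer

val cv = new CountVectorizer()

cv.getVocabSize  // 262144, i.e. 1 << 18
cv.getMinDF      // 1.0 (previously registered as the Int 1)
cv.getMinTF      // 1.0
cv.getBinary     // false
```
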
http://git-wip-us.apache.org/repos/asf/spark/blob/beb4ea0b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index 6148359..1fefaa1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -22,7 +22,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.ml._
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
+import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
 import org.apache.spark.ml.util._
 import org.apache.spark.sql.Dataset
 import org.apache.spark.sql.types.{DoubleType, StructType}
@@ -31,7 +31,7 @@ import org.apache.spark.sql.types.{DoubleType, StructType}
  * Params for [[QuantileDiscretizer]].
  */
 private[feature] trait QuantileDiscretizerBase extends Params
-  with HasInputCol with HasOutputCol with HasSeed {
+  with HasInputCol with HasOutputCol {
 
   /**
   * Number of buckets (quantiles, or categories) into which data points are grouped. Must
@@ -91,9 +91,6 @@ final class QuantileDiscretizer(override val uid: String)
   /** @group setParam */
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
-  /** @group setParam */
-  def setSeed(value: Long): this.type = set(seed, value)
-
   override def transformSchema(schema: StructType): StructType = {
     SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType)
     val inputFields = schema.fields

http://git-wip-us.apache.org/repos/asf/spark/blob/beb4ea0b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 1b929cd..2d89eb0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -51,7 +51,8 @@ private[feature] trait Word2VecBase extends Params
   def getVectorSize: Int = $(vectorSize)
 
   /**
-   * The window size (context words from [-window, window]) default 5.
+   * The window size (context words from [-window, window]).
+   * Default: 5
    * @group expertParam
    */
   final val windowSize = new IntParam(

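The param itself is unchanged; the doc now just states the default explicitly. A sketch, assuming the usual ```getWindowSize``` getter defined alongside the param in ```Word2VecBase```:

```scala
import org.apache.spark.ml.feature.Word2Vec

val word2Vec = new Word2Vec()
word2Vec.getWindowSize  // 5, matching the "Default: 5" now documented above
```
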
http://git-wip-us.apache.org/repos/asf/spark/blob/beb4ea0b/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index eb555cb..1aff2e5 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -19,8 +19,6 @@ import sys
 if sys.version > '3':
     basestring = str
 
-from py4j.java_collections import JavaArray
-
 from pyspark import since, keyword_only
 from pyspark.rdd import ignore_unicode_prefix
 from pyspark.ml.linalg import _convert_to_vector
@@ -159,9 +157,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav
               "Split points for mapping continuous features into buckets. With n+1 splits, " +
               "there are n buckets. A bucket defined by splits x,y holds values in the " +
               "range [x,y) except the last bucket, which also includes y. The splits " +
-              "should be strictly increasing. Values at -inf, inf must be explicitly " +
-              "provided to cover all Double values; otherwise, values outside the splits " +
-              "specified will be treated as errors.",
+              "should be of length >= 3 and strictly increasing. Values at -inf, inf must be " +
+              "explicitly provided to cover all Double values; otherwise, values outside the " +
+              "splits specified will be treated as errors.",
               typeConverter=TypeConverters.toListFloat)
 
     @keyword_only
@@ -1171,8 +1169,7 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead
 
 
 @inherit_doc
-class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, JavaMLReadable,
-                          JavaMLWritable):
+class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
     """
     .. note:: Experimental
 
@@ -1186,9 +1183,7 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
 
     >>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
     >>> qds = QuantileDiscretizer(numBuckets=2,
-    ...     inputCol="values", outputCol="buckets", seed=123, relativeError=0.01)
-    >>> qds.getSeed()
-    123
+    ...     inputCol="values", outputCol="buckets", relativeError=0.01)
     >>> qds.getRelativeError()
     0.01
     >>> bucketizer = qds.fit(df)
@@ -1220,9 +1215,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
                           typeConverter=TypeConverters.toFloat)
 
     @keyword_only
-    def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001):
+    def __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001):
         """
-        __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001)
+        __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001)
         """
         super(QuantileDiscretizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer",
@@ -1233,11 +1228,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
 
     @keyword_only
     @since("2.0.0")
-    def setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None,
-                  relativeError=0.001):
+    def setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001):
         """
-        setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, \
-                  relativeError=0.001)
+        setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001)
         Set the params for the QuantileDiscretizer
         """
         kwargs = self.setParams._input_kwargs
@@ -1481,6 +1474,10 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J
     Standardizes features by removing the mean and scaling to unit variance using column summary
     statistics on the samples in the training set.
 
+    The "unit std" is computed using the `corrected sample standard deviation \
+    <https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation>`_,
+    which is computed as the square root of the unbiased sample variance.
+
     >>> from pyspark.ml.linalg import Vectors
     >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
     >>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled")

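For clarity on the wording added above, a small self-contained sketch of the corrected sample standard deviation (the helper name is made up):

```scala
// Corrected sample standard deviation: the square root of the unbiased
// sample variance, i.e. squared deviations divided by (n - 1) rather than n.
def correctedSampleStd(xs: Seq[Double]): Double = {
  require(xs.length > 1, "need at least two samples")
  val mean = xs.sum / xs.length
  val unbiasedVariance = xs.map(x => (x - mean) * (x - mean)).sum / (xs.length - 1)
  math.sqrt(unbiasedVariance)
}

// For the doctest data [0.0, 2.0]: mean = 1.0, variance = 2.0 / 1 = 2.0, std ~= 1.4142,
// which is why StandardScaler divides by ~1.4142 rather than by the population std 1.0.
correctedSampleStd(Seq(0.0, 2.0))
```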
