Github user MLnick commented on a diff in the pull request:
https://github.com/apache/spark/pull/19892#discussion_r161683821
--- Diff: python/pyspark/ml/feature.py ---
@@ -317,13 +317,19 @@ class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable)

 @inherit_doc
-class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasHandleInvalid,
-                 JavaMLReadable, JavaMLWritable):
-    """
-    Maps a column of continuous features to a column of feature buckets.
-
-    >>> values = [(0.1,), (0.4,), (1.2,), (1.5,), (float("nan"),), (float("nan"),)]
-    >>> df = spark.createDataFrame(values, ["values"])
+class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOutputCols,
+                 HasHandleInvalid, JavaMLReadable, JavaMLWritable):
+    """
+    Maps a column of continuous features to a column of feature buckets. Since 2.3.0,
+    :py:class:`Bucketizer` can map multiple columns at once by setting the :py:attr:`inputCols`
+    parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters
+    are set, a log warning will be printed and only :py:attr:`inputCol` will take effect, while
+    :py:attr:`inputCols` will be ignored. The :py:attr:`splits` parameter is only used for single
+    column usage, and :py:attr:`splitsArray` is for multiple columns.
+
+    >>> values = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float("nan")),
+    ...     (float("nan"), 1.0), (float("nan"), 0.0)]
+    >>> df = spark.createDataFrame(values, ["values", "numbers"])
--- End diff --
Perhaps name these columns `values1` & `values2`?
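For context, a minimal runnable sketch of the two usage modes the new docstring describes (single-column `splits` vs. multi-column `splitsArray`). The column names `values1`/`values2` follow the suggestion above, and the split points are illustrative, not taken from the patch:

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

spark = SparkSession.builder.getOrCreate()

# Illustrative data; column names follow the review suggestion above.
df = spark.createDataFrame(
    [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float("nan"))],
    ["values1", "values2"])

# Single-column mode: `splits` with inputCol/outputCol.
single = Bucketizer(
    splits=[-float("inf"), 0.5, 1.4, float("inf")],
    inputCol="values1", outputCol="buckets1",
    handleInvalid="keep")  # NaN goes into an extra bucket instead of raising
single.transform(df).show()

# Multi-column mode: `splitsArray` with inputCols/outputCols,
# one splits list per input column.
multi = Bucketizer(
    splitsArray=[[-float("inf"), 0.5, 1.4, float("inf")],
                 [-float("inf"), 0.5, float("inf")]],
    inputCols=["values1", "values2"],
    outputCols=["buckets1", "buckets2"],
    handleInvalid="keep")
multi.transform(df).show()
```

With `handleInvalid="keep"`, NaN values land in an extra bucket rather than failing the transform, which mirrors the NaN rows used in the doctest.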