Github user MLnick commented on a diff in the pull request:
https://github.com/apache/spark/pull/19892#discussion_r161719111
--- Diff: python/pyspark/ml/feature.py ---
@@ -317,26 +317,34 @@ class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable)
 @inherit_doc
-class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasHandleInvalid,
-                 JavaMLReadable, JavaMLWritable):
-    """
-    Maps a column of continuous features to a column of feature buckets.
-
-    >>> values = [(0.1,), (0.4,), (1.2,), (1.5,), (float("nan"),), (float("nan"),)]
-    >>> df = spark.createDataFrame(values, ["values"])
+class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOutputCols,
+                 HasHandleInvalid, JavaMLReadable, JavaMLWritable):
+    """
+    Maps a column of continuous features to a column of feature buckets. Since 2.3.0,
+    :py:class:`Bucketizer` can map multiple columns at once by setting the :py:attr:`inputCols`
+    parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters
+    are set, a log warning will be printed and only :py:attr:`inputCol` will take effect, while
+    :py:attr:`inputCols` will be ignored. The :py:attr:`splits` parameter is only used for single
+    column usage, and :py:attr:`splitsArray` is for multiple columns.
+
+    >>> values = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float("nan")),
+    ...           (float("nan"), 1.0), (float("nan"), 0.0)]
+    >>> df = spark.createDataFrame(values, ["values1", "values2"])
     >>> bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
-    ...                         inputCol="values", outputCol="buckets")
-    >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df).collect()
-    >>> len(bucketed)
-    6
-    >>> bucketed[0].buckets
-    0.0
-    >>> bucketed[1].buckets
-    0.0
-    >>> bucketed[2].buckets
-    1.0
-    >>> bucketed[3].buckets
-    2.0
+    ...                         inputCol="values1", outputCol="buckets")
+    >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df)
--- End diff ---
It may actually be neater to show only `values1` and `bucketed` - so perhaps `.transform(df.select('values1'))`?
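
For illustration, a minimal sketch of what that could look like end to end, assuming the same data and splits as in the doctest above (the expected buckets in the comments follow Bucketizer's documented behavior of keeping NaN in an extra bucket when `handleInvalid="keep"`):

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

spark = SparkSession.builder.getOrCreate()

values = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float("nan")),
          (float("nan"), 1.0), (float("nan"), 0.0)]
df = spark.createDataFrame(values, ["values1", "values2"])

bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
                        inputCol="values1", outputCol="buckets")

# Selecting just `values1` before transforming keeps the doctest output down
# to the two columns of interest: `values1` and `buckets`.
bucketed = bucketizer.setHandleInvalid("keep").transform(df.select("values1"))
bucketed.show()
# Expected buckets for values1: 0.1 and 0.4 -> 0.0; 1.2 -> 1.0; 1.5 -> 2.0;
# the NaN rows are kept in the extra bucket 3.0 (handleInvalid="keep").
```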
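
And for completeness, the multi-column path that the new docstring describes might look roughly like this, continuing from the snippet above. The `splitsArray`/`inputCols`/`outputCols` parameter names are taken from the docstring in the diff; whether the Python constructor ends up accepting them exactly like this depends on the final merged API:

```python
# One splits list per input column; the lengths of splitsArray, inputCols
# and outputCols must line up. `df` is the two-column DataFrame from above.
multi = Bucketizer(splitsArray=[[-float("inf"), 0.5, 1.4, float("inf")],
                                [-float("inf"), 0.5, float("inf")]],
                   inputCols=["values1", "values2"],
                   outputCols=["buckets1", "buckets2"])
multi.setHandleInvalid("keep").transform(df).show()
```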