Github user jkbradley commented on a diff in the pull request:
https://github.com/apache/spark/pull/20209#discussion_r161110909
--- Diff: python/pyspark/ml/feature.py ---
@@ -1641,6 +1642,118 @@ def getDropLast(self):
return self.getOrDefault(self.dropLast)
+@inherit_doc
+class OneHotEncoderEstimator(JavaEstimator, HasInputCols, HasOutputCols, HasHandleInvalid,
+                             JavaMLReadable, JavaMLWritable):
+ """
+ A one-hot encoder that maps a column of category indices to a column
of binary vectors, with
+ at most a single one-value per row that indicates the input category
index.
+ For example with 5 categories, an input value of 2.0 would map to an
output vector of
+ `[0.0, 0.0, 1.0, 0.0]`.
+ The last category is not included by default (configurable via
`dropLast`),
+ because it makes the vector entries sum up to one, and hence linearly
dependent.
+ So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.
+
+ Note: This is different from scikit-learn's OneHotEncoder, which keeps
all categories.
+ The output vectors are sparse.
+
+ When `handleInvalid` is configured to 'keep', an extra "category"
indicating invalid values is
+ added as last category. So when `dropLast` is true, invalid values are
encoded as all-zeros
+ vector.
+
+ Note: When encoding multi-column by using `inputCols` and `outputCols`
params, input/output
+ cols come in pairs, specified by the order in the arrays, and each
pair is treated
+ independently.
+
+ See `StringIndexer` for converting categorical values into category
indices
+
+    >>> from pyspark.ml.linalg import Vectors
+    >>> df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["input"])
+    >>> ohe = OneHotEncoderEstimator(inputCols=["input"], outputCols=["output"])
+    >>> model = ohe.fit(df)
+    >>> model.transform(df).head().output
+    SparseVector(2, {0: 1.0})
+    >>> ohePath = temp_path + "/oheEstimator"
+    >>> ohe.save(ohePath)
+    >>> loadedOHE = OneHotEncoderEstimator.load(ohePath)
+    >>> loadedOHE.getInputCols() == ohe.getInputCols()
+    True
+    >>> modelPath = temp_path + "/ohe-model"
+    >>> model.save(modelPath)
+    >>> loadedModel = OneHotEncoderModel.load(modelPath)
+    >>> loadedModel.categorySizes == model.categorySizes
+    True
+
+    .. versionadded:: 2.3.0
+    """
+
+    handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle invalid data during " +
+                          "transform(). Options are 'keep' (invalid data presented as an extra " +
+                          "categorical feature) or error (throw an error). Note that this Param " +
+                          "is only used during transform; during fitting, invalid data will " +
+                          "result in an error.",
+                          typeConverter=TypeConverters.toString)
+
+    dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category",
+                     typeConverter=TypeConverters.toBoolean)
+
+    @keyword_only
+    def __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True):
+        """
+        __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True)
+        """
+        super(OneHotEncoderEstimator, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.feature.OneHotEncoderEstimator", self.uid)
+        self._setDefault(handleInvalid="error", dropLast=True)
+        kwargs = self._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    @since("1.4.0")
--- End diff --
This should be `2.3.0`, not `1.4.0` -- `OneHotEncoderEstimator` is new in 2.3.0.
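
For reference, here is a minimal sketch of the multi-column usage described in the new
docstring, assuming a Spark 2.3.x session where `OneHotEncoderEstimator` is available
(later Spark releases rename it to `OneHotEncoder`); the DataFrame contents and column
names below are invented for illustration, not taken from the patch:

    from pyspark.sql import SparkSession
    from pyspark.ml.feature import OneHotEncoderEstimator

    spark = SparkSession.builder.master("local[1]").appName("ohe-sketch").getOrCreate()

    # Two independent category-index columns (values are made up for illustration).
    df = spark.createDataFrame(
        [(0.0, 1.0), (1.0, 0.0), (2.0, 1.0)],
        ["categoryIndex1", "categoryIndex2"])

    # Input/output columns are paired by position and each pair is encoded independently.
    encoder = OneHotEncoderEstimator(
        inputCols=["categoryIndex1", "categoryIndex2"],
        outputCols=["categoryVec1", "categoryVec2"],
        handleInvalid="keep",  # indices unseen at fit time map to the extra "invalid" category
        dropLast=False)        # keep every category so each output vector has an explicit 1.0

    model = encoder.fit(df)
    model.transform(df).show(truncate=False)

    spark.stop()

With `dropLast=True` (the default) the last category, and hence the "invalid" category when
`handleInvalid="keep"`, would come out as an all-zeros vector instead, as the docstring notes.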