Github user WeichenXu123 commented on a diff in the pull request:
https://github.com/apache/spark/pull/19753#discussion_r151311569
--- Diff: python/pyspark/ml/feature.py ---
@@ -2565,22 +2575,28 @@ class VectorIndexer(JavaEstimator, HasInputCol,
HasOutputCol, JavaMLReadable, Ja
"(>= 2). If a feature is found to have >
maxCategories values, then " +
"it is declared continuous.",
typeConverter=TypeConverters.toInt)
+ handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle
invalid data " +
+ "(unseen labels or NULL values). Options are
'skip' (filter out " +
+ "rows with invalid data), 'error' (throw an
error), or 'keep' (put " +
+ "invalid data in a special additional bucket, at
index numCategories).",
+ typeConverter=TypeConverters.toString)
+
@keyword_only
- def __init__(self, maxCategories=20, inputCol=None, outputCol=None):
+ def __init__(self, maxCategories=20, inputCol=None, outputCol=None,
handleInvalid="error"):
"""
- __init__(self, maxCategories=20, inputCol=None, outputCol=None)
+ __init__(self, maxCategories=20, inputCol=None, outputCol=None,
handleInvalid="error")
"""
super(VectorIndexer, self).__init__()
self._java_obj =
self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid)
- self._setDefault(maxCategories=20)
+ self._setDefault(maxCategories=20, handleInvalid="error")
kwargs = self._input_kwargs
self.setParams(**kwargs)
@keyword_only
@since("1.4.0")
- def setParams(self, maxCategories=20, inputCol=None, outputCol=None):
+ def setParams(self, maxCategories=20, inputCol=None, outputCol=None,
handleInvalid="error"):
--- End diff --
Ah, but unfortunately, I think you're wrong. `inputCol=None` means that
if the user does not specify the inputCol, there is no default value, and an
exception will be thrown.
Duplicating default params is an issue, but it already exists in all the
pyspark.ml estimators/models.
E.g., you can check `StringIndexer` in pyspark; it also has a `handleInvalid`
param.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]