Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/23100#discussion_r236471032 --- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala --- @@ -17,126 +17,512 @@ package org.apache.spark.ml.feature +import org.apache.hadoop.fs.Path + +import org.apache.spark.SparkException import org.apache.spark.annotation.Since -import org.apache.spark.ml.Transformer +import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute._ import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCols, HasOutputCols} import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.spark.sql.functions.{col, udf} -import org.apache.spark.sql.types.{DoubleType, NumericType, StructType} +import org.apache.spark.sql.expressions.UserDefinedFunction +import org.apache.spark.sql.functions.{col, lit, udf} +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} + +/** Private trait for params and common methods for OneHotEncoder and OneHotEncoderModel */ +private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid + with HasInputCols with HasOutputCols { + + /** + * Param for how to handle invalid data during transform(). + * Options are 'keep' (invalid data presented as an extra categorical feature) or + * 'error' (throw an error). + * Note that this Param is only used during transform; during fitting, invalid data + * will result in an error. + * Default: "error" + * @group param + */ + @Since("2.3.0") --- End diff -- Yea, looks like we should make it `3.0`.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org