[GitHub] spark pull request #23100: [SPARK-26133][ML] Remove deprecated OneHotEncoder...

viirya Thu, 22 Nov 2018 07:20:26 -0800

Github user viirya commented on a diff in the pull request:

    https://github.com/apache/spark/pull/23100#discussion_r235760329
  
    --- Diff: 
mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala ---
    @@ -17,126 +17,512 @@
     
     package org.apache.spark.ml.feature
     
    +import org.apache.hadoop.fs.Path
    +
    +import org.apache.spark.SparkException
     import org.apache.spark.annotation.Since
    -import org.apache.spark.ml.Transformer
    +import org.apache.spark.ml.{Estimator, Model}
     import org.apache.spark.ml.attribute._
     import org.apache.spark.ml.linalg.Vectors
     import org.apache.spark.ml.param._
    -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
    +import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCols, 
HasOutputCols}
     import org.apache.spark.ml.util._
     import org.apache.spark.sql.{DataFrame, Dataset}
    -import org.apache.spark.sql.functions.{col, udf}
    -import org.apache.spark.sql.types.{DoubleType, NumericType, StructType}
    +import org.apache.spark.sql.expressions.UserDefinedFunction
    +import org.apache.spark.sql.functions.{col, lit, udf}
    +import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
    +
    +/** Private trait for params and common methods for OneHotEncoder and 
OneHotEncoderModel */
    +private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid
    +    with HasInputCols with HasOutputCols {
    +
    +  /**
    +   * Param for how to handle invalid data during transform().
    +   * Options are 'keep' (invalid data presented as an extra categorical 
feature) or
    +   * 'error' (throw an error).
    +   * Note that this Param is only used during transform; during fitting, 
invalid data
    +   * will result in an error.
    +   * Default: "error"
    +   * @group param
    +   */
    +  @Since("2.3.0")
    +  override val handleInvalid: Param[String] = new Param[String](this, 
"handleInvalid",
    +    "How to handle invalid data during transform(). " +
    +    "Options are 'keep' (invalid data presented as an extra categorical 
feature) " +
    +    "or error (throw an error). Note that this Param is only used during 
transform; " +
    +    "during fitting, invalid data will result in an error.",
    +    ParamValidators.inArray(OneHotEncoder.supportedHandleInvalids))
    +
    +  setDefault(handleInvalid, OneHotEncoder.ERROR_INVALID)
    +
    +  /**
    +   * Whether to drop the last category in the encoded vector (default: 
true)
    +   * @group param
    +   */
    +  @Since("2.3.0")
    +  final val dropLast: BooleanParam =
    +    new BooleanParam(this, "dropLast", "whether to drop the last category")
    +  setDefault(dropLast -> true)
    +
    +  /** @group getParam */
    +  @Since("2.3.0")
    +  def getDropLast: Boolean = $(dropLast)
    +
    +  protected def validateAndTransformSchema(
    +      schema: StructType,
    +      dropLast: Boolean,
    +      keepInvalid: Boolean): StructType = {
    +    val inputColNames = $(inputCols)
    +    val outputColNames = $(outputCols)
    +
    +    require(inputColNames.length == outputColNames.length,
    +      s"The number of input columns ${inputColNames.length} must be the 
same as the number of " +
    +        s"output columns ${outputColNames.length}.")
    +
    +    // Input columns must be NumericType.
    +    inputColNames.foreach(SchemaUtils.checkNumericType(schema, _))
    +
    +    // Prepares output columns with proper attributes by examining input 
columns.
    +    val inputFields = $(inputCols).map(schema(_))
    +
    +    val outputFields = inputFields.zip(outputColNames).map { case 
(inputField, outputColName) =>
    +      OneHotEncoderCommon.transformOutputColumnSchema(
    +        inputField, outputColName, dropLast, keepInvalid)
    +    }
    +    outputFields.foldLeft(schema) { case (newSchema, outputField) =>
    +      SchemaUtils.appendColumn(newSchema, outputField)
    +    }
    +  }
    +}
     
     /**
      * A one-hot encoder that maps a column of category indices to a column of 
binary vectors, with
      * at most a single one-value per row that indicates the input category 
index.
      * For example with 5 categories, an input value of 2.0 would map to an 
output vector of
      * `[0.0, 0.0, 1.0, 0.0]`.
    - * The last category is not included by default (configurable via 
`OneHotEncoder!.dropLast`
    + * The last category is not included by default (configurable via 
`dropLast`),
      * because it makes the vector entries sum up to one, and hence linearly 
dependent.
      * So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.
      *
      * @note This is different from scikit-learn's OneHotEncoder, which keeps 
all categories.
      * The output vectors are sparse.
      *
    + * When `handleInvalid` is configured to 'keep', an extra "category" 
indicating invalid values is
    + * added as last category. So when `dropLast` is true, invalid values are 
encoded as all-zeros
    + * vector.
    + *
    + * @note When encoding multi-column by using `inputCols` and `outputCols` 
params, input/output cols
    + * come in pairs, specified by the order in the arrays, and each pair is 
treated independently.
    + *
      * @see `StringIndexer` for converting categorical values into category 
indices
    - * @deprecated `OneHotEncoderEstimator` will be renamed `OneHotEncoder` 
and this `OneHotEncoder`
    - * will be removed in 3.0.0.
      */
    -@Since("1.4.0")
    -@deprecated("`OneHotEncoderEstimator` will be renamed `OneHotEncoder` and 
this `OneHotEncoder`" +
    -  " will be removed in 3.0.0.", "2.3.0")
    -class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: 
String) extends Transformer
    -  with HasInputCol with HasOutputCol with DefaultParamsWritable {
    +@Since("2.3.0")
    +class OneHotEncoder @Since("2.3.0") (@Since("2.3.0") override val uid: 
String)
    --- End diff --
    
    In this renaming case, I'm not sure whether we should use the `Since` from the old 
`OneHotEncoder` (`1.4.0`) or the one from `OneHotEncoderEstimator` (`2.3.0`). For now 
I'm using `OneHotEncoderEstimator`'s.



---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #23100: [SPARK-26133][ML] Remove deprecated OneHotEncoder...

Reply via email to