[GitHub] spark pull request #19527: [SPARK-13030][ML] Create OneHotEncoderEstimator f...

viirya Sun, 24 Dec 2017 18:49:11 -0800

Github user viirya commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19527#discussion_r158615273
  
    --- Diff: 
mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala 
---
    @@ -0,0 +1,479 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.ml.feature
    +
    +import org.apache.hadoop.fs.Path
    +
    +import org.apache.spark.SparkException
    +import org.apache.spark.annotation.Since
    +import org.apache.spark.ml.{Estimator, Model}
    +import org.apache.spark.ml.attribute._
    +import org.apache.spark.ml.linalg.Vectors
    +import org.apache.spark.ml.param._
    +import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCols, 
HasOutputCols}
    +import org.apache.spark.ml.util._
    +import org.apache.spark.sql.{DataFrame, Dataset}
    +import org.apache.spark.sql.expressions.UserDefinedFunction
    +import org.apache.spark.sql.functions.{col, lit, udf}
    +import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, 
StructType}
    +
    +/** Private trait for params and common methods for OneHotEncoderEstimator 
and OneHotEncoderModel */
    +private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid
    +    with HasInputCols with HasOutputCols {
    +
    +  /**
    +   * Param for how to handle invalid data.
    +   * Options are 'keep' (invalid data presented as an extra categorical 
feature) or
    +   * 'error' (throw an error).
    +   * Default: "error"
    +   * @group param
    +   */
    +  @Since("2.3.0")
    +  override val handleInvalid: Param[String] = new Param[String](this, 
"handleInvalid",
    +    "How to handle invalid data " +
    +    "Options are 'keep' (invalid data presented as an extra categorical 
feature) " +
    +    "or error (throw an error).",
    +    
ParamValidators.inArray(OneHotEncoderEstimator.supportedHandleInvalids))
    +
    +  setDefault(handleInvalid, OneHotEncoderEstimator.ERROR_INVALID)
    +
    +  /**
    +   * Whether to drop the last category in the encoded vector (default: 
true)
    +   * @group param
    +   */
    +  @Since("2.3.0")
    +  final val dropLast: BooleanParam =
    +    new BooleanParam(this, "dropLast", "whether to drop the last category")
    +  setDefault(dropLast -> true)
    +
    +  /** @group getParam */
    +  @Since("2.3.0")
    +  def getDropLast: Boolean = $(dropLast)
    +
    +  protected def validateAndTransformSchema(schema: StructType): StructType 
= {
    +    val inputColNames = $(inputCols)
    +    val outputColNames = $(outputCols)
    +    val existingFields = schema.fields
    +
    +    require(inputColNames.length == outputColNames.length,
    +      s"The number of input columns ${inputColNames.length} must be the 
same as the number of " +
    +        s"output columns ${outputColNames.length}.")
    +
    +    inputColNames.zip(outputColNames).map { case (inputColName, 
outputColName) =>
    +      require(schema(inputColName).dataType.isInstanceOf[NumericType],
    +        s"Input column must be of type NumericType but got 
${schema(inputColName).dataType}")
    +      require(!existingFields.exists(_.name == outputColName),
    +        s"Output column $outputColName already exists.")
    +    }
    +
    +    // Prepares output columns with proper attributes by examining input 
columns.
    +    val inputFields = $(inputCols).map(schema(_))
    +    val keepInvalid = $(handleInvalid) == 
OneHotEncoderEstimator.KEEP_INVALID
    +
    +    val outputFields = inputFields.zip(outputColNames).map { case 
(inputField, outputColName) =>
    +      OneHotEncoderCommon.transformOutputColumnSchema(
    +        inputField, $(dropLast), outputColName, keepInvalid)
    +    }
    +    StructType(schema.fields ++ outputFields)
    +  }
    +}
    +
    +/**
    + * A one-hot encoder that maps a column of category indices to a column of 
binary vectors, with
    + * at most a single one-value per row that indicates the input category 
index.
    + * For example with 5 categories, an input value of 2.0 would map to an 
output vector of
    + * `[0.0, 0.0, 1.0, 0.0]`.
    + * The last category is not included by default (configurable via 
`dropLast`),
    + * because it makes the vector entries sum up to one, and hence linearly 
dependent.
    + * So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.
    + *
    + * @note This is different from scikit-learn's OneHotEncoder, which keeps 
all categories.
    + * The output vectors are sparse.
    + *
    + * When `handleInvalid` is configured to 'keep', an extra "category" 
indicating invalid values is
    + * added as last category. So when `dropLast` is true, invalid values are 
encoded as all-zeros
    + * vector.
    + *
    + * @see `StringIndexer` for converting categorical values into category 
indices
    + */
    +@Since("2.3.0")
    +class OneHotEncoderEstimator @Since("2.3.0") (@Since("2.3.0") override val 
uid: String)
    +    extends Estimator[OneHotEncoderModel] with OneHotEncoderBase with 
DefaultParamsWritable {
    +
    +  @Since("2.3.0")
    +  def this() = this(Identifiable.randomUID("oneHotEncoder"))
    +
    +  /** @group setParam */
    +  @Since("2.3.0")
    +  def setInputCols(values: Array[String]): this.type = set(inputCols, 
values)
    +
    +  /** @group setParam */
    +  @Since("2.3.0")
    +  def setOutputCols(values: Array[String]): this.type = set(outputCols, 
values)
    +
    +  /** @group setParam */
    +  @Since("2.3.0")
    +  def setDropLast(value: Boolean): this.type = set(dropLast, value)
    +
    +  /** @group setParam */
    +  @Since("2.3.0")
    +  def setHandleInvalid(value: String): this.type = set(handleInvalid, 
value)
    +
    +  @Since("2.3.0")
    +  override def transformSchema(schema: StructType): StructType = {
    +    validateAndTransformSchema(schema)
    +  }
    +
    +  @Since("2.3.0")
    +  override def fit(dataset: Dataset[_]): OneHotEncoderModel = {
    +    val transformedSchema = transformSchema(dataset.schema)
    +    val categorySizes = new Array[Int]($(outputCols).length)
    +
    +    val columnToScanIndices = $(outputCols).zipWithIndex.flatMap { case 
(outputColName, idx) =>
    +      val numOfAttrs = AttributeGroup.fromStructField(
    +        transformedSchema(outputColName)).size
    +      if (numOfAttrs < 0) {
    +        Some(idx)
    +      } else {
    +        categorySizes(idx) = numOfAttrs
    +        None
    +      }
    +    }
    +
    +    // Some input columns don't have attributes or their attributes don't 
have necessary info.
    +    // We need to scan the data to get the number of values for each 
column.
    +    if (columnToScanIndices.length > 0) {
    +      val inputColNames = columnToScanIndices.map($(inputCols)(_))
    +      val outputColNames = columnToScanIndices.map($(outputCols)(_))
    +      val keepInvalid = $(handleInvalid) == 
OneHotEncoderEstimator.KEEP_INVALID
    +      val attrGroups = OneHotEncoderCommon.getOutputAttrGroupFromData(
    +        dataset, $(dropLast), inputColNames, outputColNames, keepInvalid)
    +      attrGroups.zip(columnToScanIndices).foreach { case (attrGroup, idx) 
=>
    +        categorySizes(idx) = attrGroup.size
    +      }
    +    }
    +
    +    val model = new OneHotEncoderModel(uid, categorySizes).setParent(this)
    +    copyValues(model)
    +  }
    +
    +  @Since("2.3.0")
    +  override def copy(extra: ParamMap): OneHotEncoderEstimator = 
defaultCopy(extra)
    +}
    +
    +@Since("2.3.0")
    +object OneHotEncoderEstimator extends 
DefaultParamsReadable[OneHotEncoderEstimator] {
    +
    +  private[feature] val KEEP_INVALID: String = "keep"
    +  private[feature] val ERROR_INVALID: String = "error"
    +  private[feature] val supportedHandleInvalids: Array[String] = 
Array(KEEP_INVALID, ERROR_INVALID)
    +
    +  @Since("2.3.0")
    +  override def load(path: String): OneHotEncoderEstimator = 
super.load(path)
    +}
    +
    +@Since("2.3.0")
    +class OneHotEncoderModel private[ml] (
    +    @Since("2.3.0") override val uid: String,
    +    @Since("2.3.0") val categorySizes: Array[Int])
    +  extends Model[OneHotEncoderModel] with OneHotEncoderBase with MLWritable 
{
    +
    +  import OneHotEncoderModel._
    +
    +  private def encoder: UserDefinedFunction = {
    +    val oneValue = Array(1.0)
    +    val emptyValues = Array.empty[Double]
    +    val emptyIndices = Array.empty[Int]
    +    val dropLast = getDropLast
    +    val handleInvalid = getHandleInvalid
    +    val keepInvalid = handleInvalid == OneHotEncoderEstimator.KEEP_INVALID
    +
    +    udf { (label: Double, size: Int) =>
    +      val numCategory = if (!dropLast && keepInvalid) {
    +        // When `handleInvalid` is 'keep' and `dropLast` is false, the 
last category is
    +        // for invalid data.
    +        size - 1
    +      } else {
    +        size
    +      }
    +
    +      if (label < numCategory) {
    +        Vectors.sparse(size, Array(label.toInt), oneValue)
    +      } else if (label == numCategory && dropLast && !keepInvalid) {
    +        Vectors.sparse(size, emptyIndices, emptyValues)
    +      } else if (dropLast && keepInvalid) {
    +        Vectors.sparse(size, emptyIndices, emptyValues)
    +      } else if (keepInvalid) {
    +        Vectors.sparse(size, Array(size - 1), oneValue)
    +      } else {
    +        assert(handleInvalid == OneHotEncoderEstimator.ERROR_INVALID)
    +        throw new SparkException(s"Unseen value: $label. To handle unseen 
values, " +
    +          s"set Param handleInvalid to 
${OneHotEncoderEstimator.KEEP_INVALID}.")
    +      }
    +    }
    +  }
    +
    +  /** @group setParam */
    +  @Since("2.3.0")
    +  def setInputCols(values: Array[String]): this.type = set(inputCols, 
values)
    +
    +  /** @group setParam */
    +  @Since("2.3.0")
    +  def setOutputCols(values: Array[String]): this.type = set(outputCols, 
values)
    +
    +  /** @group setParam */
    +  @Since("2.3.0")
    +  def setDropLast(value: Boolean): this.type = set(dropLast, value)
    +
    +  /** @group setParam */
    +  @Since("2.3.0")
    +  def setHandleInvalid(value: String): this.type = set(handleInvalid, 
value)
    +
    +  @Since("2.3.0")
    +  override def transformSchema(schema: StructType): StructType = {
    +    val inputColNames = $(inputCols)
    +    val outputColNames = $(outputCols)
    +
    +    require(inputColNames.length == categorySizes.length,
    +      s"The number of input columns ${inputColNames.length} must be the 
same as the number of " +
    +        s"features ${categorySizes.length} during fitting.")
    +
    +    val transformedSchema = validateAndTransformSchema(schema)
    +    verifyNumOfValues(transformedSchema)
    +  }
    +
    +  /**
    +   * If the metadata of input columns also specifies the number of 
categories, we need to
    +   * compare with expected category number obtained during fitting. 
Mismatched numbers will
    +   * cause exception.
    +   */
    +  private def verifyNumOfValues(schema: StructType): StructType = {
    +    $(outputCols).zipWithIndex.foreach { case (outputColName, idx) =>
    +      val inputColName = $(inputCols)(idx)
    +      val attrGroup = AttributeGroup.fromStructField(schema(outputColName))
    --- End diff --
    
    Here we use `outputColName` to get the size of the attribute group of the 
output column. For example, if the first input column specifies 5 categorical 
values, the size of the attribute group is 5.




---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request #19527: [SPARK-13030][ML] Create OneHotEncoderEstimator f...

Reply via email to