[GitHub] [spark] zhengruifeng commented on a change in pull request #20732: [SPARK-23578][ML] Add multicolumn support for Binarizer

GitBox Mon, 23 Sep 2019 01:53:42 -0700

zhengruifeng commented on a change in pull request #20732: [SPARK-23578][ML] 
Add multicolumn support for Binarizer
URL: https://github.com/apache/spark/pull/20732#discussion_r327002945


 ##########
 File path: mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
 ##########
 @@ -45,66 +47,117 @@ final class Binarizer @Since("1.4.0") (@Since("1.4.0") 
override val uid: String)
    * The features greater than the threshold, will be binarized to 1.0.
    * The features equal to or less than the threshold, will be binarized to 
0.0.
    * Default: 0.0
+   *
    * @group param
    */
   @Since("1.4.0")
   val threshold: DoubleParam =
-    new DoubleParam(this, "threshold", "threshold used to binarize continuous 
features")
+  new DoubleParam(this, "threshold", "threshold used to binarize continuous 
features")
+
+  /** @group param */
+  @Since("2.3.1")
+  val thresholds: DoubleArrayParam =
+    new DoubleArrayParam(this, "thresholds", "thresholds used to binarize 
continuous features")
 
   /** @group getParam */
   @Since("1.4.0")
   def getThreshold: Double = $(threshold)
 
+  /** @group getParam */
+  @Since("2.3.1")
+  def getThresholds: Array[Double] = $(thresholds)
+
   /** @group setParam */
   @Since("1.4.0")
   def setThreshold(value: Double): this.type = set(threshold, value)
 
   setDefault(threshold -> 0.0)
 
+  /** @group setParam */
+  @Since("2.3.1")
+  def setThresholds(value: Array[Double]): this.type = set(thresholds, value)
+
   /** @group setParam */
   @Since("1.4.0")
   def setInputCol(value: String): this.type = set(inputCol, value)
 
+  /** @group setParam */
+  @Since("2.3.1")
+  def setInputCols(value: Array[String]): this.type = set(inputCols, value)
+
   /** @group setParam */
   @Since("1.4.0")
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
+  @Since("2.3.1")
+  def setOutputCols(value: Array[String]): this.type = set(outputCols, value)
+
+  @Since("2.3.1")
+  private[feature] def isBinarizerMultipleColumns(): Boolean = {
+    if (isSet(inputCols) && isSet(inputCol)) {
+      logWarning("Both `inputCol` and `inputCols` are set, we ignore 
`inputCols` and this " +
+        "`Binarizer` only maps one column specified by `inputCol`")
+      false
+    } else if (isSet(inputCols)) {
+      true
+    } else {
+      false
+    }
+  }
+
   @Since("2.0.0")
   override def transform(dataset: Dataset[_]): DataFrame = {
     val outputSchema = transformSchema(dataset.schema, logging = true)
     val schema = dataset.schema
-    val inputType = schema($(inputCol)).dataType
-    val td = $(threshold)
-
-    val binarizerDouble = udf { in: Double => if (in > td) 1.0 else 0.0 }
-    val binarizerVector = udf { (data: Vector) =>
-      val indices = ArrayBuilder.make[Int]
-      val values = ArrayBuilder.make[Double]
-
-      data.foreachActive { (index, value) =>
-        if (value > td) {
-          indices += index
-          values +=  1.0
+
+    val (inputColName, outputColName, td) = if (isBinarizerMultipleColumns()) {
+      ($(inputCols).toSeq, $(outputCols).toSeq, $(thresholds).toSeq)
+    }
+    else {
+      (Seq($(inputCol)), Seq($(outputCol)), Seq($(threshold)))
+    }
+
+    val inputType = inputColName.map { col => schema(col).dataType }
+
+    val binarizerDouble: Seq[UserDefinedFunction] = td.map {
 
 Review comment:
   I prefer not to create `binarizerDouble` & `binarizerVector` up front, 
since some of their entries are never used.
   I would prefer to initialize the new Columns like this:
   ```scala
   val outputCols = inputColNames.zip(outputColNames).zip(thresholds).map { case 
((inputColName, outputColName), threshold) =>
      schema(inputColName).dataType match {
      ...
     }
   }
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] zhengruifeng commented on a change in pull request #20732: [SPARK-23578][ML] Add multicolumn support for Binarizer

Reply via email to