Github user jkbradley commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20970#discussion_r178938415
  
    --- Diff: mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala ---
    @@ -592,4 +593,26 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest {
             assert(features.toArray === a +: b.toArray)
         }
       }
    +
    +  test("SPARK-23562 RFormula handleInvalid should handle invalid values in non-string columns.") {
    +    val d1 = Seq(
    +      (1001L, "a"),
    +      (1002L, "b")).toDF("id1", "c1")
    +    val d2 = Seq[(java.lang.Long, String)](
    +      (20001L, "x"),
    +      (20002L, "y"),
    +      (null, null)).toDF("id2", "c2")
    +    val dataset = d1.crossJoin(d2)
    +
    +    def get_output(mode: String): DataFrame = {
    +      val formula = new RFormula().setFormula("c1 ~ id2").setHandleInvalid(mode)
    +      formula.fit(dataset).transform(dataset).select("features", "label")
    +    }
    +
    +    intercept[SparkException](get_output("error").collect())
    +      .getMessage contains "Encountered null while assembling a row"
    --- End diff --
    
    We try to avoid using infix notation.  (We tend towards Java-like use of 
Scala for simplicity.)


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to