[spark] branch master updated: [SPARK-31400][ML] The catalogString doesn't distinguish Vectors in ml and mllib

srowen Sun, 26 Apr 2020 09:37:43 -0700

This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new fe07b21  [SPARK-31400][ML] The catalogString doesn't distinguish 
Vectors in ml and mllib
fe07b21 is described below

commit fe07b21b8ab60def6c4451c661e4dd46a4d48b5a
Author: TJX2014 <[email protected]>
AuthorDate: Sun Apr 26 11:35:44 2020 -0500

    [SPARK-31400][ML] The catalogString doesn't distinguish Vectors in ml and 
mllib
    
    What changes were proposed in this pull request?
    1. Add class info output in 
org.apache.spark.ml.util.SchemaUtils#checkColumnType to distinguish Vectors in 
ml and mllib.
    2. Add a unit test.
    
    Why are the changes needed?
    The catalogString doesn't distinguish Vectors in ml and mllib when an mllib 
vector is misused in ml.
    https://issues.apache.org/jira/browse/SPARK-31400
    
    Does this PR introduce any user-facing change?
    No
    
    How was this patch tested?
    A unit test was added.
    
    Closes #28347 from 
TJX2014/master-catalogString-distinguish-Vectors-in-ml-and-mllib.
    
    Authored-by: TJX2014 <[email protected]>
    Signed-off-by: Sean Owen <[email protected]>
---
 .../org/apache/spark/ml/util/SchemaUtils.scala      |  4 ++--
 .../apache/spark/mllib/util/TestingUtilsSuite.scala | 21 ++++++++++++++++++++-
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala 
b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
index 752069d..c08d7e8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
@@ -42,8 +42,8 @@ private[spark] object SchemaUtils {
     val actualDataType = schema(colName).dataType
     val message = if (msg != null && msg.trim.length > 0) " " + msg else ""
     require(actualDataType.equals(dataType),
-      s"Column $colName must be of type ${dataType.catalogString} but was 
actually " +
-        s"${actualDataType.catalogString}.$message")
+      s"Column $colName must be of type 
${dataType.getClass}:${dataType.catalogString} " +
+        s"but was actually 
${actualDataType.getClass}:${actualDataType.catalogString}.$message")
   }
 
   /**
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
index 3fcf1cf..bc80e86 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
@@ -20,9 +20,11 @@ package org.apache.spark.mllib.util
 import org.scalatest.exceptions.TestFailedException
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.linalg.VectorUDT
+import org.apache.spark.ml.util.SchemaUtils
 import org.apache.spark.mllib.linalg.{Matrices, Vectors}
 import org.apache.spark.mllib.util.TestingUtils._
-
+import org.apache.spark.sql.types.{StructField, StructType}
 class TestingUtilsSuite extends SparkFunSuite {
 
   test("Comparing doubles using relative error.") {
@@ -457,4 +459,21 @@ class TestingUtilsSuite extends SparkFunSuite {
     assert(Matrices.sparse(2, 2, Array(0, 1, 2), Array(0, 1), Array(3.1, 3.5)) 
!~=
       Matrices.dense(0, 0, Array()) relTol 0.01)
   }
+
+  test("SPARK-31400, catalogString distinguish Vectors in ml and mllib") {
+    val schema = StructType(Array[StructField] {
+      StructField("features", new org.apache.spark.mllib.linalg.VectorUDT)
+    })
+    val e = intercept[IllegalArgumentException] {
+      SchemaUtils.checkColumnType(schema, "features", new VectorUDT)
+    }
+    assert(e.getMessage.contains(
+      
"org.apache.spark.mllib.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>"),
+      "dataType is not desired")
+
+    val normalSchema = StructType(Array[StructField] {
+      StructField("features", new VectorUDT)
+    })
+    SchemaUtils.checkColumnType(normalSchema, "features", new VectorUDT)
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] branch master updated: [SPARK-31400][ML] The catalogString doesn't distinguish Vectors in ml and mllib

Reply via email to