This is an automated email from the ASF dual-hosted git repository.
srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new fe07b21 [SPARK-31400][ML] The catalogString doesn't distinguish
Vectors in ml and mllib
fe07b21 is described below
commit fe07b21b8ab60def6c4451c661e4dd46a4d48b5a
Author: TJX2014 <[email protected]>
AuthorDate: Sun Apr 26 11:35:44 2020 -0500
[SPARK-31400][ML] The catalogString doesn't distinguish Vectors in ml and
mllib
What changes were proposed in this pull request?
1. Add class info output in
org.apache.spark.ml.util.SchemaUtils#checkColumnType to distinguish Vectors in ml
and mllib
2. Add a unit test
Why are the changes needed?
The catalogString doesn't distinguish Vectors in ml and mllib when an mllib
vector is misused in ml
https://issues.apache.org/jira/browse/SPARK-31400
Does this PR introduce any user-facing change?
No
How was this patch tested?
Unit test is added
Closes #28347 from
TJX2014/master-catalogString-distinguish-Vectors-in-ml-and-mllib.
Authored-by: TJX2014 <[email protected]>
Signed-off-by: Sean Owen <[email protected]>
---
.../org/apache/spark/ml/util/SchemaUtils.scala | 4 ++--
.../apache/spark/mllib/util/TestingUtilsSuite.scala | 21 ++++++++++++++++++++-
2 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
index 752069d..c08d7e8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
@@ -42,8 +42,8 @@ private[spark] object SchemaUtils {
val actualDataType = schema(colName).dataType
val message = if (msg != null && msg.trim.length > 0) " " + msg else ""
require(actualDataType.equals(dataType),
- s"Column $colName must be of type ${dataType.catalogString} but was
actually " +
- s"${actualDataType.catalogString}.$message")
+ s"Column $colName must be of type
${dataType.getClass}:${dataType.catalogString} " +
+ s"but was actually
${actualDataType.getClass}:${actualDataType.catalogString}.$message")
}
/**
diff --git
a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
index 3fcf1cf..bc80e86 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
@@ -20,9 +20,11 @@ package org.apache.spark.mllib.util
import org.scalatest.exceptions.TestFailedException
import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.linalg.VectorUDT
+import org.apache.spark.ml.util.SchemaUtils
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.util.TestingUtils._
-
+import org.apache.spark.sql.types.{StructField, StructType}
class TestingUtilsSuite extends SparkFunSuite {
test("Comparing doubles using relative error.") {
@@ -457,4 +459,21 @@ class TestingUtilsSuite extends SparkFunSuite {
assert(Matrices.sparse(2, 2, Array(0, 1, 2), Array(0, 1), Array(3.1, 3.5))
!~=
Matrices.dense(0, 0, Array()) relTol 0.01)
}
+
+ test("SPARK-31400, catalogString distinguish Vectors in ml and mllib") {
+ val schema = StructType(Array[StructField] {
+ StructField("features", new org.apache.spark.mllib.linalg.VectorUDT)
+ })
+ val e = intercept[IllegalArgumentException] {
+ SchemaUtils.checkColumnType(schema, "features", new VectorUDT)
+ }
+ assert(e.getMessage.contains(
+
"org.apache.spark.mllib.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>"),
+ "dataType is not desired")
+
+ val normalSchema = StructType(Array[StructField] {
+ StructField("features", new VectorUDT)
+ })
+ SchemaUtils.checkColumnType(normalSchema, "features", new VectorUDT)
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]