cloud-fan commented on a change in pull request #30586:
URL: https://github.com/apache/spark/pull/30586#discussion_r535892966
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -448,42 +445,29 @@ class BasicCharVarcharTestSuite extends QueryTest with SharedSparkSession {
assert(schema.map(_.dataType) == Seq(StringType))
}
- test("user-specified schema in DataFrameReader: file source from Dataset") {
- val ds = spark.range(10).map(_.toString)
- val df1 = spark.read.schema(new StructType().add("id", CharType(5))).csv(ds)
- assert(df1.schema.map(_.dataType) == Seq(StringType))
- val df2 = spark.read.schema("id char(5)").csv(ds)
- assert(df2.schema.map(_.dataType) == Seq(StringType))
+ def failWithInvalidCharUsage[T](fn: => T): Unit = {
+ val e = intercept[AnalysisException](fn)
+ assert(e.getMessage contains "Cannot use char/varchar type")
}
- test("user-specified schema in DataFrameReader: DSV1") {
- def checkSchema(df: DataFrame): Unit = {
- val relations = df.queryExecution.analyzed.collect {
- case l: LogicalRelation => l.relation
- }
- assert(relations.length == 1)
- assert(relations.head.schema.map(_.dataType) == Seq(StringType))
- }
-
- checkSchema(spark.read.schema(new StructType().add("id", CharType(5)))
- .format(classOf[SimpleInsertSource].getName).load())
- checkSchema(spark.read.schema("id char(5)")
- .format(classOf[SimpleInsertSource].getName).load())
+ test("invalidate char/varchar in SparkSession createDataframe") {
+ val ds = spark.range(10).toDF()
+ val schema = new StructType().add("id", CharType(5))
+ failWithInvalidCharUsage(spark.createDataFrame(ds.collectAsList(), schema))
+ failWithInvalidCharUsage(spark.createDataFrame(ds.rdd, schema))
+ failWithInvalidCharUsage(spark.createDataFrame(ds.toJavaRDD, schema))
}
- test("user-specified schema in DataFrameReader: DSV2") {
- def checkSchema(df: DataFrame): Unit = {
- val tables = df.queryExecution.analyzed.collect {
- case d: DataSourceV2Relation => d.table
- }
- assert(tables.length == 1)
- assert(tables.head.schema.map(_.dataType) == Seq(StringType))
- }
+ test("invalidate char/varchar in spark.read.schema") {
+ failWithInvalidCharUsage(spark.read.schema(new StructType().add("id", CharType(5))))
+ failWithInvalidCharUsage(spark.read.schema("id char(5)"))
+ }
- checkSchema(spark.read.schema(new StructType().add("id", CharType(5)))
- .format(classOf[SchemaRequiredDataSource].getName).load())
- checkSchema(spark.read.schema("id char(5)")
- .format(classOf[SchemaRequiredDataSource].getName).load())
+ test("invalidate char/varchar in udf's result type") {
+ spark.udf.register("testchar", () => "B", VarcharType(1))
##########

Review comment:
It's better to fail when creating the UDF, i.e. at registration time, rather than only when the UDF is used in a query.
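For illustration, a minimal sketch of what this could look like in the test, reusing the failWithInvalidCharUsage helper defined earlier in this diff. It assumes the char/varchar check is moved into spark.udf.register itself so the call throws eagerly; the second UDF name and the (x: String) => x case are purely illustrative and not part of the PR:

```scala
test("invalidate char/varchar in udf's result type") {
  // Expect the AnalysisException at registration time, not when the UDF is invoked.
  failWithInvalidCharUsage(spark.udf.register("testchar", () => "B", VarcharType(1)))
  // Hypothetical extra case: a one-argument UDF declaring a char result type.
  failWithInvalidCharUsage(spark.udf.register("testchar2", (x: String) => x, CharType(5)))
}
```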