[GitHub] [spark] cloud-fan commented on a change in pull request #30412: [SPARK-33480][SQL] Support char/varchar type

GitBox Mon, 23 Nov 2020 09:36:53 -0800


cloud-fan commented on a change in pull request #30412:
URL: https://github.com/apache/spark/pull/30412#discussion_r528880969




##########
File path: 
sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -0,0 +1,374 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.{SparkConf, SparkException}
+import org.apache.spark.sql.connector.InMemoryTableCatalog
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils}
+
+trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
+
+  def format: String // interpolated into the `USING $format` clause of the CREATE TABLE statements
+
+  // A one-character value written to a top-level CHAR(5) column is expected back as five characters.
+  test("char type values should be padded: top-level columns") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
+      sql("INSERT INTO t VALUES ('1', 'a')")
+      val padded = "a" + " " * 4
+      checkAnswer(spark.table("t"), Row("1", padded))
+    }
+  }
+
+  // Same padding expectation as the top-level case, with the CHAR(5) column used as the
+  // partition column. The body is skipped when a default catalog is configured.
+  test("char type values should be padded: partitioned columns") {
+    // DS V2 doesn't support partitioned table.
+    if (!conf.contains(SQLConf.DEFAULT_CATALOG.key)) {
+      withTable("t") {
+        sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+        sql("INSERT INTO t VALUES ('1', 'a')")
+        checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+      }
+    }
+  }
+
+  // CHAR(5) as a struct field: the field value is expected back padded to five characters.
+  test("char type values should be padded: nested in struct") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(i STRING, c STRUCT<c: CHAR(5)>) USING $format")
+      sql("INSERT INTO t VALUES ('1', struct('a'))")
+      val paddedStruct = Row("a" + " " * 4)
+      checkAnswer(spark.table("t"), Row("1", paddedStruct))
+    }
+  }
+
+  // CHAR(5) as an array element: every element is expected back padded to five characters.
+  test("char type values should be padded: nested in array") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(i STRING, c ARRAY<CHAR(5)>) USING $format")
+      sql("INSERT INTO t VALUES ('1', array('a', 'ab'))")
+      checkAnswer(spark.table("t"), Row("1", Seq("a" + " " * 4, "ab" + " " * 3)))
+    }
+  }
+
+  // CHAR(5) as a map key: the key is expected back padded, the STRING value unchanged.
+  test("char type values should be padded: nested in map key") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(i STRING, c MAP<CHAR(5), STRING>) USING $format")
+      sql("INSERT INTO t VALUES ('1', map('a', 'ab'))")
+      val paddedKey = "a" + " " * 4
+      checkAnswer(spark.table("t"), Row("1", Map(paddedKey -> "ab")))
+    }
+  }
+
+  // CHAR(5) as a map value: the value is expected back padded, the STRING key unchanged.
+  test("char type values should be padded: nested in map value") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(i STRING, c MAP<STRING, CHAR(5)>) USING $format")
+      sql("INSERT INTO t VALUES ('1', map('a', 'ab'))")
+      val paddedValue = "ab" + " " * 3
+      checkAnswer(spark.table("t"), Row("1", Map("a" -> paddedValue)))
+    }
+  }
+
+  // MAP<CHAR(5), CHAR(10)>: the key is expected padded to 5 and the value to 10 characters.
+  test("char type values should be padded: nested in both map key and value") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(i STRING, c MAP<CHAR(5), CHAR(10)>) USING $format")
+      sql("INSERT INTO t VALUES ('1', map('a', 'ab'))")
+      checkAnswer(spark.table("t"), Row("1", Map(("a" + " " * 4, "ab" + " " * 8))))
+    }
+  }
+
+  // CHAR(5) elements of an array held in a struct field are expected back padded.
+  test("char type values should be padded: nested in struct of array") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(i STRING, c STRUCT<c: ARRAY<CHAR(5)>>) USING $format")
+      sql("INSERT INTO t VALUES ('1', struct(array('a', 'ab')))")
+      checkAnswer(spark.table("t"), Row("1", Row(Seq("a" + " " * 4, "ab" + " " * 3))))
+    }
+  }
+
+  // CHAR(5) fields of structs held in an array are expected back padded.
+  test("char type values should be padded: nested in array of struct") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(i STRING, c ARRAY<STRUCT<c: CHAR(5)>>) USING $format")
+      sql("INSERT INTO t VALUES ('1', array(struct('a'), struct('ab')))")
+      checkAnswer(spark.table("t"), Row("1", Seq(Row("a" + " " * 4), Row("ab" + " " * 3))))
+    }
+  }
+
+  // CHAR(5) elements of a nested array are expected back padded.
+  test("char type values should be padded: nested in array of array") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(i STRING, c ARRAY<ARRAY<CHAR(5)>>) USING $format")
+      sql("INSERT INTO t VALUES ('1', array(array('a', 'ab')))")
+      checkAnswer(spark.table("t"), Row("1", Seq(Seq("a" + " " * 4, "ab" + " " * 3))))
+    }
+  }
+
+  /**
+   * Runs `f` twice, first with "char" and then with "varchar", each run wrapped in its own
+   * `withTable("t")` scope.
+   */
+  private def testTableWrite(f: String => Unit): Unit = {
+    Seq("char", "varchar").foreach { typeName =>
+      withTable("t") { f(typeName) }
+    }
+  }
+
+  // For both char and varchar: null is accepted, a six-character value into length 5 must fail.
+  test("length check for input string values: top-level columns") {
+    testTableWrite { typeName =>
+      sql(s"CREATE TABLE t(c $typeName(5)) USING $format")
+      sql("INSERT INTO t VALUES (null)")
+      checkAnswer(spark.table("t"), Row(null))
+      val expectedMessage = s"input string '123456' exceeds $typeName type length limitation: 5"
+      val failure = intercept[SparkException] {
+        sql("INSERT INTO t VALUES ('123456')")
+      }
+      assert(failure.getCause.getMessage.contains(expectedMessage))
+    }
+  }
+
+  // Length check with the char/varchar column used as the partition column: null is accepted,
+  // a six-character value must fail. The body is skipped when a default catalog is configured.
+  test("length check for input string values: partitioned columns") {
+    // DS V2 doesn't support partitioned table.
+    if (!conf.contains(SQLConf.DEFAULT_CATALOG.key)) {
+      testTableWrite { typeName =>
+        sql(s"CREATE TABLE t(i INT, c $typeName(5)) USING $format PARTITIONED BY (c)")
+        sql("INSERT INTO t VALUES (1, null)")
+        checkAnswer(spark.table("t"), Row(1, null))
+        val e = intercept[SparkException](sql("INSERT INTO t VALUES (1, '123456')"))
+        assert(e.getCause.getMessage.contains(
+          s"input string '123456' exceeds $typeName type length limitation: 5"))
+      }
+    }
+  }
+
+  // Length check for a char/varchar struct field: null is accepted, an over-long value must fail.
+  test("length check for input string values: nested in struct") {
+    testTableWrite { typeName =>
+      sql(s"CREATE TABLE t(c STRUCT<c: $typeName(5)>) USING $format")
+      sql("INSERT INTO t SELECT struct(null)")
+      checkAnswer(spark.table("t"), Row(Row(null)))
+      val e = intercept[SparkException](sql("INSERT INTO t SELECT struct('123456')"))
+      assert(e.getCause.getMessage.contains(
+        s"input string '123456' exceeds $typeName type length limitation: 5"))
+    }
+  }
+
+  // Length check for char/varchar array elements: a null element is accepted, and an array
+  // containing one over-long element must fail.
+  test("length check for input string values: nested in array") {
+    testTableWrite { typeName =>
+      sql(s"CREATE TABLE t(c ARRAY<$typeName(5)>) USING $format")
+      sql("INSERT INTO t VALUES (array(null))")
+      checkAnswer(spark.table("t"), Row(Seq(null)))
+      val e = intercept[SparkException](sql("INSERT INTO t VALUES (array('a', '123456'))"))
+      assert(e.getCause.getMessage.contains(
+        s"input string '123456' exceeds $typeName type length limitation: 5"))
+    }
+  }
+
+  // Length check for a char/varchar map key: an over-long key must fail the insert.
+  test("length check for input string values: nested in map key") {
+    testTableWrite { typeName =>
+      sql(s"CREATE TABLE t(c MAP<$typeName(5), STRING>) USING $format")
+      val e = intercept[SparkException](sql("INSERT INTO t VALUES (map('123456', 'a'))"))
+      assert(e.getCause.getMessage.contains(
+        s"input string '123456' exceeds $typeName type length limitation: 5"))
+    }
+  }
+
+  // Length check for a char/varchar map value: null is accepted, an over-long value must fail.
+  test("length check for input string values: nested in map value") {
+    testTableWrite { typeName =>
+      sql(s"CREATE TABLE t(c MAP<STRING, $typeName(5)>) USING $format")
+      sql("INSERT INTO t VALUES (map('a', null))")
+      checkAnswer(spark.table("t"), Row(Map("a" -> null)))
+      val e = intercept[SparkException](sql("INSERT INTO t VALUES (map('a', '123456'))"))
+      assert(e.getCause.getMessage.contains(
+        s"input string '123456' exceeds $typeName type length limitation: 5"))
+    }
+  }
+
+  // Length check when both map key and value are char/varchar: an over-long key and an
+  // over-long value must each fail the insert.
+  test("length check for input string values: nested in both map key and value") {
+    testTableWrite { typeName =>
+      sql(s"CREATE TABLE t(c MAP<$typeName(5), $typeName(5)>) USING $format")
+      val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (map('123456', 'a'))"))
+      assert(e1.getCause.getMessage.contains(
+        s"input string '123456' exceeds $typeName type length limitation: 5"))
+      val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (map('a', '123456'))"))
+      assert(e2.getCause.getMessage.contains(
+        s"input string '123456' exceeds $typeName type length limitation: 5"))
+    }
+  }
+
+  // Length check for char/varchar elements of an array inside a struct.
+  test("length check for input string values: nested in struct of array") {
+    testTableWrite { typeName =>
+      sql(s"CREATE TABLE t(c STRUCT<c: ARRAY<$typeName(5)>>) USING $format")
+      sql("INSERT INTO t SELECT struct(array(null))")
+      checkAnswer(spark.table("t"), Row(Row(Seq(null))))
+      val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))"))
+      assert(e.getCause.getMessage.contains(
+        s"input string '123456' exceeds $typeName type length limitation: 5"))
+    }
+  }
+
+  // Length check for char/varchar fields of structs inside an array.
+  test("length check for input string values: nested in array of struct") {
+    testTableWrite { typeName =>
+      sql(s"CREATE TABLE t(c ARRAY<STRUCT<c: $typeName(5)>>) USING $format")
+      sql("INSERT INTO t VALUES (array(struct(null)))")
+      checkAnswer(spark.table("t"), Row(Seq(Row(null))))
+      val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(struct('123456')))"))
+      assert(e.getCause.getMessage.contains(
+        s"input string '123456' exceeds $typeName type length limitation: 5"))
+    }
+  }
+
+  // Length check for char/varchar elements of a nested array.
+  test("length check for input string values: nested in array of array") {
+    testTableWrite { typeName =>
+      sql(s"CREATE TABLE t(c ARRAY<ARRAY<$typeName(5)>>) USING $format")
+      sql("INSERT INTO t VALUES (array(array(null)))")
+      checkAnswer(spark.table("t"), Row(Seq(Seq(null))))
+      val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(array('123456')))"))
+      assert(e.getCause.getMessage.contains(
+        s"input string '123456' exceeds $typeName type length limitation: 5"))
+    }
+  }
+
+  // Inputs with trailing spaces: the six-character '1234  ' is accepted by both length-5 columns
+  // and is expected back as "1234 "; '12 ' comes back padded for CHAR and unchanged for VARCHAR.
+  test("length check for input string values: with trailing spaces") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format")
+      sql("INSERT INTO t VALUES ('12 ', '12 ')")
+      sql("INSERT INTO t VALUES ('1234  ', '1234  ')")
+      val expectedRows = Seq(
+        Row("12" + " " * 3, "12 "),
+        Row("1234 ", "1234 "))
+      checkAnswer(spark.table("t"), expectedRows)
+    }
+  }
+
+  // Integer literals inserted into CHAR(5)/VARCHAR(5) columns: 1234 is stored (padded for CHAR),
+  // while 123456 must fail with the char or varchar length error for the respective column.
+  test("length check for input string values: with implicit cast") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format")
+      sql("INSERT INTO t VALUES (1234, 1234)")
+      checkAnswer(spark.table("t"), Row("1234 ", "1234"))
+      val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (123456, 1)"))
+      assert(e1.getCause.getMessage.contains(
+        "input string '123456' exceeds char type length limitation: 5"))
+      val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (1, 123456)"))
+      assert(e2.getCause.getMessage.contains(
+        "input string '123456' exceeds varchar type length limitation: 5"))
+    }
+  }
+
+  /**
+   * Selects every condition expression from `df` and checks that the result is a single row
+   * holding the paired expected booleans, in the same order.
+   *
+   * @param df         the DataFrame the expressions are evaluated against
+   * @param conditions pairs of (SQL boolean expression, expected result)
+   */
+  private def testConditions(df: DataFrame, conditions: Seq[(String, Boolean)]): Unit = {
+    checkAnswer(df.selectExpr(conditions.map(_._1): _*), Row.fromSeq(conditions.map(_._2)))
+  }
+
+  // CHAR(2) and CHAR(5) columns both holding 'a': comparisons against literals with or without
+  // trailing spaces, and against each other, are expected to treat the values as equal.
+  test("char type comparison: top-level columns") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(c1 CHAR(2), c2 CHAR(5)) USING $format")
+      sql("INSERT INTO t VALUES ('a', 'a')")
+      val expectations = Seq(
+        "c1 = 'a'" -> true,
+        "'a' = c1" -> true,
+        "c1 = 'a  '" -> true,
+        "c1 > 'a'" -> false,
+        "c1 IN ('a', 'b')" -> true,
+        "c1 = c2" -> true,
+        "c1 < c2" -> false,
+        "c1 IN (c2)" -> true)
+      testConditions(spark.table("t"), expectations)
+    }
+  }
+
+  // Same comparison expectations as the top-level case, with both char columns used as
+  // partition columns.
+  // NOTE(review): unlike the other two partitioned-column tests in this suite, this one is not
+  // wrapped in the `conf.contains(SQLConf.DEFAULT_CATALOG.key)` guard; confirm that is intended.
+  test("char type comparison: partitioned columns") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(i INT, c1 CHAR(2), c2 CHAR(5)) USING $format PARTITIONED BY (c1, c2)")
+      sql("INSERT INTO t VALUES (1, 'a', 'a')")
+      testConditions(spark.table("t"), Seq(
+        ("c1 = 'a'", true),
+        ("'a' = c1", true),
+        ("c1 = 'a  '", true),
+        ("c1 > 'a'", false),
+        ("c1 IN ('a', 'b')", true),
+        ("c1 = c2", true),
+        ("c1 < c2", false),
+        ("c1 IN (c2)", true)))
+    }
+  }
+
+  // Equi-join between a CHAR(2) and a CHAR(5) column both holding 'a'; one row with t1's
+  // value "a " is expected.
+  test("char type comparison: join") {
+    withTable("t1", "t2") {
+      sql(s"CREATE TABLE t1(c CHAR(2)) USING $format")
+      sql(s"CREATE TABLE t2(c CHAR(5)) USING $format")
+      sql("INSERT INTO t1 VALUES ('a')")
+      sql("INSERT INTO t2 VALUES ('a')")
+      val joined = sql("SELECT t1.c FROM t1 JOIN t2 ON t1.c = t2.c")
+      checkAnswer(joined, Row("a "))
+    }
+  }
+
+  // Structs with a CHAR(2) field and a CHAR(5) field, both holding 'a', are expected to
+  // compare equal.
+  test("char type comparison: nested in struct") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(c1 STRUCT<c: CHAR(2)>, c2 STRUCT<c: CHAR(5)>) USING $format")
+      sql("INSERT INTO t VALUES (struct('a'), struct('a'))")
+      testConditions(spark.table("t"), Seq(
+        ("c1 = c2", true),
+        ("c1 < c2", false),
+        ("c1 IN (c2)", true)))
+    }
+  }
+
+  // Arrays of CHAR(2) and of CHAR(5) holding the same elements are expected to compare equal.
+  test("char type comparison: nested in array") {
+    withTable("t") {
+      sql(s"CREATE TABLE t(c1 ARRAY<CHAR(2)>, c2 ARRAY<CHAR(5)>) USING $format")
+      sql("INSERT INTO t VALUES (array('a', 'b'), array('a', 'b'))")
+      testConditions(spark.table("t"), Seq(
+        ("c1 = c2", true),
+        ("c1 < c2", false),
+        ("c1 IN (c2)", true)))
+    }
+  }
+
+  // Structs wrapping arrays of CHAR(2) and of CHAR(5) with the same elements are expected to
+  // compare equal.
+  test("char type comparison: nested in struct of array") {
+    withTable("t") {
+      sql("CREATE TABLE t(c1 STRUCT<a: ARRAY<CHAR(2)>>, c2 STRUCT<a: ARRAY<CHAR(5)>>) " +
+        s"USING $format")
+      sql("INSERT INTO t VALUES (struct(array('a', 'b')), struct(array('a', 'b')))")
+      testConditions(spark.table("t"), Seq(
+        ("c1 = c2", true),
+        ("c1 < c2", false),
+        ("c1 IN (c2)", true)))
+    }
+  }
+
+  // Arrays of structs with a CHAR(2) field and with a CHAR(5) field, holding the same value,
+  // are expected to compare equal.
+  test("char type comparison: nested in array of struct") {
+    withTable("t") {
+      sql("CREATE TABLE t(c1 ARRAY<STRUCT<c: CHAR(2)>>, c2 ARRAY<STRUCT<c: CHAR(5)>>) " +
+        s"USING $format")
+      sql("INSERT INTO t VALUES (array(struct('a')), array(struct('a')))")
+      testConditions(spark.table("t"), Seq(
+        ("c1 = c2", true),
+        ("c1 < c2", false),
+        ("c1 IN (c2)", true)))
+    }
+  }
+
+  test("char type comparison: nested in array of array") {

Review comment:
       map type is not comparable.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] cloud-fan commented on a change in pull request #30412: [SPARK-33480][SQL] Support char/varchar type

Reply via email to