MaxGekk commented on code in PR #48397:
URL: https://github.com/apache/spark/pull/48397#discussion_r1801044046
##########
connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala:
##########
@@ -65,9 +65,98 @@ class PostgresIntegrationSuite extends
DockerJDBCIntegrationV2Suite with V2JDBCT
|)
""".stripMargin
).executeUpdate()
- connection.prepareStatement(
- "CREATE TABLE datetime (name VARCHAR(32), date1 DATE, time1 TIMESTAMP)")
+
+ connection.prepareStatement("CREATE TABLE array_test_table (int_array
int[]," +
+ "float_array FLOAT8[], timestamp_array TIMESTAMP[], string_array
TEXT[]," +
+ "datetime_array TIMESTAMPTZ[], array_of_int_arrays
INT[][])").executeUpdate()
+
+ val query =
+ """
+ INSERT INTO array_test_table
+ (int_array, float_array, timestamp_array, string_array,
+ datetime_array, array_of_int_arrays)
+ VALUES
+ (
+ ARRAY[1, 2, 3], -- Array of integers
+ ARRAY[1.1, 2.2, 3.3], -- Array of floats
+ ARRAY['2023-01-01 12:00'::timestamp, '2023-06-01
08:30'::timestamp],
+ ARRAY['hello', 'world'], -- Array of strings
+ ARRAY['2023-10-04 12:00:00+00'::timestamptz,
+ '2023-12-01 14:15:00+00'::timestamptz],
+ ARRAY[ARRAY[1, 2]] -- Array of arrays of integers
+ ),
+ (
+ ARRAY[10, 20, 30], -- Another set of data
+ ARRAY[10.5, 20.5, 30.5],
+ ARRAY['2022-01-01 09:15'::timestamp, '2022-03-15
07:45'::timestamp],
+ ARRAY['postgres', 'arrays'],
+ ARRAY['2022-11-22 09:00:00+00'::timestamptz,
+ '2022-12-31 23:59:59+00'::timestamptz],
+ ARRAY[ARRAY[10, 20]]
+ );
+ """
+ connection.prepareStatement(query).executeUpdate()
+
+ connection.prepareStatement("CREATE TABLE array_int (col
int[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_bigint(col
bigint[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_smallint (col
smallint[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_boolean (col
boolean[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_float (col
real[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_double (col
float8[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_timestamp (col
timestamp[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_timestamptz (col
timestamptz[])")
+ .executeUpdate()
+
+ connection.prepareStatement("INSERT INTO array_int VALUES
(array[array[10]])").executeUpdate()
+ connection.prepareStatement("INSERT INTO array_bigint VALUES
(array[array[10]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_smallint VALUES
(array[array[10]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_boolean VALUES
(array[array[true]])")
.executeUpdate()
+ connection.prepareStatement("INSERT INTO array_float VALUES
(array[array[10.5]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_double VALUES
(array[array[10.1]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_timestamp VALUES (" +
+ "array[array['2022-01-01 09:15'::timestamp]])").executeUpdate()
+ connection.prepareStatement("INSERT INTO array_timestamptz VALUES " +
+ "(array[array['2022-01-01 09:15'::timestamptz]])").executeUpdate()
+ connection.prepareStatement(
+ "CREATE TABLE datetime (name VARCHAR(32), date1 DATE, time1 TIMESTAMP)")
+ .executeUpdate()
+ }
+
+ test("Test multi-dimensional column types") {
+ // This test is used to verify that the multi-dimensional
+ // column types are supported by the JDBC V2 data source.
+ // We do not verify any result output
Review Comment:
Thank you for the comment.
##########
common/utils/src/main/resources/error/error-conditions.json:
##########
@@ -606,6 +606,12 @@
],
"sqlState" : "42711"
},
+ "COLUMN_ARRAY_ELEMENT_TYPE_MISMATCH" : {
+ "message" : [
+ "Some values in field <pos> are incompatible with the column array type.
Expected type <type>."
+ ],
+ "sqlState" : "0A000"
Review Comment:
Please assign a `sqlState` from the `42` error class.
##########
connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala:
##########
@@ -65,9 +65,98 @@ class PostgresIntegrationSuite extends
DockerJDBCIntegrationV2Suite with V2JDBCT
|)
""".stripMargin
).executeUpdate()
- connection.prepareStatement(
- "CREATE TABLE datetime (name VARCHAR(32), date1 DATE, time1 TIMESTAMP)")
+
+ connection.prepareStatement("CREATE TABLE array_test_table (int_array
int[]," +
+ "float_array FLOAT8[], timestamp_array TIMESTAMP[], string_array
TEXT[]," +
+ "datetime_array TIMESTAMPTZ[], array_of_int_arrays
INT[][])").executeUpdate()
+
+ val query =
+ """
+ INSERT INTO array_test_table
+ (int_array, float_array, timestamp_array, string_array,
+ datetime_array, array_of_int_arrays)
+ VALUES
+ (
+ ARRAY[1, 2, 3], -- Array of integers
+ ARRAY[1.1, 2.2, 3.3], -- Array of floats
+ ARRAY['2023-01-01 12:00'::timestamp, '2023-06-01
08:30'::timestamp],
+ ARRAY['hello', 'world'], -- Array of strings
+ ARRAY['2023-10-04 12:00:00+00'::timestamptz,
+ '2023-12-01 14:15:00+00'::timestamptz],
+ ARRAY[ARRAY[1, 2]] -- Array of arrays of integers
+ ),
+ (
+ ARRAY[10, 20, 30], -- Another set of data
+ ARRAY[10.5, 20.5, 30.5],
+ ARRAY['2022-01-01 09:15'::timestamp, '2022-03-15
07:45'::timestamp],
+ ARRAY['postgres', 'arrays'],
+ ARRAY['2022-11-22 09:00:00+00'::timestamptz,
+ '2022-12-31 23:59:59+00'::timestamptz],
+ ARRAY[ARRAY[10, 20]]
+ );
+ """
+ connection.prepareStatement(query).executeUpdate()
+
+ connection.prepareStatement("CREATE TABLE array_int (col
int[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_bigint(col
bigint[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_smallint (col
smallint[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_boolean (col
boolean[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_float (col
real[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_double (col
float8[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_timestamp (col
timestamp[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_timestamptz (col
timestamptz[])")
+ .executeUpdate()
+
+ connection.prepareStatement("INSERT INTO array_int VALUES
(array[array[10]])").executeUpdate()
+ connection.prepareStatement("INSERT INTO array_bigint VALUES
(array[array[10]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_smallint VALUES
(array[array[10]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_boolean VALUES
(array[array[true]])")
.executeUpdate()
+ connection.prepareStatement("INSERT INTO array_float VALUES
(array[array[10.5]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_double VALUES
(array[array[10.1]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_timestamp VALUES (" +
+ "array[array['2022-01-01 09:15'::timestamp]])").executeUpdate()
+ connection.prepareStatement("INSERT INTO array_timestamptz VALUES " +
+ "(array[array['2022-01-01 09:15'::timestamptz]])").executeUpdate()
+ connection.prepareStatement(
+ "CREATE TABLE datetime (name VARCHAR(32), date1 DATE, time1 TIMESTAMP)")
+ .executeUpdate()
+ }
+
+ test("Test multi-dimensional column types") {
+ // This test is used to verify that the multi-dimensional
+ // column types are supported by the JDBC V2 data source.
+ // We do not verify any result output
+ //
+ val df = spark.read.format("jdbc")
+ .option("url", jdbcUrl)
+ .option("dbtable", "array_test_table")
+ .load()
+ df.collect()
+
+ val array_tables = Seq("array_int", "array_bigint", "array_smallint",
+ "array_boolean", "array_float", "array_double", "array_timestamp",
+ "array_timestamptz")
+
+ array_tables.foreach {
+ dbtable =>
+ checkError(
+ exception = intercept[SparkSQLException] {
+ val df = spark.read.format("jdbc")
+ .option("url", jdbcUrl)
+ .option("dbtable", dbtable)
+ .load()
+ df.collect()
+ },
+ condition = "COLUMN_ARRAY_ELEMENT_TYPE_MISMATCH",
+ parameters = Map("pos" -> "0", "type" -> "array"),
Review Comment:
I would expect `toSQLType` to format the type in upper case.
##########
connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala:
##########
@@ -65,9 +65,98 @@ class PostgresIntegrationSuite extends
DockerJDBCIntegrationV2Suite with V2JDBCT
|)
""".stripMargin
).executeUpdate()
- connection.prepareStatement(
- "CREATE TABLE datetime (name VARCHAR(32), date1 DATE, time1 TIMESTAMP)")
+
+ connection.prepareStatement("CREATE TABLE array_test_table (int_array
int[]," +
+ "float_array FLOAT8[], timestamp_array TIMESTAMP[], string_array
TEXT[]," +
+ "datetime_array TIMESTAMPTZ[], array_of_int_arrays
INT[][])").executeUpdate()
+
+ val query =
+ """
+ INSERT INTO array_test_table
+ (int_array, float_array, timestamp_array, string_array,
+ datetime_array, array_of_int_arrays)
+ VALUES
+ (
+ ARRAY[1, 2, 3], -- Array of integers
+ ARRAY[1.1, 2.2, 3.3], -- Array of floats
+ ARRAY['2023-01-01 12:00'::timestamp, '2023-06-01
08:30'::timestamp],
+ ARRAY['hello', 'world'], -- Array of strings
+ ARRAY['2023-10-04 12:00:00+00'::timestamptz,
+ '2023-12-01 14:15:00+00'::timestamptz],
+ ARRAY[ARRAY[1, 2]] -- Array of arrays of integers
+ ),
+ (
+ ARRAY[10, 20, 30], -- Another set of data
+ ARRAY[10.5, 20.5, 30.5],
+ ARRAY['2022-01-01 09:15'::timestamp, '2022-03-15
07:45'::timestamp],
+ ARRAY['postgres', 'arrays'],
+ ARRAY['2022-11-22 09:00:00+00'::timestamptz,
+ '2022-12-31 23:59:59+00'::timestamptz],
+ ARRAY[ARRAY[10, 20]]
+ );
+ """
+ connection.prepareStatement(query).executeUpdate()
+
+ connection.prepareStatement("CREATE TABLE array_int (col
int[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_bigint(col
bigint[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_smallint (col
smallint[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_boolean (col
boolean[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_float (col
real[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_double (col
float8[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_timestamp (col
timestamp[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_timestamptz (col
timestamptz[])")
+ .executeUpdate()
+
+ connection.prepareStatement("INSERT INTO array_int VALUES
(array[array[10]])").executeUpdate()
+ connection.prepareStatement("INSERT INTO array_bigint VALUES
(array[array[10]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_smallint VALUES
(array[array[10]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_boolean VALUES
(array[array[true]])")
.executeUpdate()
+ connection.prepareStatement("INSERT INTO array_float VALUES
(array[array[10.5]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_double VALUES
(array[array[10.1]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_timestamp VALUES (" +
+ "array[array['2022-01-01 09:15'::timestamp]])").executeUpdate()
+ connection.prepareStatement("INSERT INTO array_timestamptz VALUES " +
+ "(array[array['2022-01-01 09:15'::timestamptz]])").executeUpdate()
+ connection.prepareStatement(
+ "CREATE TABLE datetime (name VARCHAR(32), date1 DATE, time1 TIMESTAMP)")
+ .executeUpdate()
+ }
+
+ test("Test multi-dimensional column types") {
+ // This test is used to verify that the multi-dimensional
+ // column types are supported by the JDBC V2 data source.
+ // We do not verify any result output
+ //
+ val df = spark.read.format("jdbc")
+ .option("url", jdbcUrl)
+ .option("dbtable", "array_test_table")
+ .load()
+ df.collect()
+
+ val array_tables = Seq("array_int", "array_bigint", "array_smallint",
+ "array_boolean", "array_float", "array_double", "array_timestamp",
+ "array_timestamptz")
+
+ array_tables.foreach {
+ dbtable =>
Review Comment:
```suggestion
array_tables.foreach { dbtable =>
```
Please look at the examples in the style guide:
https://github.com/databricks/scala-style-guide?tab=readme-ov-file#anonymous-methods
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]