MaxGekk commented on code in PR #48397:
URL: https://github.com/apache/spark/pull/48397#discussion_r1799554487
##########
connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala:
##########
@@ -65,9 +65,139 @@ class PostgresIntegrationSuite extends
DockerJDBCIntegrationV2Suite with V2JDBCT
|)
""".stripMargin
).executeUpdate()
- connection.prepareStatement(
- "CREATE TABLE datetime (name VARCHAR(32), date1 DATE, time1 TIMESTAMP)")
+
+ connection.prepareStatement("CREATE TABLE array_test_table (int_array
int[]," +
+ "float_array FLOAT8[], timestamp_array TIMESTAMP[], string_array
TEXT[]," +
+ "datetime_array TIMESTAMPTZ[], array_of_int_arrays
INT[][])").executeUpdate()
+
+ val query =
+ """
+ INSERT INTO array_test_table
+ (int_array, float_array, timestamp_array, string_array,
+ datetime_array, array_of_int_arrays)
+ VALUES
+ (
+ ARRAY[1, 2, 3], -- Array of integers
+ ARRAY[1.1, 2.2, 3.3], -- Array of floats
+ ARRAY['2023-01-01 12:00'::timestamp, '2023-06-01
08:30'::timestamp],
+ ARRAY['hello', 'world'], -- Array of strings
+ ARRAY['2023-10-04 12:00:00+00'::timestamptz,
+ '2023-12-01 14:15:00+00'::timestamptz],
+ ARRAY[ARRAY[1, 2]] -- Array of arrays of integers
+ ),
+ (
+ ARRAY[10, 20, 30], -- Another set of data
+ ARRAY[10.5, 20.5, 30.5],
+ ARRAY['2022-01-01 09:15'::timestamp, '2022-03-15
07:45'::timestamp],
+ ARRAY['postgres', 'arrays'],
+ ARRAY['2022-11-22 09:00:00+00'::timestamptz,
+ '2022-12-31 23:59:59+00'::timestamptz],
+ ARRAY[ARRAY[10, 20]]
+ );
+ """
+ connection.prepareStatement(query).executeUpdate()
+
+ connection.prepareStatement("CREATE TABLE array_int (col
int[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_bigint(col
bigint[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_smallint (col
smallint[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_boolean (col
boolean[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_float (col
real[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_double (col
float8[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_timestamp (col
timestamp[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_timestamptz (col
timestamptz[])")
+ .executeUpdate()
+
+ connection.prepareStatement("INSERT INTO array_int VALUES
(array[array[10]])").executeUpdate()
+ connection.prepareStatement("INSERT INTO array_bigint VALUES
(array[array[10]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_smallint VALUES
(array[array[10]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_boolean VALUES
(array[array[true]])")
.executeUpdate()
+ connection.prepareStatement("INSERT INTO array_float VALUES
(array[array[10.5]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_double VALUES
(array[array[10.1]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_timestamp VALUES (" +
+ "array[array['2022-01-01 09:15'::timestamp]])").executeUpdate()
+ connection.prepareStatement("INSERT INTO array_timestamptz VALUES " +
+ "(array[array['2022-01-01 09:15'::timestamptz]])").executeUpdate()
+ connection.prepareStatement(
+ "CREATE TABLE datetime (name VARCHAR(32), date1 DATE, time1 TIMESTAMP)")
+ .executeUpdate()
+ }
+
+ test("Test multi-dimensional column types") {
+ val df = spark.read.format("jdbc")
+ .option("url", jdbcUrl)
+ .option("dbtable", "array_test_table")
+ .load()
+ df.collect()
+
+
+ intercept[SparkSQLException] {
+ val df = spark.read.format("jdbc")
+ .option("url", jdbcUrl)
+ .option("dbtable", "array_int")
+ .load()
+ df.collect()
+ }
+
+ intercept[SparkSQLException] {
Review Comment:
Could you deduplicate the code like:
```scala
Seq("array_int", "array_bigint").foreach { dbtable =>
intercept[SparkSQLException] {
spark.read.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", dbtable)
.load()
.collect()
}
}
```
##########
common/utils/src/main/resources/error/error-conditions.json:
##########
@@ -606,6 +606,12 @@
],
"sqlState" : "42711"
},
+ "COLUMN_ARRAY_ELEMENT_TYPE_MISMATCH" : {
+ "message" : [
+ "Some values in field <pos> are incompatible with the column array type.
Expected type <type>."
+ ],
+ "sqlState" : "0A000"
Review Comment:
Why did you select this error class:
```
"0A": "feature not supported",
```
Please consider another one — for instance, a sqlState in class `42`
("syntax error or access rule violation"), which better fits a type mismatch.
##########
connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala:
##########
@@ -65,9 +65,139 @@ class PostgresIntegrationSuite extends
DockerJDBCIntegrationV2Suite with V2JDBCT
|)
""".stripMargin
).executeUpdate()
- connection.prepareStatement(
- "CREATE TABLE datetime (name VARCHAR(32), date1 DATE, time1 TIMESTAMP)")
+
+ connection.prepareStatement("CREATE TABLE array_test_table (int_array
int[]," +
+ "float_array FLOAT8[], timestamp_array TIMESTAMP[], string_array
TEXT[]," +
+ "datetime_array TIMESTAMPTZ[], array_of_int_arrays
INT[][])").executeUpdate()
+
+ val query =
+ """
+ INSERT INTO array_test_table
+ (int_array, float_array, timestamp_array, string_array,
+ datetime_array, array_of_int_arrays)
+ VALUES
+ (
+ ARRAY[1, 2, 3], -- Array of integers
+ ARRAY[1.1, 2.2, 3.3], -- Array of floats
+ ARRAY['2023-01-01 12:00'::timestamp, '2023-06-01
08:30'::timestamp],
+ ARRAY['hello', 'world'], -- Array of strings
+ ARRAY['2023-10-04 12:00:00+00'::timestamptz,
+ '2023-12-01 14:15:00+00'::timestamptz],
+ ARRAY[ARRAY[1, 2]] -- Array of arrays of integers
+ ),
+ (
+ ARRAY[10, 20, 30], -- Another set of data
+ ARRAY[10.5, 20.5, 30.5],
+ ARRAY['2022-01-01 09:15'::timestamp, '2022-03-15
07:45'::timestamp],
+ ARRAY['postgres', 'arrays'],
+ ARRAY['2022-11-22 09:00:00+00'::timestamptz,
+ '2022-12-31 23:59:59+00'::timestamptz],
+ ARRAY[ARRAY[10, 20]]
+ );
+ """
+ connection.prepareStatement(query).executeUpdate()
+
+ connection.prepareStatement("CREATE TABLE array_int (col
int[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_bigint(col
bigint[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_smallint (col
smallint[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_boolean (col
boolean[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_float (col
real[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_double (col
float8[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_timestamp (col
timestamp[])").executeUpdate()
+ connection.prepareStatement("CREATE TABLE array_timestamptz (col
timestamptz[])")
+ .executeUpdate()
+
+ connection.prepareStatement("INSERT INTO array_int VALUES
(array[array[10]])").executeUpdate()
+ connection.prepareStatement("INSERT INTO array_bigint VALUES
(array[array[10]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_smallint VALUES
(array[array[10]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_boolean VALUES
(array[array[true]])")
.executeUpdate()
+ connection.prepareStatement("INSERT INTO array_float VALUES
(array[array[10.5]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_double VALUES
(array[array[10.1]])")
+ .executeUpdate()
+ connection.prepareStatement("INSERT INTO array_timestamp VALUES (" +
+ "array[array['2022-01-01 09:15'::timestamp]])").executeUpdate()
+ connection.prepareStatement("INSERT INTO array_timestamptz VALUES " +
+ "(array[array['2022-01-01 09:15'::timestamptz]])").executeUpdate()
+ connection.prepareStatement(
+ "CREATE TABLE datetime (name VARCHAR(32), date1 DATE, time1 TIMESTAMP)")
+ .executeUpdate()
+ }
+
+ test("Test multi-dimensional column types") {
+ val df = spark.read.format("jdbc")
+ .option("url", jdbcUrl)
+ .option("dbtable", "array_test_table")
+ .load()
+ df.collect()
Review Comment:
What does this test actually verify? `df.collect()` alone only checks that the read does not throw. Please assert on the collected result (e.g. compare with `checkAnswer` or an expected row set).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]