Hisoka-X commented on code in PR #42802:
URL: https://github.com/apache/spark/pull/42802#discussion_r1314901375


##########
sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala:
##########
@@ -1758,169 +1095,6 @@ class InsertSuite extends DataSourceTest with 
SharedSparkSession {
     }
   }
 
-  test("INSERT rows, ALTER TABLE ADD COLUMNS with DEFAULTs, then SELECT them") 
{
-    case class Config(
-        sqlConf: Option[(String, String)],
-        useDataFrames: Boolean = false)
-    def runTest(dataSource: String, config: Config): Unit = {
-      def insertIntoT(): Unit = {
-        sql("insert into t(a, i) values('xyz', 42)")
-      }
-      def withTableT(f: => Unit): Unit = {
-        sql(s"create table t(a string, i int) using $dataSource")
-        insertIntoT
-        withTable("t") { f }
-      }
-      // Positive tests:
-      // Adding a column with a valid default value into a table containing 
existing data works
-      // successfully. Querying data from the altered table returns the new 
value.
-      withTableT {
-        sql("alter table t add column (s string default concat('abc', 'def'))")
-        checkAnswer(spark.table("t"), Row("xyz", 42, "abcdef"))
-        checkAnswer(sql("select i, s from t"), Row(42, "abcdef"))
-        // Now alter the column to change the default value. This still 
returns the previous value,
-        // not the new value, since the behavior semantics are the same as if 
the first command had
-        // performed a backfill of the new default value in the existing rows.
-        sql("alter table t alter column s set default concat('ghi', 'jkl')")
-        checkAnswer(sql("select i, s from t"), Row(42, "abcdef"))
-      }
-      // Adding a column with a default value and then inserting explicit NULL 
values works.
-      // Querying data back from the table differentiates between the explicit 
NULL values and
-      // default values.
-      withTableT {
-        sql("alter table t add column (s string default concat('abc', 'def'))")
-        if (config.useDataFrames) {
-          Seq((null, null, null)).toDF.write.insertInto("t")
-        } else {
-          sql("insert into t values(null, null, null)")
-        }
-        sql("alter table t add column (x boolean default true)")
-        val insertedSColumn = null
-        checkAnswer(spark.table("t"),
-          Seq(
-            Row("xyz", 42, "abcdef", true),
-            Row(null, null, insertedSColumn, true)))
-        checkAnswer(sql("select i, s, x from t"),
-          Seq(
-            Row(42, "abcdef", true),
-            Row(null, insertedSColumn, true)))
-      }
-      // Adding two columns where only the first has a valid default value 
works successfully.
-      // Querying data from the altered table returns the default value as 
well as NULL for the
-      // second column.
-      withTableT {
-        sql("alter table t add column (s string default concat('abc', 'def'))")
-        sql("alter table t add column (x string)")
-        checkAnswer(spark.table("t"), Row("xyz", 42, "abcdef", null))
-        checkAnswer(sql("select i, s, x from t"), Row(42, "abcdef", null))
-      }
-      // Test other supported data types.
-      withTableT {
-        sql("alter table t add columns (" +
-          "s boolean default true, " +
-          "t byte default cast(null as byte), " +
-          "u short default cast(42 as short), " +
-          "v float default 0, " +
-          "w double default 0, " +
-          "x date default cast('2021-01-02' as date), " +
-          "y timestamp default cast('2021-01-02 01:01:01' as timestamp), " +
-          "z timestamp_ntz default cast('2021-01-02 01:01:01' as 
timestamp_ntz), " +
-          "a1 timestamp_ltz default cast('2021-01-02 01:01:01' as 
timestamp_ltz), " +
-          "a2 decimal(5, 2) default 123.45," +
-          "a3 bigint default 43," +
-          "a4 smallint default cast(5 as smallint)," +
-          "a5 tinyint default cast(6 as tinyint))")
-        insertIntoT()
-        // Manually inspect the result row values rather than using the 
'checkAnswer' helper method
-        // in order to ensure the values' correctness while avoiding minor 
type incompatibilities.
-        val result: Array[Row] =
-          sql("select s, t, u, v, w, x, y, z, a1, a2, a3, a4, a5 from 
t").collect()
-        for (row <- result) {
-          assert(row.length == 13)
-          assert(row(0) == true)
-          assert(row(1) == null)
-          assert(row(2) == 42)
-          assert(row(3) == 0.0f)
-          assert(row(4) == 0.0d)
-          assert(row(5).toString == "2021-01-02")
-          assert(row(6).toString == "2021-01-02 01:01:01.0")
-          assert(row(7).toString.startsWith("2021-01-02"))
-          assert(row(8).toString == "2021-01-02 01:01:01.0")
-          assert(row(9).toString == "123.45")
-          assert(row(10) == 43L)
-          assert(row(11) == 5)
-          assert(row(12) == 6)
-        }
-      }
-    }
-
-    // This represents one test configuration over a data source.
-    case class TestCase(
-        dataSource: String,
-        configs: Seq[Config])
-    // Run the test several times using each configuration.
-    Seq(
-      TestCase(
-        dataSource = "csv",
-        Seq(
-          Config(
-            None),
-          Config(
-            Some(SQLConf.CSV_PARSER_COLUMN_PRUNING.key -> "false")))),
-      TestCase(
-        dataSource = "json",
-        Seq(
-          Config(
-            None),
-          Config(
-            Some(SQLConf.JSON_GENERATOR_IGNORE_NULL_FIELDS.key -> "false")))),
-      TestCase(
-        dataSource = "orc",
-        Seq(
-          Config(
-            None),
-          Config(
-            Some(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false")))),
-      TestCase(
-        dataSource = "parquet",
-        Seq(
-          Config(
-            None),
-          Config(
-            Some(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false"))))
-    ).foreach { testCase: TestCase =>
-      testCase.configs.foreach { config: Config =>
-        // Run the test twice, once using SQL for the INSERT operations and 
again using DataFrames.
-        for (useDataFrames <- Seq(false, true)) {
-          config.sqlConf.map { kv: (String, String) =>
-            withSQLConf(kv) {
-              // Run the test with the pair of custom SQLConf values.
-              runTest(testCase.dataSource, config.copy(useDataFrames = 
useDataFrames))
-            }
-          }.getOrElse {
-            // Run the test with default settings.
-            runTest(testCase.dataSource, config.copy(useDataFrames = 
useDataFrames))
-          }
-        }
-      }
-    }
-  }
-
-  test("SPARK-39985 Enable implicit DEFAULT column values in inserts from 
DataFrames") {
-    // Negative test: explicit column "default" references are not supported 
in write operations
-    // from DataFrames: since the operators are resolved one-by-one, any 
.select referring to
-    // "default" generates a "column not found" error before any following 
.insertInto.
-    withTable("t") {
-      sql(s"create table t(a string, i int default 42) using parquet")
-      checkError(
-        exception = intercept[AnalysisException] {
-          Seq("xyz").toDF.select("value", "default").write.insertInto("t")
-        },
-        errorClass = "UNRESOLVED_COLUMN.WITH_SUGGESTION",
-        parameters = Map("objectName" -> "`default`", "proposal" -> "`value`"))
-    }
-  }
-
   test("SPARK-40001 JSON DEFAULT columns = 
JSON_GENERATOR_WRITE_NULL_IF_WITH_DEFAULT_VALUE off") {

Review Comment:
   JSON, Array, Struct, and Map default values do not work correctly on DS V2. Fix it later.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to