[
https://issues.apache.org/jira/browse/HUDI-8911?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Y Ethan Guo updated HUDI-8911:
------------------------------
Description:
The new tests `TestInsertTable`.`Test Insert Into with subset of columns` and
`Test Insert Into with subset of columns on Parquet table` fail on Spark 3.4
due to the validation introduced in HoodieSpark34CatalystPlanUtils in
[https://github.com/apache/hudi/pull/11568]. Before that validation was added,
INSERT INTO with a subset of columns used to work.
{code:java}
override def unapplyInsertIntoStatement(plan: LogicalPlan):
Option[(LogicalPlan, Seq[String], Map[String, Option[String]], LogicalPlan,
Boolean, Boolean)] = {
plan match {
case insert: InsertIntoStatement =>
// https://github.com/apache/spark/pull/36077
// first: in this PR, Spark 3.4 added default-value support for INSERT INTO and
// will regenerate the user-specified cols itself,
// so there is no need to handle them on the Hudi side
// second: this PR also appends the Hudi meta fields with default values,
// which has a bug that appears to be fixed
// in Spark 3.5 (https://github.com/apache/spark/pull/41262); so if the user
// wants to specify cols, the default-column feature must be disabled.
if (SQLConf.get.enableDefaultColumns) {
if (insert.userSpecifiedCols.nonEmpty) {
throw new AnalysisException("hudi not support specified cols when
enable default columns, " +
"please disable 'spark.sql.defaultColumn.enabled'")
}
Some((insert.table, Seq.empty, insert.partitionSpec, insert.query,
insert.overwrite, insert.ifPartitionNotExists))
} else {
Some((insert.table, insert.userSpecifiedCols, insert.partitionSpec,
insert.query, insert.overwrite, insert.ifPartitionNotExists))
}
case _ =>
None
}
} {code}
> Support INSERT SQL statement with a subset of columns in Spark 3.4
> ------------------------------------------------------------------
>
> Key: HUDI-8911
> URL: https://issues.apache.org/jira/browse/HUDI-8911
> Project: Apache Hudi
> Issue Type: Bug
> Reporter: Y Ethan Guo
> Priority: Critical
> Fix For: 1.0.2
>
>
> The new tests `TestInsertTable`.`Test Insert Into with subset of columns` and
> `Test Insert Into with subset of columns on Parquet table` fail on Spark 3.4
> due to the validation introduced in HoodieSpark34CatalystPlanUtils in
> [https://github.com/apache/hudi/pull/11568]. Before that validation was added,
> INSERT INTO with a subset of columns used to work.
>
> {code:java}
> override def unapplyInsertIntoStatement(plan: LogicalPlan):
> Option[(LogicalPlan, Seq[String], Map[String, Option[String]], LogicalPlan,
> Boolean, Boolean)] = {
> plan match {
> case insert: InsertIntoStatement =>
> // https://github.com/apache/spark/pull/36077
> // first: in this pr, spark34 support default value for insert into, it
> will regenerate the user specified cols
> // so, no need deal with it in hudi side
> // second: in this pr, it will append hoodie meta field with default
> value, has some bug, it look like be fixed
> // in spark35(https://github.com/apache/spark/pull/41262), so
> if user want specified cols, need disable default feature.
> if (SQLConf.get.enableDefaultColumns) {
> if (insert.userSpecifiedCols.nonEmpty) {
> throw new AnalysisException("hudi not support specified cols when
> enable default columns, " +
> "please disable 'spark.sql.defaultColumn.enabled'")
> }
> Some((insert.table, Seq.empty, insert.partitionSpec, insert.query,
> insert.overwrite, insert.ifPartitionNotExists))
> } else {
> Some((insert.table, insert.userSpecifiedCols, insert.partitionSpec,
> insert.query, insert.overwrite, insert.ifPartitionNotExists))
> }
> case _ =>
> None
> }
> } {code}
>
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)