This is an automated email from the ASF dual-hosted git repository. vhs pushed a commit to branch release-1.0.2 in repository https://gitbox.apache.org/repos/asf/hudi.git
commit 67827dfe1b1fb745d5a2ab46eebe83661d1d431f Author: Y Ethan Guo <[email protected]> AuthorDate: Fri Apr 4 22:45:44 2025 -0700 [MINOR] Update javadocs in MergeIntoHoodieTableCommand (#13093) (cherry picked from commit 048c03123956388773f3a529e59494e30c75442f) --- .../hudi/command/MergeIntoHoodieTableCommand.scala | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala index 2551f7cbcf9..f6221abe849 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala @@ -291,8 +291,20 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Hoodie } /** - * Here we're adjusting incoming (source) dataset in case its schema is divergent from - * the target table, to make sure it (at a bare minimum) + * Here we're processing the logical plan of the source table and optionally the target + * table to get it prepared for writing the data into the Hudi table: + * <ul> + * <li> For a target table with record key(s) configured, the source table + * [[mergeInto.sourceTable]] is used. + * <li> For a primary keyless target table, the source table [[mergeInto.sourceTable]] + * and target table [[mergeInto.targetTable]] are left-outer joined based on the + * merge condition so that the record key stored in the record key meta column + * (`_hoodie_record_key`) is attached to the input records if they are updates. 
+ * </ul> + * + * After getting the initial logical plan to process as above, we're adjusting incoming + * (source) dataset in case its schema is divergent from the target table, to make sure + * it contains all the required columns for MERGE INTO (at a bare minimum) * * <ol> * <li>Contains "primary-key" column (as defined by target table's config)</li> @@ -335,7 +347,7 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Hoodie // for a Hudi table with record key, we use the source table and rely on Hudi's tagging // to identify inserts, updates, and deletes to avoid the join val inputPlan = if (!hasPrimaryKey()) { - // We want to join the source and target tables. + // For a primary keyless target table, join the source and target tables. // Then we want to project the output so that we have the meta columns from the target table // followed by the data columns of the source table val tableMetaCols = mergeInto.targetTable.output.filter(a => isMetaField(a.name)) @@ -343,6 +355,7 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Hoodie val incomingDataCols = joinData.output.filterNot(mergeInto.targetTable.outputSet.contains) Project(tableMetaCols ++ incomingDataCols, joinData) } else { + // For a target table with record key(s) configured, the source table is used mergeInto.sourceTable } @@ -782,6 +795,7 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Hoodie CANONICALIZE_SCHEMA.key -> "false", SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP.key -> "true", HoodieSparkSqlWriter.SQL_MERGE_INTO_WRITES.key -> "true", + // Only primary keyless table requires prepped keys and upsert HoodieWriteConfig.SPARK_SQL_MERGE_INTO_PREPPED_KEY -> isPrimaryKeylessTable.toString, HoodieWriteConfig.COMBINE_BEFORE_UPSERT.key() -> (!StringUtils.isNullOrEmpty(preCombineField)).toString )
