This is an automated email from the ASF dual-hosted git repository.

vhs pushed a commit to branch release-1.0.2
in repository https://gitbox.apache.org/repos/asf/hudi.git

commit 67827dfe1b1fb745d5a2ab46eebe83661d1d431f
Author: Y Ethan Guo <[email protected]>
AuthorDate: Fri Apr 4 22:45:44 2025 -0700

    [MINOR] Update javadocs in MergeIntoHoodieTableCommand (#13093)
    
    (cherry picked from commit 048c03123956388773f3a529e59494e30c75442f)
---
 .../hudi/command/MergeIntoHoodieTableCommand.scala   | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala
 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala
index 2551f7cbcf9..f6221abe849 100644
--- 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala
+++ 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala
@@ -291,8 +291,20 @@ case class MergeIntoHoodieTableCommand(mergeInto: 
MergeIntoTable) extends Hoodie
   }
 
   /**
-   * Here we're adjusting incoming (source) dataset in case its schema is 
divergent from
-   * the target table, to make sure it (at a bare minimum)
+   * Here we're processing the logical plan of the source table and optionally 
the target
+   * table to get it prepared for writing the data into the Hudi table:
+   * <ul>
+   * <li> For a target table with record key(s) configured, the source table
+   * [[mergeInto.sourceTable]] is used.
+   * <li> For a primary keyless target table, the source table 
[[mergeInto.sourceTable]]
+   * and target table [[mergeInto.targetTable]] are left-outer joined based 
on the
+   * merge condition so that the record key stored in the record key meta 
column
+   * (`_hoodie_record_key`) is attached to the input records if they are 
updates.
+   * </ul>
+   *
+   * After getting the initial logical plan to process as above, we're 
adjusting the incoming
+   * (source) dataset in case its schema is divergent from the target table, 
to make sure
+   * it contains all the required columns for MERGE INTO (at a bare minimum)
    *
    * <ol>
    *   <li>Contains "primary-key" column (as defined by target table's 
config)</li>
@@ -335,7 +347,7 @@ case class MergeIntoHoodieTableCommand(mergeInto: 
MergeIntoTable) extends Hoodie
     // for a Hudi table with record key, we use the source table and rely on 
Hudi's tagging
     // to identify inserts, updates, and deletes to avoid the join
     val inputPlan = if (!hasPrimaryKey()) {
-      // We want to join the source and target tables.
+      // For a primary keyless target table, join the source and target tables.
       // Then we want to project the output so that we have the meta columns 
from the target table
       // followed by the data columns of the source table
       val tableMetaCols = mergeInto.targetTable.output.filter(a => 
isMetaField(a.name))
@@ -343,6 +355,7 @@ case class MergeIntoHoodieTableCommand(mergeInto: 
MergeIntoTable) extends Hoodie
       val incomingDataCols = 
joinData.output.filterNot(mergeInto.targetTable.outputSet.contains)
       Project(tableMetaCols ++ incomingDataCols, joinData)
     } else {
+      // For a target table with record key(s) configured, the source table is 
used
       mergeInto.sourceTable
     }
 
@@ -782,6 +795,7 @@ case class MergeIntoHoodieTableCommand(mergeInto: 
MergeIntoTable) extends Hoodie
       CANONICALIZE_SCHEMA.key -> "false",
       SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP.key -> "true",
       HoodieSparkSqlWriter.SQL_MERGE_INTO_WRITES.key -> "true",
+      // Only primary keyless table requires prepped keys and upsert
       HoodieWriteConfig.SPARK_SQL_MERGE_INTO_PREPPED_KEY -> 
isPrimaryKeylessTable.toString,
       HoodieWriteConfig.COMBINE_BEFORE_UPSERT.key() -> 
(!StringUtils.isNullOrEmpty(preCombineField)).toString
     )

Reply via email to