Re: [PR] [SPARK-56870][SDP] Implement SCD1 Batch Processor; Extend Microbatch with CDC Metadata [spark]

via GitHub Tue, 19 May 2026 16:07:55 -0700


AnishMahto commented on code in PR #55970:
URL: https://github.com/apache/spark/pull/55970#discussion_r3270220391



##########
sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala:
##########
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.pipelines.autocdc
+
+import org.apache.spark.SparkException
+import org.apache.spark.sql.{functions => F, AnalysisException}
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.catalyst.util.QuotingUtils
+import org.apache.spark.sql.classic.DataFrame
+import org.apache.spark.sql.types.{DataType, StructField, StructType}
+import org.apache.spark.util.ArrayImplicits._
+
+/**
+ * Per-microbatch processor for SCD Type 1 AutoCDC flows, complying with the specified
+ * [[changeArgs]] configuration.
+ *
+ * Exposes two DataFrame-to-DataFrame steps: [[deduplicateMicrobatch]], which keeps one row per
+ * key, and [[extendMicrobatchRowsWithCdcMetadata]], which appends the CDC metadata column.
+ *
+ * @param changeArgs The CDC flow configuration.
+ * @param resolvedSequencingType The post-analysis [[DataType]] of the sequencing column, derived
+ *                               from the flow's resolved DataFrame at flow setup time.
+ */
+case class Scd1BatchProcessor(
+    changeArgs: ChangeArgs,
+    resolvedSequencingType: DataType) {
+
+  /**
+   * Deduplicate the incoming CDC microbatch by key, keeping the most recent event per key
+   * as ordered by [[ChangeArgs.sequencing]].
+   *
+   * For SCD1 we only care about the most recent (by sequence value) event per key. When
+   * multiple events share the same key and the same sequence value, the row selected is
+   * non-deterministic and undefined.
+   *
+   * The schema of the returned dataframe matches the schema of the microbatch exactly.
+   *
+   * @param microbatchDf The CDC microbatch to deduplicate.
+   * @return A dataframe with one row per distinct combination of [[ChangeArgs.keys]], carrying
+   *         the columns of the row selected by `max_by` over [[ChangeArgs.sequencing]].
+   */
+  def deduplicateMicrobatch(microbatchDf: DataFrame): DataFrame = {
+    // The `max_by` API can only return a single column, so pack/unpack the 
entire row into a
+    // temporary column before and after the `max_by` operation.
+    val winningRowCol = OutOfOrderCdcMergeUtils.tempColName("__winning_row")
+
+    // Reference every microbatch column through its quoted identifier so the whole row can be
+    // packed into a single struct below.
+    val allMicrobatchColumns =
+      microbatchDf.columns
+        .map(colName => F.col(QuotingUtils.quoteIdentifier(colName)))
+        .toImmutableArraySeq
+
+    // Group by the configured keys, pick the packed row with the greatest sequencing value,
+    // then expand the struct back into top-level columns.
+    microbatchDf
+      .groupBy(changeArgs.keys.map(k => F.col(k.quoted)): _*)
+      .agg(
+        F.max_by(F.struct(allMicrobatchColumns: _*), changeArgs.sequencing)
+          .as(winningRowCol)
+      )
+      .select(F.col(s"$winningRowCol.*"))
+  }
+
+  /**
+   * Project the CDC metadata column onto the microbatch.
+   *
+   * The returned dataframe has all of the columns in the input microbatch + the CDC metadata
+   * column.
+   *
+   * For each row, the delete sequence is the sequencing value when the configured delete
+   * condition holds and null otherwise; the upsert sequence is the sequencing value exactly
+   * when the delete sequence is null.
+   *
+   * @param microbatchDf The CDC microbatch to extend.
+   * @return The microbatch with the column named `Scd1BatchProcessor.cdcMetadataColName` added.
+   * @throws AnalysisException if a field of the microbatch schema already resolves to the
+   *                           reserved CDC metadata column name.
+   */
+  def extendMicrobatchRowsWithCdcMetadata(microbatchDf: DataFrame): DataFrame 
= {
+    // Proactively validate the reserved CDC metadata column does not exist in 
the microbatch.
+    validateCdcMetadataColumnNotPresent(microbatchDf)
+
+    // Sequencing value for rows matching the delete condition; null for every other row, and
+    // null for all rows when no delete condition is configured.
+    val rowDeleteSequence: Column = changeArgs.deleteCondition match {
+      case Some(deleteCondition) =>
+        F.when(deleteCondition, changeArgs.sequencing).otherwise(F.lit(null))
+      case None =>
+        F.lit(null)
+    }
+
+    val rowUpsertSequence: Column =
+      // A row that is not a delete must be an upsert, these are mutually 
exclusive and a complete
+      // set of CDC event types.
+      F.when(rowDeleteSequence.isNull, 
changeArgs.sequencing).otherwise(F.lit(null))
+
+    microbatchDf.withColumn(
+      Scd1BatchProcessor.cdcMetadataColName,
+      Scd1BatchProcessor.constructCdcMetadataCol(
+        deleteSequence = rowDeleteSequence,
+        upsertSequence = rowUpsertSequence,
+        sequencingType = resolvedSequencingType
+      )
+    )
+  }
+
+  /**
+   * Fail if the microbatch schema already contains a field whose name resolves to the reserved
+   * CDC metadata column name, as judged by the session's configured resolver.
+   *
+   * Only the first conflicting field found is reported.
+   *
+   * @param microbatchDf The microbatch whose schema is checked.
+   * @throws AnalysisException with error class `AUTOCDC_RESERVED_COLUMN_NAME_CONFLICT` when a
+   *                           conflicting field name is found.
+   */
+  private def validateCdcMetadataColumnNotPresent(microbatchDf: DataFrame): 
Unit = {
+    val sqlConf = microbatchDf.sparkSession.sessionState.conf
+    val resolver = sqlConf.resolver
+
+    microbatchDf.schema.fieldNames
+      .find(resolver(_, Scd1BatchProcessor.cdcMetadataColName))
+      .foreach { conflictingColumnName =>
+        throw new AnalysisException(
+          errorClass = "AUTOCDC_RESERVED_COLUMN_NAME_CONFLICT",
+          messageParameters = Map(
+            // NOTE(review): the flag passed is the negation of `caseSensitiveAnalysis`;
+            // presumably `CaseSensitivityLabels.of` expects an "is case-insensitive" flag.
+            // TODO confirm against its definition.
+            "caseSensitivity" -> 
CaseSensitivityLabels.of(!sqlConf.caseSensitiveAnalysis),
+            "columnName" -> conflictingColumnName,
+            "schemaName" -> "microbatch",
+            "reservedColumnName" -> Scd1BatchProcessor.cdcMetadataColName
+          )
+        )
+      }
+  }
+}
+
+object Scd1BatchProcessor {
+  private[autocdc] val cdcMetadataColName: String = "_cdc_metadata"
+
+  private[autocdc] val cdcDeleteSequenceFieldName: String = "deleteSequence"

Review Comment:
   Oops, PR description had some stale terminology. Updated both PR and ticket 
descriptions.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-56870][SDP] Implement SCD1 Batch Processor; Extend Microbatch with CDC Metadata [spark]

Reply via email to