dongjoon-hyun commented on code in PR #52788:
URL: https://github.com/apache/spark/pull/52788#discussion_r2478929806


##########
connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala:
##########
@@ -167,3 +172,88 @@ private[kafka010] object KafkaOffsetReader extends Logging 
{
     }
   }
 }
+
+private[kafka010] abstract class KafkaOffsetReaderBase extends 
KafkaOffsetReader with Logging {
+  protected val rangeCalculator: KafkaOffsetRangeCalculator
+
+  private def getSortedExecutorList: Array[String] = {
+    def compare(a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): 
Boolean = {
+      if (a.host == b.host) {
+        a.executorId > b.executorId
+      } else {
+        a.host > b.host
+      }
+    }
+
+    val bm = SparkEnv.get.blockManager
+    bm.master.getPeers(bm.blockManagerId).toArray
+      .map(x => ExecutorCacheTaskLocation(x.host, x.executorId))
+      .sortWith(compare)
+      .map(_.toString)
+  }
+
+  override def getOffsetRangesFromResolvedOffsets(
+     fromPartitionOffsets: PartitionOffsetMap,
+     untilPartitionOffsets: PartitionOffsetMap,
+     reportDataLoss: (String, () => Throwable) => Unit): Seq[KafkaOffsetRange] 
= {
+    // Find the new partitions, and get their earliest offsets
+    val newPartitions = 
untilPartitionOffsets.keySet.diff(fromPartitionOffsets.keySet)
+    val newPartitionInitialOffsets = fetchEarliestOffsets(newPartitions.toSeq)
+    if (newPartitionInitialOffsets.keySet != newPartitions) {
+      // We cannot get from offsets for some partitions. It means they got 
deleted.
+      val deletedPartitions = 
newPartitions.diff(newPartitionInitialOffsets.keySet)
+      reportDataLoss(
+        s"Cannot find earliest offsets of ${deletedPartitions}. Some data may 
have been missed",
+        () =>
+          
KafkaExceptions.initialOffsetNotFoundForPartitions(deletedPartitions))
+    }
+    logInfo(log"Partitions added: ${MDC(TOPIC_PARTITION_OFFSET, 
newPartitionInitialOffsets)}")
+    newPartitionInitialOffsets.filter(_._2 != 0).foreach { case (p, o) =>
+      reportDataLoss(
+        s"Added partition $p starts from $o instead of 0. Some data may have 
been missed",
+        () => KafkaExceptions.addedPartitionDoesNotStartFromZero(p, o))
+    }
+
+    val deletedPartitions = 
fromPartitionOffsets.keySet.diff(untilPartitionOffsets.keySet)
+    if (deletedPartitions.nonEmpty) {
+      val (message, config) =
+        if (driverKafkaParams.containsKey(ConsumerConfig.GROUP_ID_CONFIG)) {
+          (s"$deletedPartitions are 
gone.${KafkaSourceProvider.CUSTOM_GROUP_ID_ERROR_MESSAGE}",
+            Some(ConsumerConfig.GROUP_ID_CONFIG))
+        } else {
+          (s"$deletedPartitions are gone. Some data may have been missed.", 
None)
+        }
+
+      reportDataLoss(
+        message,
+        () =>
+          KafkaExceptions.partitionsDeleted(deletedPartitions, config))

Review Comment:
   Shall we merge this into a one-liner like the following?
   ```
   () => KafkaExceptions.partitionsDeleted(deletedPartitions, config))
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to