dongjoon-hyun commented on code in PR #52788:
URL: https://github.com/apache/spark/pull/52788#discussion_r2478929806
##########
connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala:
##########
@@ -167,3 +172,88 @@ private[kafka010] object KafkaOffsetReader extends Logging
{
}
}
}
+
+private[kafka010] abstract class KafkaOffsetReaderBase extends
KafkaOffsetReader with Logging {
+ protected val rangeCalculator: KafkaOffsetRangeCalculator
+
+ private def getSortedExecutorList: Array[String] = {
+ def compare(a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation):
Boolean = {
+ if (a.host == b.host) {
+ a.executorId > b.executorId
+ } else {
+ a.host > b.host
+ }
+ }
+
+ val bm = SparkEnv.get.blockManager
+ bm.master.getPeers(bm.blockManagerId).toArray
+ .map(x => ExecutorCacheTaskLocation(x.host, x.executorId))
+ .sortWith(compare)
+ .map(_.toString)
+ }
+
+ override def getOffsetRangesFromResolvedOffsets(
+ fromPartitionOffsets: PartitionOffsetMap,
+ untilPartitionOffsets: PartitionOffsetMap,
+ reportDataLoss: (String, () => Throwable) => Unit): Seq[KafkaOffsetRange]
= {
+ // Find the new partitions, and get their earliest offsets
+ val newPartitions =
untilPartitionOffsets.keySet.diff(fromPartitionOffsets.keySet)
+ val newPartitionInitialOffsets = fetchEarliestOffsets(newPartitions.toSeq)
+ if (newPartitionInitialOffsets.keySet != newPartitions) {
+ // We cannot get from offsets for some partitions. It means they got
deleted.
+ val deletedPartitions =
newPartitions.diff(newPartitionInitialOffsets.keySet)
+ reportDataLoss(
+ s"Cannot find earliest offsets of ${deletedPartitions}. Some data may
have been missed",
+ () =>
+
KafkaExceptions.initialOffsetNotFoundForPartitions(deletedPartitions))
+ }
+ logInfo(log"Partitions added: ${MDC(TOPIC_PARTITION_OFFSET,
newPartitionInitialOffsets)}")
+ newPartitionInitialOffsets.filter(_._2 != 0).foreach { case (p, o) =>
+ reportDataLoss(
+ s"Added partition $p starts from $o instead of 0. Some data may have
been missed",
+ () => KafkaExceptions.addedPartitionDoesNotStartFromZero(p, o))
+ }
+
+ val deletedPartitions =
fromPartitionOffsets.keySet.diff(untilPartitionOffsets.keySet)
+ if (deletedPartitions.nonEmpty) {
+ val (message, config) =
+ if (driverKafkaParams.containsKey(ConsumerConfig.GROUP_ID_CONFIG)) {
+ (s"$deletedPartitions are
gone.${KafkaSourceProvider.CUSTOM_GROUP_ID_ERROR_MESSAGE}",
+ Some(ConsumerConfig.GROUP_ID_CONFIG))
+ } else {
+ (s"$deletedPartitions are gone. Some data may have been missed.",
None)
+ }
+
+ reportDataLoss(
+ message,
+ () =>
+ KafkaExceptions.partitionsDeleted(deletedPartitions, config))
Review Comment:
Shall we merge this into a one-liner like the following?
```
() => KafkaExceptions.partitionsDeleted(deletedPartitions, config))
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]