soarez commented on code in PR #15136: URL: https://github.com/apache/kafka/pull/15136#discussion_r1511125756
########## core/src/main/scala/kafka/server/ReplicaManager.scala: ########## @@ -2790,7 +2791,6 @@ class ReplicaManager(val config: KafkaConfig, // is unavailable. This is required to ensure that we include the partition's // high watermark in the checkpoint file (see KAFKA-1647). val state = info.partition.toLeaderAndIsrPartitionState(tp, isNew) - val partitionAssignedDirectoryId = directoryIds.find(_._1.topicPartition() == tp).map(_._2) Review Comment: It seems the changes in this file are no longer necessary? ########## core/src/main/scala/kafka/log/LogManager.scala: ########## @@ -1173,6 +1173,35 @@ class LogManager(logDirs: Seq[File], } } + def recoverAbandonedFutureLogs(brokerId: Int, newTopicsImage: TopicsImage): Unit = { + val abandonedFutureLogs = findAbandonedFutureLogs(brokerId, newTopicsImage) + abandonedFutureLogs.foreach { log => + val tp = log.topicPartition + + log.renameDir(UnifiedLog.logDirName(tp), shouldReinitialize = true) + log.removeLogMetrics() + futureLogs.remove(tp) + + currentLogs.put(tp, log) + log.newMetrics() + + info(s"Successfully renamed abandoned future log for $tp") + } + } + + private def findAbandonedFutureLogs(brokerId: Int, newTopicsImage: TopicsImage): Iterable[UnifiedLog] = { + futureLogs.values.flatMap { log => + val topicId = log.topicId.getOrElse { + throw new RuntimeException(s"The log dir $log does not have a topic ID, " + + "which is not allowed when running in KRaft mode.") + } + val partitionId = log.topicPartition.partition() + Option(newTopicsImage.getPartition(topicId, partitionId)) + .filter(pr => directoryId(log.parentDir).contains(pr.directory(brokerId))) + .map(_ => log) Review Comment: 1. Do we have a guarantee that the topic will be in the new topics image? Can't the topic be in a later metadata delta if a compaction hasn't yet occurred? 2. Shouldn't we check if the main replica is on one of the other directories? 
If the main replica is on an offline dir (as in the scenario described in the issue), the broker will refuse to start once it is restarted with that directory online, as it will see two main replicas. If the log directory for the main replica is online, we should be able to detect that here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: jira-unsubscr...@kafka.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org