maheshk114 commented on code in PR #45228:
URL: https://github.com/apache/spark/pull/45228#discussion_r1544017626
##########
core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala:
##########
@@ -398,8 +410,13 @@ final class ShuffleBlockFetcherIterator(
var pushMergedLocalBlockBytes = 0L
val prevNumBlocksToFetch = numBlocksToFetch
- val fallback = FallbackStorage.FALLBACK_BLOCK_MANAGER_ID.executorId
- val localExecIds = Set(blockManager.blockManagerId.executorId, fallback)
+ // Fallback to original implementation, if thread pool is not enabled.
+ val localExecIds = if
(FallbackStorage.getNumReadThreads(blockManager.conf) > 0) {
Review Comment:
The original code was written to read the local shuffle data in the same
thread to avoid any thread creation overhead, so I wanted to keep the same
behavior for local reads and use a thread pool for reads from external storage.
##########
core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala:
##########
@@ -92,6 +95,51 @@ private[storage] class FallbackStorage(conf: SparkConf)
extends Logging {
val hash = JavaUtils.nonNegativeHash(filename)
fallbackFileSystem.exists(new Path(fallbackPath,
s"$appId/$shuffleId/$hash/$filename"))
}
+
+ private val fetchThreadPool: Option[ThreadPoolExecutor] = {
+ val numShuffleThreads = FallbackStorage.getNumReadThreads(conf)
+ if (numShuffleThreads > 0) {
+ logInfo(s"FallbackStorage created thread pool using
${numShuffleThreads} thread(s)")
+ Some(ThreadUtils.newDaemonCachedThreadPool(
+ "FetchFromFallbackStorage-threadPool", numShuffleThreads))
+ } else {
+ logInfo("FallbackStorage thread pool not created")
+ None
+ }
+ }
+
+ def fetchBlocks(
+ blockManager: BlockManager,
+ blocks: collection.Seq[FetchBlockInfo],
+ address: BlockManagerId,
+ listener: BlockFetchingListener): Unit = {
+ fetchThreadPool match {
+ case Some(p) if !p.isShutdown =>
+ blocks.foreach(block =>
+ p.submit(new Runnable {
+ override def run(): Unit = {
+ fetchShuffleBlocks(block, blockManager, listener)
+ }
+ })
+ )
+ case _ =>
+ logInfo(s" fetchThreadPool does not exists for $address or shutdown")
+ blocks.foreach(block => fetchShuffleBlocks(block, blockManager,
listener))
+ }
+ }
+
+ private def fetchShuffleBlocks(
+ block: FetchBlockInfo,
+ blockManager: BlockManager,
Review Comment:
In my first commit, the logic was to read from local disk first and try
external storage only in case of an IO exception. I removed that logic but forgot
to remove the now-unused `blockManager` parameter. I will remove it.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]