attilapiros commented on a change in pull request #24554: [SPARK-27622][Core] 
Avoiding the network when block manager fetches disk persisted RDD blocks from 
the same host
URL: https://github.com/apache/spark/pull/24554#discussion_r286938419
 
 

 ##########
 File path: 
core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala
 ##########
 @@ -438,12 +439,27 @@ class BlockManagerMasterEndpoint(
     if (blockLocations.containsKey(blockId)) blockLocations.get(blockId).toSeq 
else Seq.empty
   }
 
-  private def getLocationsAndStatus(blockId: BlockId): 
Option[BlockLocationsAndStatus] = {
+  private def getLocationsAndStatus(
+      blockId: BlockId, requesterHost: String): 
Option[BlockLocationsAndStatus] = {
     val locations = 
Option(blockLocations.get(blockId)).map(_.toSeq).getOrElse(Seq.empty)
     val status = locations.headOption.flatMap { bmId => 
blockManagerInfo(bmId).getStatus(blockId) }
 
     if (locations.nonEmpty && status.isDefined) {
-      Some(BlockLocationsAndStatus(locations, status.get))
+      val bmIdToLocalDirs = if (status.get.storageLevel.useDisk) {
+        locations
+          .find(_.host == requesterHost)
+          .map { sameHostBlockId =>
+            val bmInfo = blockManagerInfo(sameHostBlockId)
+            bmInfo.blockManagerId -> bmInfo.localDirs
+          }
+      } else {
+        None
+      }
+      bmIdToLocalDirs.map { case (bmId, localDirs) =>
+        BlockLocationsAndStatus(locations.filter(_ != bmId), status.get, 
Some(localDirs))
 
 Review comment:
   I was thinking about this case and you are right: it is better not to filter 
out this one item from the locations. Although this method is only used by one 
RPC call, `GetLocationsAndStatus`, which is only used by 
`BlockManager#getRemoteManagedBuffer`, where the `localDirs` are used for 
accessing the block content, if that fails there is still some chance that 
the block could be loaded from Memory (if the block was in Memory, then spilled 
to Disk, then promoted to Memory later on, like Imran mentioned in #24499).
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to