[GitHub] [spark] cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group

2020-03-04 Thread GitBox
cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] 
ShuffleBlockFetcherIterator should always create request for last block group
URL: https://github.com/apache/spark/pull/27786#discussion_r387717872
 
 

 ##
 File path: 
core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
 ##
 @@ -341,32 +341,86 @@ class ShuffleBlockFetcherIteratorSuite extends 
SparkFunSuite with PrivateMethodT
 assert(blockManager.hostLocalDirManager.get.getCachedHostLocalDirs().size 
=== 1)
   }
 
-  test("fetch continuous blocks in batch respects maxSize and maxBlocks") {
+  test("fetch continuous blocks in batch should respect maxBytesInFlight") {
 val blockManager = mock(classOf[BlockManager])
 val localBmId = BlockManagerId("test-client", "test-local-host", 1)
 doReturn(localBmId).when(blockManager).blockManagerId
 
 // Make sure remote blocks would return the merged block
-val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2)
-val remoteBlocks = Seq[BlockId](
+val remoteBmId1 = BlockManagerId("test-client-1", "test-client-1", 1)
+val remoteBmId2 = BlockManagerId("test-client-2", "test-client-2", 2)
+val remoteBlocks1 = (0 until 15).map(ShuffleBlockId(0, 3, _))
+val remoteBlocks2 = Seq[BlockId](ShuffleBlockId(0, 4, 0), 
ShuffleBlockId(0, 4, 1))
+val mergedRemoteBlocks = Map[BlockId, ManagedBuffer](
+  ShuffleBlockBatchId(0, 3, 0, 3) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 3, 6) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 6, 9) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 9, 12) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 12, 15) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 4, 0, 2) -> createMockManagedBuffer())
+val transfer = createMockTransfer(mergedRemoteBlocks)
+
+val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])](
+  (remoteBmId1, remoteBlocks1.map(blockId => (blockId, 100L, 1))),
+  (remoteBmId2, remoteBlocks2.map(blockId => (blockId, 100L, 
1)))).toIterator
+
+val taskContext = TaskContext.empty()
+val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics()
+val iterator = new ShuffleBlockFetcherIterator(
+  taskContext,
+  transfer,
+  blockManager,
+  blocksByAddress,
+  (_, in) => in,
+  1500,
+  Int.MaxValue,
+  Int.MaxValue,
+  Int.MaxValue,
+  true,
+  false,
+  metrics,
+  true)
+
+var numResults = 0
+// After initialize(), there will be 6 FetchRequests. And each of the 
first 5 requests
+// includes 1 merged block which is merged from 3 shuffle blocks. The last 
request has 1 merged
+// block which merged from 2 shuffle blocks. So, only the first 5 
requests(5 * 3 * 100 >= 1500)
+// can be sent. The second FetchRequest will hit 
maxBlocksInFlightPerAddress so it won't
 
 Review comment:
   `The second` -> `The 6th`?


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group

2020-03-04 Thread GitBox
cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] 
ShuffleBlockFetcherIterator should always create request for last block group
URL: https://github.com/apache/spark/pull/27786#discussion_r387570436
 
 

 ##
 File path: 
core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
 ##
 @@ -341,32 +341,84 @@ class ShuffleBlockFetcherIteratorSuite extends 
SparkFunSuite with PrivateMethodT
 assert(blockManager.hostLocalDirManager.get.getCachedHostLocalDirs().size 
=== 1)
   }
 
-  test("fetch continuous blocks in batch respects maxSize and maxBlocks") {
+  test("fetch continuous blocks in batch should respect maxBytesInFlight") {
 val blockManager = mock(classOf[BlockManager])
 val localBmId = BlockManagerId("test-client", "test-local-host", 1)
 doReturn(localBmId).when(blockManager).blockManagerId
 
 // Make sure remote blocks would return the merged block
-val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2)
-val remoteBlocks = Seq[BlockId](
+val remoteBmId1 = BlockManagerId("test-client-1", "test-client-1", 1)
+val remoteBmId2 = BlockManagerId("test-client-2", "test-client-2", 2)
+val remoteBlocks1 = (0 until 15).map(ShuffleBlockId(0, 3, _))
+val remoteBlocks2 = Seq[BlockId](ShuffleBlockId(0, 4, 0), 
ShuffleBlockId(0, 4, 1))
+val mergedRemoteBlocks = Map[BlockId, ManagedBuffer](
+  ShuffleBlockBatchId(0, 3, 0, 3) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 3, 6) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 6, 9) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 9, 12) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 12, 15) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 4, 0, 2) -> createMockManagedBuffer())
+val transfer = createMockTransfer(mergedRemoteBlocks)
+
+val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])](
+  (remoteBmId1, remoteBlocks1.map(blockId => (blockId, 100L, 1))),
+  (remoteBmId2, remoteBlocks2.map(blockId => (blockId, 100L, 
1)))).toIterator
+
+val taskContext = TaskContext.empty()
+val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics()
+val iterator = new ShuffleBlockFetcherIterator(
+  taskContext,
+  transfer,
+  blockManager,
+  blocksByAddress,
+  (_, in) => in,
+  1500,
+  Int.MaxValue,
+  Int.MaxValue,
+  Int.MaxValue,
+  true,
+  false,
+  metrics,
+  true)
+
+var numResults = 0
+// After initialize(), there will be 6 FetchRequests, and the each of the 
first 5
+// includes 3 merged blocks and the last one has 1 merged block. So, only 
the
 
 Review comment:
   OK, let's update the comment.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group

2020-03-04 Thread GitBox
cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] 
ShuffleBlockFetcherIterator should always create request for last block group
URL: https://github.com/apache/spark/pull/27786#discussion_r387524638
 
 

 ##
 File path: 
core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
 ##
 @@ -341,32 +341,84 @@ class ShuffleBlockFetcherIteratorSuite extends 
SparkFunSuite with PrivateMethodT
 assert(blockManager.hostLocalDirManager.get.getCachedHostLocalDirs().size 
=== 1)
   }
 
-  test("fetch continuous blocks in batch respects maxSize and maxBlocks") {
+  test("fetch continuous blocks in batch should respect maxBytesInFlight") {
 val blockManager = mock(classOf[BlockManager])
 val localBmId = BlockManagerId("test-client", "test-local-host", 1)
 doReturn(localBmId).when(blockManager).blockManagerId
 
 // Make sure remote blocks would return the merged block
-val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2)
-val remoteBlocks = Seq[BlockId](
+val remoteBmId1 = BlockManagerId("test-client-1", "test-client-1", 1)
+val remoteBmId2 = BlockManagerId("test-client-2", "test-client-2", 2)
+val remoteBlocks1 = (0 until 15).map(ShuffleBlockId(0, 3, _))
+val remoteBlocks2 = Seq[BlockId](ShuffleBlockId(0, 4, 0), 
ShuffleBlockId(0, 4, 1))
+val mergedRemoteBlocks = Map[BlockId, ManagedBuffer](
+  ShuffleBlockBatchId(0, 3, 0, 3) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 3, 6) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 6, 9) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 9, 12) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 12, 15) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 4, 0, 2) -> createMockManagedBuffer())
+val transfer = createMockTransfer(mergedRemoteBlocks)
+
+val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])](
+  (remoteBmId1, remoteBlocks1.map(blockId => (blockId, 100L, 1))),
+  (remoteBmId2, remoteBlocks2.map(blockId => (blockId, 100L, 
1)))).toIterator
+
+val taskContext = TaskContext.empty()
+val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics()
+val iterator = new ShuffleBlockFetcherIterator(
+  taskContext,
+  transfer,
+  blockManager,
+  blocksByAddress,
+  (_, in) => in,
+  1500,
+  Int.MaxValue,
+  Int.MaxValue,
+  Int.MaxValue,
+  true,
+  false,
+  metrics,
+  true)
+
+var numResults = 0
+// After initialize(), there will be 6 FetchRequests, and the each of the 
first 5
+// includes 3 merged blocks and the last one has 1 merged block. So, only 
the
 
 Review comment:
   Or do you mean shuffle blocks rather than merged blocks?


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group

2020-03-04 Thread GitBox
cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] 
ShuffleBlockFetcherIterator should always create request for last block group
URL: https://github.com/apache/spark/pull/27786#discussion_r387524362
 
 

 ##
 File path: 
core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
 ##
 @@ -341,32 +341,84 @@ class ShuffleBlockFetcherIteratorSuite extends 
SparkFunSuite with PrivateMethodT
 assert(blockManager.hostLocalDirManager.get.getCachedHostLocalDirs().size 
=== 1)
   }
 
-  test("fetch continuous blocks in batch respects maxSize and maxBlocks") {
+  test("fetch continuous blocks in batch should respect maxBytesInFlight") {
 val blockManager = mock(classOf[BlockManager])
 val localBmId = BlockManagerId("test-client", "test-local-host", 1)
 doReturn(localBmId).when(blockManager).blockManagerId
 
 // Make sure remote blocks would return the merged block
-val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2)
-val remoteBlocks = Seq[BlockId](
+val remoteBmId1 = BlockManagerId("test-client-1", "test-client-1", 1)
+val remoteBmId2 = BlockManagerId("test-client-2", "test-client-2", 2)
+val remoteBlocks1 = (0 until 15).map(ShuffleBlockId(0, 3, _))
+val remoteBlocks2 = Seq[BlockId](ShuffleBlockId(0, 4, 0), 
ShuffleBlockId(0, 4, 1))
+val mergedRemoteBlocks = Map[BlockId, ManagedBuffer](
+  ShuffleBlockBatchId(0, 3, 0, 3) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 3, 6) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 6, 9) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 9, 12) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 3, 12, 15) -> createMockManagedBuffer(),
+  ShuffleBlockBatchId(0, 4, 0, 2) -> createMockManagedBuffer())
+val transfer = createMockTransfer(mergedRemoteBlocks)
+
+val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])](
+  (remoteBmId1, remoteBlocks1.map(blockId => (blockId, 100L, 1))),
+  (remoteBmId2, remoteBlocks2.map(blockId => (blockId, 100L, 
1)))).toIterator
+
+val taskContext = TaskContext.empty()
+val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics()
+val iterator = new ShuffleBlockFetcherIterator(
+  taskContext,
+  transfer,
+  blockManager,
+  blocksByAddress,
+  (_, in) => in,
+  1500,
+  Int.MaxValue,
+  Int.MaxValue,
+  Int.MaxValue,
+  true,
+  false,
+  metrics,
+  true)
+
+var numResults = 0
+// After initialize(), there will be 6 FetchRequests, and the each of the 
first 5
+// includes 3 merged blocks and the last one has 1 merged block. So, only 
the
 
 Review comment:
   There are 6 merged blocks in total, so how can each request include 3 merged 
blocks?


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group

2020-03-03 Thread GitBox
cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] 
ShuffleBlockFetcherIterator should always create request for last block group
URL: https://github.com/apache/spark/pull/27786#discussion_r387497757
 
 

 ##
 File path: 
core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
 ##
 @@ -367,12 +367,12 @@ final class ShuffleBlockFetcherIterator(
   // For batch fetch, the actual block in flight should count for merged 
block.
   val mayExceedsMaxBlocks = !doBatchFetch && curBlocks.size >= 
maxBlocksInFlightPerAddress
   if (curRequestSize >= targetRemoteRequestSize || mayExceedsMaxBlocks) {
-createFetchRequests()
+createFetchRequests(true)
 
 Review comment:
   Let's write down the parameter name at the call site, e.g. `createFetchRequests(hasMore = true)`.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group

2020-03-03 Thread GitBox
cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] 
ShuffleBlockFetcherIterator should always create request for last block group
URL: https://github.com/apache/spark/pull/27786#discussion_r387497646
 
 

 ##
 File path: 
core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
 ##
 @@ -339,14 +339,14 @@ final class ShuffleBlockFetcherIterator(
 + s"with ${blocks.size} blocks")
 }
 
-def createFetchRequests(): Unit = {
+def createFetchRequests(hasMore: Boolean): Unit = {
 
 Review comment:
   nit: `isLast`?


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org