attilapiros commented on a change in pull request #34234:
URL: https://github.com/apache/spark/pull/34234#discussion_r783072141
##########
File path: core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala
##########
@@ -191,4 +191,70 @@ class MapStatusSuite extends SparkFunSuite {
assert(count === 3000)
}
}
+
+  def compressAndDecompressSize(size: Long): Long = {
+    MapStatus.decompressSize(MapStatus.compressSize(size))
+  }
+
+  test("SPARK-36967: HighlyCompressedMapStatus should record accurately the size " +
+    "of skewed shuffle blocks") {
+    val conf = new SparkConf().set(config.SHUFFLE_ACCURATE_BLOCK_SKEWED_FACTOR.key, "5")
+    val env = mock(classOf[SparkEnv])
+    doReturn(conf).when(env).conf
+    SparkEnv.set(env)
+
+    val smallBlockSizes = Array.tabulate[Long](2889)(i => i)
+    val skewBlocksSizes = Array.tabulate[Long](10)(i => i + 350 * 1024)
+    val sizes = smallBlockSizes ++: skewBlocksSizes
+    val avg = smallBlockSizes.sum / smallBlockSizes.length
+    val loc = BlockManagerId("a", "b", 10)
+    val mapTaskAttemptId = 5
+    val status = MapStatus(loc, sizes, mapTaskAttemptId)
+    val status1 = compressAndDecompressMapStatus(status)
+    assert(status1.isInstanceOf[HighlyCompressedMapStatus])
+    assert(status1.location == loc)
+    assert(status1.mapId == mapTaskAttemptId)
+    assert(status1.getSizeForBlock(0) == 0)
+    for (i <- 1 until smallBlockSizes.length) {
+      assert(status1.getSizeForBlock(i) === avg)
+    }
+    for (i <- 0 until skewBlocksSizes.length) {
+      assert(status1.getSizeForBlock(smallBlockSizes.length + i) ===
+        compressAndDecompressSize(skewBlocksSizes(i)))
+    }
+  }
+
+  test("SPARK-36967: Limit accurate skewed block number if too many blocks are skewed") {
+    val skewedBlockNumber = 20
Review comment:
nit: skewedBlockNumber => trackedSkewedBlocksLength
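
For context on the `compressAndDecompressSize` helper in the hunk above: `MapStatus` packs each block size into a single byte on a log scale, so round-tripping a size returns a lossy value, and the skewed-block assertions must compare against that round-tripped value rather than the raw size. Below is a minimal standalone sketch of the encoding, assuming the log-base-1.1, one-byte scheme that `MapStatus.compressSize`/`decompressSize` implement (the real methods are `private[spark]`, which is why the test lives in `org.apache.spark.scheduler`):

```scala
object SizeCodecSketch {
  // Assumption: mirrors MapStatus's LOG_BASE = 1.1 single-byte encoding,
  // with 0 reserved for empty blocks and 255 as the saturation point.
  private val LogBase = 1.1

  def compressSize(size: Long): Byte = {
    if (size == 0) {
      0
    } else if (size <= 1L) {
      1
    } else {
      math.min(255, math.ceil(math.log(size.toDouble) / math.log(LogBase)).toInt).toByte
    }
  }

  def decompressSize(compressedSize: Byte): Long = {
    // Treat the byte as unsigned before inverting the log.
    if (compressedSize == 0) 0L else math.pow(LogBase, compressedSize & 0xFF).toLong
  }
}
```

For example, round-tripping `350L * 1024` lands near, but not exactly on, the original value (the ceiling keeps the error within one factor of the log base, roughly 10%), which is why the test asserts against `compressAndDecompressSize(skewBlocksSizes(i))` instead of `skewBlocksSizes(i)`.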
##########
File path: core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala
##########
@@ -191,4 +191,70 @@ class MapStatusSuite extends SparkFunSuite {
assert(count === 3000)
}
}
+
+  def compressAndDecompressSize(size: Long): Long = {
+    MapStatus.decompressSize(MapStatus.compressSize(size))
+  }
+
+  test("SPARK-36967: HighlyCompressedMapStatus should record accurately the size " +
+    "of skewed shuffle blocks") {
+    val conf = new SparkConf().set(config.SHUFFLE_ACCURATE_BLOCK_SKEWED_FACTOR.key, "5")
+    val env = mock(classOf[SparkEnv])
+    doReturn(conf).when(env).conf
+    SparkEnv.set(env)
+
+    val smallBlockSizes = Array.tabulate[Long](2889)(i => i)
+    val skewBlocksSizes = Array.tabulate[Long](10)(i => i + 350 * 1024)
+    val sizes = smallBlockSizes ++: skewBlocksSizes
+    val avg = smallBlockSizes.sum / smallBlockSizes.length
+    val loc = BlockManagerId("a", "b", 10)
+    val mapTaskAttemptId = 5
+    val status = MapStatus(loc, sizes, mapTaskAttemptId)
+    val status1 = compressAndDecompressMapStatus(status)
+    assert(status1.isInstanceOf[HighlyCompressedMapStatus])
+    assert(status1.location == loc)
+    assert(status1.mapId == mapTaskAttemptId)
+    assert(status1.getSizeForBlock(0) == 0)
+    for (i <- 1 until smallBlockSizes.length) {
+      assert(status1.getSizeForBlock(i) === avg)
+    }
+    for (i <- 0 until skewBlocksSizes.length) {
+      assert(status1.getSizeForBlock(smallBlockSizes.length + i) ===
+        compressAndDecompressSize(skewBlocksSizes(i)))
+    }
+  }
+
+  test("SPARK-36967: Limit accurate skewed block number if too many blocks are skewed") {
+    val skewedBlockNumber = 20
+    val conf =
+      new SparkConf()
+        .set(config.SHUFFLE_ACCURATE_BLOCK_SKEWED_FACTOR.key, "5")
+        .set(config.SHUFFLE_MAX_ACCURATE_SKEWED_BLOCK_NUMBER.key, skewedBlockNumber.toString)
+    val env = mock(classOf[SparkEnv])
+    doReturn(conf).when(env).conf
+    SparkEnv.set(env)
+
+    val sizes: Array[Long] = Array.tabulate[Long](2500)(i => i) ++:
+      Array.tabulate[Long](500)(i => i + 3500 * 1024)
+    val emptyBlocksSize = 1
+    val smallBlockSizes = sizes.slice(emptyBlocksSize, sizes.size - skewedBlockNumber)
+    val skewBlocksSizes = sizes.slice(sizes.size - skewedBlockNumber, sizes.size)
Review comment:
Here I used the old `skewedBlockNumber`, WDYT?
```suggestion
    val untrackedSkewedBlocksLength = 500
    val emptyBlocks = Array(0L)
    val smallBlockSizes = Array.tabulate[Long](2500)(i => i + 1)
    val untrackedSkewedBlocksSizes =
      Array.tabulate[Long](untrackedSkewedBlocksLength)(i => i + 3500 * 1024)
    val trackedSkewedBlocksSizes = Array.tabulate[Long](skewedBlockNumber)(i => i + 4500 * 1024)
    val nonEmptyBlocks =
      smallBlockSizes ++: untrackedSkewedBlocksSizes ++: trackedSkewedBlocksSizes
    val sizes = emptyBlocks ++: nonEmptyBlocks
    val avgNonEmpty = nonEmptyBlocks.sum / nonEmptyBlocks.size
    assert(nonEmptyBlocks.filter(_ > avgNonEmpty).size ==
      untrackedSkewedBlocksLength + skewedBlockNumber, "number of skewed block sizes")
    val smallAndUntrackedBlockSizes =
      nonEmptyBlocks.slice(0, nonEmptyBlocks.size - skewedBlockNumber)
    val avg = smallAndUntrackedBlockSizes.sum / smallAndUntrackedBlockSizes.length
```
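
To spell out why the suggested data has both a 3500 * 1024 band and a 4500 * 1024 band: 520 blocks qualify as skewed, but with SHUFFLE_MAX_ACCURATE_SKEWED_BLOCK_NUMBER = 20 only the 20 largest keep accurate sizes, while the rest fall back to the average like ordinary blocks. A hypothetical, self-contained model of that cap follows; the function and parameter names are mine, and this is not Spark's exact selection code, only the behavior the test data is designed to exercise:

```scala
// Hypothetical model of the cap on accurately tracked skewed blocks (not
// Spark's exact implementation): blocks above skewedFactor * average count
// as skewed, but only the largest maxAccurate of them are tracked exactly.
def partitionSkewedBlocks(
    nonEmptySizes: Array[Long],
    skewedFactor: Double,
    maxAccurate: Int): (Array[Long], Array[Long]) = {
  val avg = nonEmptySizes.sum / nonEmptySizes.length
  val skewedDescending = nonEmptySizes
    .filter(_ > avg * skewedFactor)
    .sorted(Ordering[Long].reverse)
  // splitAt caps how many skewed blocks keep their accurate size;
  // the second half is "skewed but untracked".
  skewedDescending.splitAt(maxAccurate)
}
```

With the suggested test data this yields 20 tracked blocks (the 4500 * 1024 band) and 500 untracked ones (the 3500 * 1024 band), which is exactly the `trackedSkewedBlocksSizes` / `untrackedSkewedBlocksSizes` split the new names make explicit.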