jackylee-ch commented on code in PR #11232:
URL:
https://github.com/apache/incubator-gluten/pull/11232#discussion_r2580664445
##########
gluten-substrait/src/main/scala/org/apache/gluten/utils/PartitionsUtil.scala:
##########
@@ -176,58 +169,62 @@ object PartitionsUtil {
heap.enqueue((load + sz, numFiles + 1, idx))
}
+ def initializeHeap(
+ ordering: Ordering[(Long, Int, Int)]): mutable.PriorityQueue[(Long, Int, Int)] = {
+ val heap = mutable.PriorityQueue.empty[(Long, Int, Int)](ordering)
+ inputPartitions.indices.foreach(i => heap.enqueue((0L, 0, i)))
+ heap
+ }
+
+ // Flatten and sort descending by file size.
+ val filesSorted: Seq[(PartitionedFile, Long)] =
+ inputPartitions
+ .flatMap(_.files)
+ .map(f => (f, f.length))
+ .sortBy(_._2)(Ordering.Long.reverse)
+
// First by load, then by numFiles.
- val heapByFileSize =
- mutable.PriorityQueue.empty[(Long, Int, Int)](
- Ordering
- .by[(Long, Int, Int), (Long, Int)] {
- case (load, numFiles, _) =>
- (load, numFiles)
- }
- .reverse
- )
+ val loadFirstOrdering = Ordering
+ .by[(Long, Int, Int), (Long, Int)] { case (load, numFiles, _) => (load, numFiles) }
+ .reverse
if (smallFileThreshold > 0) {
val smallFileTotalSize = filesSorted.map(_._2).sum * smallFileThreshold
// First by numFiles, then by load.
- val heapByFileNum =
- mutable.PriorityQueue.empty[(Long, Int, Int)](
- Ordering
- .by[(Long, Int, Int), (Int, Long)] {
- case (load, numFiles, _) =>
- (numFiles, load)
- }
- .reverse
- )
-
- inputPartitions.indices.foreach(i => heapByFileNum.enqueue((0L, 0, i)))
+ val numFirstOrdering = Ordering
+ .by[(Long, Int, Int), (Int, Long)] { case (load, numFiles, _) => (numFiles, load) }
Review Comment:
nit: `load` -> `size`
##########
gluten-substrait/src/main/scala/org/apache/gluten/utils/PartitionsUtil.scala:
##########
@@ -176,58 +169,62 @@ object PartitionsUtil {
heap.enqueue((load + sz, numFiles + 1, idx))
}
+ def initializeHeap(
+ ordering: Ordering[(Long, Int, Int)]): mutable.PriorityQueue[(Long, Int, Int)] = {
+ val heap = mutable.PriorityQueue.empty[(Long, Int, Int)](ordering)
+ inputPartitions.indices.foreach(i => heap.enqueue((0L, 0, i)))
+ heap
+ }
+
+ // Flatten and sort descending by file size.
+ val filesSorted: Seq[(PartitionedFile, Long)] =
+ inputPartitions
+ .flatMap(_.files)
+ .map(f => (f, f.length))
+ .sortBy(_._2)(Ordering.Long.reverse)
+
// First by load, then by numFiles.
- val heapByFileSize =
- mutable.PriorityQueue.empty[(Long, Int, Int)](
- Ordering
- .by[(Long, Int, Int), (Long, Int)] {
- case (load, numFiles, _) =>
- (load, numFiles)
- }
- .reverse
- )
+ val loadFirstOrdering = Ordering
Review Comment:
nit: `loadFirstOrdering` -> `sizeFirstOrdering`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]