This is an automated email from the ASF dual-hosted git repository.

hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 3d21141f0 [GLUTEN-7243][VL] Fix Q97 cross-task spilling hangs (#7244)
3d21141f0 is described below

commit 3d21141f0872d1a130994a7db4080feb3ba8781f
Author: Hongze Zhang <[email protected]>
AuthorDate: Sat Sep 14 20:57:02 2024 +0800

    [GLUTEN-7243][VL] Fix Q97 cross-task spilling hangs (#7244)
    
    Closes #7243
---
 .github/workflows/velox_backend.yml           |  1 -
 cpp/velox/compute/WholeStageResultIterator.cc | 19 +++++++++++++------
 ep/build-velox/src/get_velox.sh               |  2 +-
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/velox_backend.yml b/.github/workflows/velox_backend.yml
index 572a2a2d8..cd0fd1cd4 100644
--- a/.github/workflows/velox_backend.yml
+++ b/.github/workflows/velox_backend.yml
@@ -391,7 +391,6 @@ jobs:
             
-d=FLUSH_MODE:FLUSHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=0.05,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=0.1,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0
       - name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q97 low memory, IO threads off
         continue-on-error: true # OOM
-        timeout-minutes: 15 # https://github.com/apache/incubator-gluten/issues/7243
         run: |
           cd tools/gluten-it \
           && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
diff --git a/cpp/velox/compute/WholeStageResultIterator.cc b/cpp/velox/compute/WholeStageResultIterator.cc
index 4aa3449b6..1da5633df 100644
--- a/cpp/velox/compute/WholeStageResultIterator.cc
+++ b/cpp/velox/compute/WholeStageResultIterator.cc
@@ -231,20 +231,27 @@ int64_t WholeStageResultIterator::spillFixedSize(int64_t size) {
   std::string poolName{pool->root()->name() + "/" + pool->name()};
   std::string logPrefix{"Spill[" + poolName + "]: "};
   int64_t shrunken = memoryManager_->shrink(size);
-  // todo return the actual spilled size?
   if (spillStrategy_ == "auto") {
+    if (task_->numThreads() != 0) {
+      // Task should have zero running threads, otherwise there's
+      // possibility that this spill call hangs. See https://github.com/apache/incubator-gluten/issues/7243.
+      // As of now, non-zero running threads usually happens when:
+      // 1. Task A spills task B;
+      // 2. Task A trys to grow buffers created by task B, during which spill is requested on task A again;
+      VLOG(2) << logPrefix << "Spill is requested on a task " << task_->taskId()
+              << " that has non-zero running threads, which is not currently supported. Skipping.";
+      return shrunken;
+    }
     int64_t remaining = size - shrunken;
-    LOG(INFO) << logPrefix << "Trying to request spilling for " << remaining << " bytes...";
+    LOG(INFO) << logPrefix << "Trying to request spill for " << remaining << " bytes...";
     auto* mm = memoryManager_->getMemoryManager();
-    uint64_t spilledOut = mm->arbitrator()->shrinkCapacity(remaining); // this conducts spilling
+    uint64_t spilledOut = mm->arbitrator()->shrinkCapacity(remaining); // this conducts spill
     LOG(INFO) << logPrefix << "Successfully spilled out " << spilledOut << " bytes.";
     uint64_t total = shrunken + spilledOut;
     VLOG(2) << logPrefix << "Successfully reclaimed total " << total << " bytes.";
     return total;
-  } else {
-    LOG(WARNING) << "Spill-to-disk was disabled since " << kSpillStrategy << " was not configured.";
   }
-
+  LOG(WARNING) << "Spill-to-disk was disabled since " << kSpillStrategy << " was not configured.";
   VLOG(2) << logPrefix << "Successfully reclaimed total " << shrunken << " bytes.";
   return shrunken;
 }
diff --git a/ep/build-velox/src/get_velox.sh b/ep/build-velox/src/get_velox.sh
index 28228224b..8a580e7ba 100755
--- a/ep/build-velox/src/get_velox.sh
+++ b/ep/build-velox/src/get_velox.sh
@@ -17,7 +17,7 @@
 set -exu
 
 VELOX_REPO=https://github.com/oap-project/velox.git
-VELOX_BRANCH=2024_09_14
+VELOX_BRANCH=2024_09_14-2
 VELOX_HOME=""
 
 OS=`uname -s`


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to