This is an automated email from the ASF dual-hosted git repository.

hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 3fbf488ef [VL] Recover broken memory-trace option spark.gluten.backtrace.allocation (#6635)
3fbf488ef is described below

commit 3fbf488efa47a373d78fdd59b3dc5be7784dd8e1
Author: Hongze Zhang <[email protected]>
AuthorDate: Fri Aug 2 15:10:44 2024 +0800

    [VL] Recover broken memory-trace option spark.gluten.backtrace.allocation (#6635)
---
 cpp/core/jni/JniWrapper.cc                            | 14 +++++++++++---
 cpp/core/memory/AllocationListener.cc                 |  2 --
 cpp/core/memory/AllocationListener.h                  |  2 --
 cpp/velox/compute/VeloxBackend.cc                     |  3 ---
 cpp/velox/config/VeloxConfig.h                        |  3 ---
 docs/developers/HowTo.md                              |  2 +-
 .../main/scala/org/apache/gluten/GlutenConfig.scala   | 19 +++++++++++++++++--
 7 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/cpp/core/jni/JniWrapper.cc b/cpp/core/jni/JniWrapper.cc
index 4fa45d9cb..add4aa54d 100644
--- a/cpp/core/jni/JniWrapper.cc
+++ b/cpp/core/jni/JniWrapper.cc
@@ -215,6 +215,10 @@ void JNI_OnUnload(JavaVM* vm, void* reserved) {
   gluten::getJniCommonState()->close();
 }
 
+namespace {
+const std::string kBacktraceAllocation = 
"spark.gluten.memory.backtrace.allocation";
+}
+
 JNIEXPORT jlong JNICALL 
Java_org_apache_gluten_exec_RuntimeJniWrapper_createRuntime( // NOLINT
     JNIEnv* env,
     jclass,
@@ -226,13 +230,17 @@ JNIEXPORT jlong JNICALL 
Java_org_apache_gluten_exec_RuntimeJniWrapper_createRunt
   if (env->GetJavaVM(&vm) != JNI_OK) {
     throw gluten::GlutenException("Unable to get JavaVM instance");
   }
-
+  auto safeArray = gluten::getByteArrayElementsSafe(env, sessionConf);
+  auto sparkConf = gluten::parseConfMap(env, safeArray.elems(), 
safeArray.length());
   auto backendType = jStringToCString(env, jbackendType);
+
   std::unique_ptr<AllocationListener> listener =
       std::make_unique<SparkAllocationListener>(vm, jlistener, 
reserveMemoryMethod, unreserveMemoryMethod);
+  bool backtrace = sparkConf.at(kBacktraceAllocation) == "true";
+  if (backtrace) {
+    listener = 
std::make_unique<BacktraceAllocationListener>(std::move(listener));
+  }
 
-  auto safeArray = gluten::getByteArrayElementsSafe(env, sessionConf);
-  auto sparkConf = gluten::parseConfMap(env, safeArray.elems(), 
safeArray.length());
   auto runtime = gluten::Runtime::create(backendType, std::move(listener), 
sparkConf);
   return reinterpret_cast<jlong>(runtime);
   JNI_METHOD_END(kInvalidObjectHandle)
diff --git a/cpp/core/memory/AllocationListener.cc 
b/cpp/core/memory/AllocationListener.cc
index 2c876e9f1..5cbeeb6bd 100644
--- a/cpp/core/memory/AllocationListener.cc
+++ b/cpp/core/memory/AllocationListener.cc
@@ -19,8 +19,6 @@
 
 namespace gluten {
 
-bool backtrace_allocation = false;
-
 class NoopAllocationListener : public gluten::AllocationListener {
  public:
   void allocationChanged(int64_t diff) override {
diff --git a/cpp/core/memory/AllocationListener.h 
b/cpp/core/memory/AllocationListener.h
index 41797641f..1751b6112 100644
--- a/cpp/core/memory/AllocationListener.h
+++ b/cpp/core/memory/AllocationListener.h
@@ -23,8 +23,6 @@
 
 namespace gluten {
 
-extern bool backtrace_allocation;
-
 class AllocationListener {
  public:
   static std::unique_ptr<AllocationListener> noop();
diff --git a/cpp/velox/compute/VeloxBackend.cc 
b/cpp/velox/compute/VeloxBackend.cc
index a3658faa3..2dad6adf2 100644
--- a/cpp/velox/compute/VeloxBackend.cc
+++ b/cpp/velox/compute/VeloxBackend.cc
@@ -104,9 +104,6 @@ void VeloxBackend::init(const 
std::unordered_map<std::string, std::string>& conf
   FLAGS_gluten_velox_aysnc_timeout_on_task_stopping =
       backendConf_->get<int32_t>(kVeloxAsyncTimeoutOnTaskStopping, 
kVeloxAsyncTimeoutOnTaskStoppingDefault);
 
-  // Set backtrace_allocation
-  gluten::backtrace_allocation = backendConf_->get<bool>(kBacktraceAllocation, 
false);
-
   // Setup and register.
   velox::filesystems::registerLocalFileSystem();
   initJolFilesystem();
diff --git a/cpp/velox/config/VeloxConfig.h b/cpp/velox/config/VeloxConfig.h
index 65c7cb61d..792beda96 100644
--- a/cpp/velox/config/VeloxConfig.h
+++ b/cpp/velox/config/VeloxConfig.h
@@ -104,9 +104,6 @@ const int32_t kVeloxAsyncTimeoutOnTaskStoppingDefault = 
30000; // 30s
 // udf
 const std::string kVeloxUdfLibraryPaths = 
"spark.gluten.sql.columnar.backend.velox.internal.udfLibraryPaths";
 
-// backtrace allocation
-const std::string kBacktraceAllocation = "spark.gluten.backtrace.allocation";
-
 // VeloxShuffleReader print flag.
 const std::string kVeloxShuffleReaderPrintFlag = 
"spark.gluten.velox.shuffleReaderPrintFlag";
 
diff --git a/docs/developers/HowTo.md b/docs/developers/HowTo.md
index a13bf02eb..5b16c965f 100644
--- a/docs/developers/HowTo.md
+++ b/docs/developers/HowTo.md
@@ -163,7 +163,7 @@ wait to add
 
 # How to track the memory exhaust problem
 
-When your gluten spark jobs failed because of OOM, you can track the memory 
allocation's call stack by configuring `spark.gluten.backtrace.allocation = 
true`.
+When your gluten spark jobs failed because of OOM, you can track the memory 
allocation's call stack by configuring 
`spark.gluten.memory.backtrace.allocation = true`.
 The above configuration will use `BacktraceAllocationListener` wrapping from 
`SparkAllocationListener` to create `VeloxMemoryManager`.
 
 `BacktraceAllocationListener` will check every allocation, if a single 
allocation bytes exceeds a fixed value or the accumulative allocation bytes 
exceeds 1/2/3...G,
diff --git a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala 
b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
index 820978112..eb32a25e6 100644
--- a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
+++ b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
@@ -241,6 +241,8 @@ class GlutenConfig(conf: SQLConf) extends Logging {
 
   def memoryIsolation: Boolean = conf.getConf(COLUMNAR_MEMORY_ISOLATION)
 
+  def memoryBacktraceAllocation: Boolean = 
conf.getConf(COLUMNAR_MEMORY_BACKTRACE_ALLOCATION)
+
   def numTaskSlotsPerExecutor: Int = {
     val numSlots = conf.getConf(NUM_TASK_SLOTS_PER_EXECUTOR)
     assert(numSlots > 0, s"Number of task slot not found. This should not 
happen.")
@@ -659,7 +661,10 @@ object GlutenConfig {
 
     val keyWithDefault = ImmutableList.of(
       (SQLConf.CASE_SENSITIVE.key, SQLConf.CASE_SENSITIVE.defaultValueString),
-      (SQLConf.IGNORE_MISSING_FILES.key, 
SQLConf.IGNORE_MISSING_FILES.defaultValueString)
+      (SQLConf.IGNORE_MISSING_FILES.key, 
SQLConf.IGNORE_MISSING_FILES.defaultValueString),
+      (
+        COLUMNAR_MEMORY_BACKTRACE_ALLOCATION.key,
+        COLUMNAR_MEMORY_BACKTRACE_ALLOCATION.defaultValueString)
     )
     keyWithDefault.forEach(e => nativeConfMap.put(e._1, conf.getOrElse(e._1, 
e._2)))
 
@@ -706,7 +711,9 @@ object GlutenConfig {
       (AWS_S3_RETRY_MODE.key, AWS_S3_RETRY_MODE.defaultValueString),
       (
         COLUMNAR_VELOX_CONNECTOR_IO_THREADS.key,
-        conf.getOrElse(GLUTEN_NUM_TASK_SLOTS_PER_EXECUTOR_KEY, "-1")),
+        conf.getOrElse(
+          NUM_TASK_SLOTS_PER_EXECUTOR.key,
+          NUM_TASK_SLOTS_PER_EXECUTOR.defaultValueString)),
       (COLUMNAR_SHUFFLE_CODEC.key, ""),
       (COLUMNAR_SHUFFLE_CODEC_BACKEND.key, ""),
       ("spark.hadoop.input.connect.timeout", "180000"),
@@ -1244,6 +1251,14 @@ object GlutenConfig {
       .booleanConf
       .createWithDefault(false)
 
+  val COLUMNAR_MEMORY_BACKTRACE_ALLOCATION =
+    buildConf("spark.gluten.memory.backtrace.allocation")
+      .internal()
+      .doc("Print backtrace information for large memory allocations. This 
helps debugging when " +
+        "Spark OOM happens due to large acquire requests.")
+      .booleanConf
+      .createWithDefault(false)
+
   val COLUMNAR_MEMORY_OVER_ACQUIRED_RATIO =
     buildConf("spark.gluten.memory.overAcquiredMemoryRatio")
       .internal()


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to