Re: [PR] [VL] Fix dump benchmark data issue in same task [incubator-gluten]

via GitHub Tue, 22 Apr 2025 03:29:38 -0700


marin-ma commented on code in PR #9360:
URL: https://github.com/apache/incubator-gluten/pull/9360#discussion_r2053689186



##########
cpp/velox/compute/VeloxRuntime.cc:
##########
@@ -284,21 +320,19 @@ std::unique_ptr<ColumnarBatchSerializer> 
VeloxRuntime::createColumnarBatchSerial
   return std::make_unique<VeloxColumnarBatchSerializer>(arrowPool, veloxPool, 
cSchema);
 }
 
-void VeloxRuntime::dumpConf(const std::string& path) {
+void VeloxRuntime::dumpConf(bool dump) {

Review Comment:
   Perhaps use `if` when calling this method and remove this parameter?



##########
cpp/velox/compute/VeloxRuntime.cc:
##########
@@ -307,24 +341,29 @@ void VeloxRuntime::dumpConf(const std::string& path) {
   }
 
   // Write each key-value pair to the file with adjusted spacing for alignment
-  outFile << "[Backend Conf]" << std::endl;
+  out << "[Backend Conf]" << std::endl;
   for (const auto& pair : backendConfMap) {
-    outFile << std::left << std::setw(maxKeyLength + 1) << pair.first << ' ' 
<< pair.second << std::endl;
+    out << std::left << std::setw(maxKeyLength + 1) << pair.first << ' ' << 
pair.second << std::endl;
   }
-  outFile << std::endl << "[Session Conf]" << std::endl;
+  out << std::endl << "[Session Conf]" << std::endl;
   for (const auto& pair : confMap_) {
-    outFile << std::left << std::setw(maxKeyLength + 1) << pair.first << ' ' 
<< pair.second << std::endl;
+    out << std::left << std::setw(maxKeyLength + 1) << pair.first << ' ' << 
pair.second << std::endl;
   }
 
-  outFile.close();
+  auto dumpPath = fmt::format("conf_{}_{}_{}.ini", taskInfo_.stageId, 
taskInfo_.taskId, taskInfo_.vId);
+  dumpToStorage(veloxCfg_, dumpPath, out.str());
 }
 
-std::shared_ptr<ArrowWriter> VeloxRuntime::createArrowWriter(const 
std::string& path) {
-  int64_t batchSize = 4096;
-  if (auto it = confMap_.find(kSparkBatchSize); it != confMap_.end()) {
-    batchSize = std::atol(it->second.c_str());
+std::shared_ptr<ArrowWriter> VeloxRuntime::createArrowWriter(bool dump, 
int32_t idx) {
+  if (!dump) {

Review Comment:
   Same as above. Use if when calling this method



##########
cpp/velox/compute/VeloxRuntime.cc:
##########
@@ -80,10 +108,14 @@ VeloxRuntime::VeloxRuntime(
       kMemoryPoolCapacityTransferAcrossTasks, 
FLAGS_velox_memory_pool_capacity_transfer_across_tasks);
 }
 
-void VeloxRuntime::parsePlan(const uint8_t* data, int32_t size, 
std::optional<std::string> dumpFile) {
-  if (debugModeEnabled_ || dumpFile.has_value()) {
+void VeloxRuntime::parsePlan(const uint8_t* data, int32_t size, bool dump) {
+  if (debugModeEnabled_ || dump) {
     try {
-      auto planJson = substraitFromPbToJson("Plan", data, size, dumpFile);
+      auto planJson = substraitFromPbToJson("Plan", data, size);
+      if (dump) {
+        auto dumpFile = fmt::format("plan_{}_{}_{}.json", taskInfo_.stageId, 
taskInfo_.taskId, taskInfo_.vId);

Review Comment:
   We should use `partitionId` to name the dumped files rather than 
`taskId`remains the same across different runs of a Spark job stage for a 
specific task/partition, but task id can change. For use cases like re-running 
a spark job multiple times, user can easily identify which partition is dumped 
based on the partition id the filename, without looking back to a specific 
spark event log to find the matched task id.



##########
docs/developers/MicroBenchmarks.md:
##########
@@ -128,15 +126,15 @@ And then re-run the query with below configurations to 
dump the inputs to micro
 Check the files in `spark.gluten.saveDir`. If the simulated stage is a first 
stage, you will get 3
 or 4 types of dumped file:
 
-- Configuration file: INI formatted, file name 
`conf_[stageId]_[partitionId].ini`. Contains the
+- Configuration file: INI formatted, file name 
`conf_{stageId}_{taskId}_{vId}.ini`. Contains the

Review Comment:
   Please add some explanations in the document for `vId`. And perhaps it's 
better to use `veloxTaskId` instead.



##########
docs/developers/MicroBenchmarks.md:
##########
@@ -128,15 +126,15 @@ And then re-run the query with below configurations to 
dump the inputs to micro
 Check the files in `spark.gluten.saveDir`. If the simulated stage is a first 
stage, you will get 3
 or 4 types of dumped file:
 
-- Configuration file: INI formatted, file name 
`conf_[stageId]_[partitionId].ini`. Contains the
+- Configuration file: INI formatted, file name 
`conf_{stageId}_{taskId}_{vId}.ini`. Contains the
   configurations to init Velox backend and runtime session.
-- Plan file: JSON formatted, file name `plan_[stageId]_[partitionId].json`. 
Contains the substrait
+- Plan file: JSON formatted, file name `plan_{stageId}_{taskId}_{vId}.json`. 
Contains the substrait
   plan to the stage, without input file splits.
-- Split file: JSON formatted, file name 
`split_[stageId]_[partitionId]_[splitIndex].json`. There can
+- Split file: JSON formatted, file name 
`split_{stageId}_{taskId}_{vId}_{idx}.json`. There can

Review Comment:
   idx -> splitIdx/splitIndex



##########
cpp/velox/compute/VeloxRuntime.cc:
##########
@@ -80,10 +108,14 @@ VeloxRuntime::VeloxRuntime(
       kMemoryPoolCapacityTransferAcrossTasks, 
FLAGS_velox_memory_pool_capacity_transfer_across_tasks);
 }
 
-void VeloxRuntime::parsePlan(const uint8_t* data, int32_t size, 
std::optional<std::string> dumpFile) {
-  if (debugModeEnabled_ || dumpFile.has_value()) {
+void VeloxRuntime::parsePlan(const uint8_t* data, int32_t size, bool dump) {

Review Comment:
   dump -> dumpPlan



##########
cpp/velox/compute/VeloxRuntime.cc:
##########
@@ -94,10 +126,14 @@ void VeloxRuntime::parsePlan(const uint8_t* data, int32_t 
size, std::optional<st
   GLUTEN_CHECK(parseProtobuf(data, size, &substraitPlan_) == true, "Parse 
substrait plan failed");
 }
 
-void VeloxRuntime::parseSplitInfo(const uint8_t* data, int32_t size, 
std::optional<std::string> dumpFile) {
-  if (debugModeEnabled_ || dumpFile.has_value()) {
+void VeloxRuntime::parseSplitInfo(const uint8_t* data, int32_t size, int32_t 
idx, bool dump) {

Review Comment:
   idx -> splitIdx
   dump -> dumpSplit



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [VL] Fix dump benchmark data issue in same task [incubator-gluten]

Reply via email to