marin-ma commented on code in PR #9360:
URL: https://github.com/apache/incubator-gluten/pull/9360#discussion_r2053689186
##########
cpp/velox/compute/VeloxRuntime.cc:
##########
@@ -284,21 +320,19 @@ std::unique_ptr<ColumnarBatchSerializer>
VeloxRuntime::createColumnarBatchSerial
return std::make_unique<VeloxColumnarBatchSerializer>(arrowPool, veloxPool,
cSchema);
}
-void VeloxRuntime::dumpConf(const std::string& path) {
+void VeloxRuntime::dumpConf(bool dump) {
Review Comment:
Perhaps use `if` when calling this method and remove this parameter?
##########
cpp/velox/compute/VeloxRuntime.cc:
##########
@@ -307,24 +341,29 @@ void VeloxRuntime::dumpConf(const std::string& path) {
}
// Write each key-value pair to the file with adjusted spacing for alignment
- outFile << "[Backend Conf]" << std::endl;
+ out << "[Backend Conf]" << std::endl;
for (const auto& pair : backendConfMap) {
- outFile << std::left << std::setw(maxKeyLength + 1) << pair.first << ' '
<< pair.second << std::endl;
+ out << std::left << std::setw(maxKeyLength + 1) << pair.first << ' ' <<
pair.second << std::endl;
}
- outFile << std::endl << "[Session Conf]" << std::endl;
+ out << std::endl << "[Session Conf]" << std::endl;
for (const auto& pair : confMap_) {
- outFile << std::left << std::setw(maxKeyLength + 1) << pair.first << ' '
<< pair.second << std::endl;
+ out << std::left << std::setw(maxKeyLength + 1) << pair.first << ' ' <<
pair.second << std::endl;
}
- outFile.close();
+ auto dumpPath = fmt::format("conf_{}_{}_{}.ini", taskInfo_.stageId,
taskInfo_.taskId, taskInfo_.vId);
+ dumpToStorage(veloxCfg_, dumpPath, out.str());
}
-std::shared_ptr<ArrowWriter> VeloxRuntime::createArrowWriter(const
std::string& path) {
- int64_t batchSize = 4096;
- if (auto it = confMap_.find(kSparkBatchSize); it != confMap_.end()) {
- batchSize = std::atol(it->second.c_str());
+std::shared_ptr<ArrowWriter> VeloxRuntime::createArrowWriter(bool dump,
int32_t idx) {
+ if (!dump) {
Review Comment:
Same as above. Use if when calling this method
##########
cpp/velox/compute/VeloxRuntime.cc:
##########
@@ -80,10 +108,14 @@ VeloxRuntime::VeloxRuntime(
kMemoryPoolCapacityTransferAcrossTasks,
FLAGS_velox_memory_pool_capacity_transfer_across_tasks);
}
-void VeloxRuntime::parsePlan(const uint8_t* data, int32_t size,
std::optional<std::string> dumpFile) {
- if (debugModeEnabled_ || dumpFile.has_value()) {
+void VeloxRuntime::parsePlan(const uint8_t* data, int32_t size, bool dump) {
+ if (debugModeEnabled_ || dump) {
try {
- auto planJson = substraitFromPbToJson("Plan", data, size, dumpFile);
+ auto planJson = substraitFromPbToJson("Plan", data, size);
+ if (dump) {
+ auto dumpFile = fmt::format("plan_{}_{}_{}.json", taskInfo_.stageId,
taskInfo_.taskId, taskInfo_.vId);
Review Comment:
We should use `partitionId` to name the dumped files rather than
`taskId`remains the same across different runs of a Spark job stage for a
specific task/partition, but task id can change. For use cases like re-running
a spark job multiple times, user can easily identify which partition is dumped
based on the partition id the filename, without looking back to a specific
spark event log to find the matched task id.
##########
docs/developers/MicroBenchmarks.md:
##########
@@ -128,15 +126,15 @@ And then re-run the query with below configurations to
dump the inputs to micro
Check the files in `spark.gluten.saveDir`. If the simulated stage is a first
stage, you will get 3
or 4 types of dumped file:
-- Configuration file: INI formatted, file name
`conf_[stageId]_[partitionId].ini`. Contains the
+- Configuration file: INI formatted, file name
`conf_{stageId}_{taskId}_{vId}.ini`. Contains the
Review Comment:
Please add some explanations in the document for `vId`. And perhaps it's
better to use `veloxTaskId` instead.
##########
docs/developers/MicroBenchmarks.md:
##########
@@ -128,15 +126,15 @@ And then re-run the query with below configurations to
dump the inputs to micro
Check the files in `spark.gluten.saveDir`. If the simulated stage is a first
stage, you will get 3
or 4 types of dumped file:
-- Configuration file: INI formatted, file name
`conf_[stageId]_[partitionId].ini`. Contains the
+- Configuration file: INI formatted, file name
`conf_{stageId}_{taskId}_{vId}.ini`. Contains the
configurations to init Velox backend and runtime session.
-- Plan file: JSON formatted, file name `plan_[stageId]_[partitionId].json`.
Contains the substrait
+- Plan file: JSON formatted, file name `plan_{stageId}_{taskId}_{vId}.json`.
Contains the substrait
plan to the stage, without input file splits.
-- Split file: JSON formatted, file name
`split_[stageId]_[partitionId]_[splitIndex].json`. There can
+- Split file: JSON formatted, file name
`split_{stageId}_{taskId}_{vId}_{idx}.json`. There can
Review Comment:
idx -> splitIdx/splitIndex
##########
cpp/velox/compute/VeloxRuntime.cc:
##########
@@ -80,10 +108,14 @@ VeloxRuntime::VeloxRuntime(
kMemoryPoolCapacityTransferAcrossTasks,
FLAGS_velox_memory_pool_capacity_transfer_across_tasks);
}
-void VeloxRuntime::parsePlan(const uint8_t* data, int32_t size,
std::optional<std::string> dumpFile) {
- if (debugModeEnabled_ || dumpFile.has_value()) {
+void VeloxRuntime::parsePlan(const uint8_t* data, int32_t size, bool dump) {
Review Comment:
dump -> dumpPlan
##########
cpp/velox/compute/VeloxRuntime.cc:
##########
@@ -94,10 +126,14 @@ void VeloxRuntime::parsePlan(const uint8_t* data, int32_t
size, std::optional<st
GLUTEN_CHECK(parseProtobuf(data, size, &substraitPlan_) == true, "Parse
substrait plan failed");
}
-void VeloxRuntime::parseSplitInfo(const uint8_t* data, int32_t size,
std::optional<std::string> dumpFile) {
- if (debugModeEnabled_ || dumpFile.has_value()) {
+void VeloxRuntime::parseSplitInfo(const uint8_t* data, int32_t size, int32_t
idx, bool dump) {
Review Comment:
idx -> splitIdx
dump -> dumpSplit
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]