This is an automated email from the ASF dual-hosted git repository.
marong pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new c633f6bcc [VL] Fix generic benchmark usage when both split and data
exists (#4972)
c633f6bcc is described below
commit c633f6bccd5c0471fc25623fe75d03e0b79e95d7
Author: Yang Zhang <[email protected]>
AuthorDate: Mon Mar 18 13:46:13 2024 +0800
[VL] Fix generic benchmark usage when both split and data exists (#4972)
---
cpp/velox/benchmarks/GenericBenchmark.cc | 13 +++++++------
docs/developers/MicroBenchmarks.md | 9 +++++----
2 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/cpp/velox/benchmarks/GenericBenchmark.cc
b/cpp/velox/benchmarks/GenericBenchmark.cc
index 7c21c97c6..8254355b4 100644
--- a/cpp/velox/benchmarks/GenericBenchmark.cc
+++ b/cpp/velox/benchmarks/GenericBenchmark.cc
@@ -56,7 +56,7 @@ DEFINE_string(
split,
"",
"Path to input json file of the splits. Only valid for simulating the
first stage. Use comma-separated list for multiple splits.");
-DEFINE_string(data, "", "Path to input data files in parquet format. Only
valid for simulating the middle stage.");
+DEFINE_string(data, "", "Path to input data files in parquet format, used for
shuffle read.");
DEFINE_string(conf, "", "Path to the configuration file.");
DEFINE_string(write_path, "/tmp", "Path for simulate write task.");
@@ -346,14 +346,13 @@ int main(int argc, char** argv) {
errorMsg = "File path does not exist: " + substraitJsonFile;
} else if (FLAGS_split.empty() && FLAGS_data.empty()) {
errorMsg = "Missing '--split' or '--data' option.";
- } else if (!FLAGS_split.empty() && !FLAGS_data.empty()) {
- errorMsg = "Duplicated option '--split' and '--data'.";
}
try {
if (!FLAGS_data.empty()) {
dataFiles = gluten::splitPaths(FLAGS_data, true);
- } else {
+ }
+ if (!FLAGS_split.empty()) {
splitFiles = gluten::splitPaths(FLAGS_split, true);
}
} catch (const std::exception& e) {
@@ -365,7 +364,8 @@ int main(int argc, char** argv) {
<< "If simulating a first stage, the usage is:" << std::endl
<< "./generic_benchmark "
<< "--plan /absolute-path/to/substrait_json_file "
- << "--split
/absolute-path/to/split_json_file_1,/abosolute-path/to/split_json_file_2,..."
<< std::endl
+ << "--split
/absolute-path/to/split_json_file_1,/abosolute-path/to/split_json_file_2,..."
+ << "--data
/absolute-path/to/data_file_1,/absolute-path/to/data_file_2,..." << std::endl
<< "If simulating a middle stage, the usage is:" << std::endl
<< "./generic_benchmark "
<< "--plan /absolute-path/to/substrait_json_file "
@@ -383,7 +383,8 @@ int main(int argc, char** argv) {
for (const auto& splitFile : splitFiles) {
LOG(INFO) << splitFile;
}
- } else {
+ }
+ if (!dataFiles.empty()) {
LOG(INFO) << "Using " << dataFiles.size() << " input data file(s): ";
for (const auto& dataFile : dataFiles) {
LOG(INFO) << dataFile;
diff --git a/docs/developers/MicroBenchmarks.md
b/docs/developers/MicroBenchmarks.md
index 5c83c76db..5ee017909 100644
--- a/docs/developers/MicroBenchmarks.md
+++ b/docs/developers/MicroBenchmarks.md
@@ -123,11 +123,12 @@ And then re-run the query with below configurations to
dump the inputs to micro
| spark.gluten.saveDir | Directory to save the inputs
to micro benchmark, should exist and be empty.
| /path/to/saveDir |
-Check the files in `spark.gluten.saveDir`. If the simulated stage is a first
stage, you will get 3 types of dumped file:
+Check the files in `spark.gluten.saveDir`. If the simulated stage is a first
stage, i.e. a mix of scan operators and an optional shuffle read, you will
get 4 types of dumped files:
- Configuration file: INI formatted, file name
`conf_[stageId]_[partitionId].ini`. Contains the configurations to init Velox
backend and runtime session.
- Plan file: JSON formatted, file name `plan_[stageId]_[partitionId].json`.
Contains the substrait plan to the stage, without input file splits.
-- Split file: JSON formatted, file name
`split_[stageId]_[partitionId]_[splitIndex].json`. There can be more than one
split file in a first stage task. Contains the substrait plan piece to the
input file splits.
+- Split file: JSON formatted, file name
`split_[stageId]_[partitionId]_[splitIndex].json`. There can be more than one
split file for the scan operator. Each contains the substrait plan piece for
the input file splits.
+- Data file (optional): Parquet formatted, file name
`data_[stageId]_[partitionId]_[iteratorIndex].json`. There can be more than one
input data file used for shuffle read. The input data files will be loaded as
iterators to serve as the inputs for the pipeline.
Run benchmark. By default, the result will be printed to stdout. You can use
`--noprint-result` to suppress this output.
@@ -142,11 +143,11 @@ cd /path/to/gluten/cpp/build/velox/benchmarks
--threads 1 --noprint-result
```
-If the simulated stage is a middle stage, you will get 3 types of dumped file:
+If the simulated stage is a middle stage, i.e. a pure shuffle stage, you
will get 3 types of dumped files:
- Configuration file: INI formatted, file name
`conf_[stageId]_[partitionId].ini`. Contains the configurations to init Velox
backend and runtime session.
- Plan file: JSON formatted, file name `plan_[stageId]_[partitionId].json`.
Contains the substrait plan to the stage.
-- Data file: Parquet formatted, file name
`data_[stageId]_[partitionId]_[iteratorIndex].json`. There can be more than one
input data file in a middle stage task. The input data files of a middle stage
will be loaded as iterators to serve as the inputs for the pipeline:
+- Data file: Parquet formatted, file name
`data_[stageId]_[partitionId]_[iteratorIndex].json`. There can be more than one
input data file used for shuffle read. The input data files of a middle stage
will be loaded as iterators to serve as the inputs for the pipeline:
```json
"localFiles": {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]