This is an automated email from the ASF dual-hosted git repository.
marong pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 1d13e4a88 [VL] Update micro benchmark and doc (#4959)
1d13e4a88 is described below
commit 1d13e4a887e0f02f4438fe0284aa72aec5d8d304
Author: Rong Ma <[email protected]>
AuthorDate: Thu Mar 21 18:46:08 2024 +0800
[VL] Update micro benchmark and doc (#4959)
---
cpp/velox/benchmarks/GenericBenchmark.cc | 15 +-
cpp/velox/benchmarks/common/BenchmarkUtils.cc | 2 -
cpp/velox/benchmarks/common/BenchmarkUtils.h | 2 -
cpp/velox/compute/VeloxRuntime.cc | 16 +-
docs/developers/MicroBenchmarks.md | 210 ++++++++++++++++----------
5 files changed, 147 insertions(+), 98 deletions(-)
diff --git a/cpp/velox/benchmarks/GenericBenchmark.cc
b/cpp/velox/benchmarks/GenericBenchmark.cc
index 8254355b4..a9084943d 100644
--- a/cpp/velox/benchmarks/GenericBenchmark.cc
+++ b/cpp/velox/benchmarks/GenericBenchmark.cc
@@ -41,6 +41,9 @@
using namespace gluten;
namespace {
+
+DEFINE_bool(print_result, true, "Print result for execution");
+DEFINE_string(save_output, "", "Path to parquet file for saving the task
output iterator");
DEFINE_bool(with_shuffle, false, "Add shuffle split at end.");
DEFINE_string(partitioning, "rr", "Short partitioning name. Valid options are
rr, hash, range, single");
DEFINE_bool(celeborn, false, "Mocking celeborn shuffle.");
@@ -58,7 +61,7 @@ DEFINE_string(
"Path to input json file of the splits. Only valid for simulating the
first stage. Use comma-separated list for multiple splits.");
DEFINE_string(data, "", "Path to input data files in parquet format, used for
shuffle read.");
DEFINE_string(conf, "", "Path to the configuration file.");
-DEFINE_string(write_path, "/tmp", "Path for simulate write task.");
+DEFINE_string(write_path, "/tmp", "Path to save the output from write tasks.");
struct WriterMetrics {
int64_t splitTime;
@@ -199,9 +202,9 @@ auto BM_Generic = [](::benchmark::State& state,
ArrowSchema cSchema;
toArrowSchema(veloxPlan->outputType(),
memoryManager->getLeafMemoryPool().get(), &cSchema);
GLUTEN_ASSIGN_OR_THROW(auto outputSchema, arrow::ImportSchema(&cSchema));
- ArrowWriter writer{FLAGS_write_file};
+ ArrowWriter writer{FLAGS_save_output};
state.PauseTiming();
- if (!FLAGS_write_file.empty()) {
+ if (!FLAGS_save_output.empty()) {
GLUTEN_THROW_NOT_OK(writer.initWriter(*(outputSchema.get())));
}
state.ResumeTiming();
@@ -217,13 +220,13 @@ auto BM_Generic = [](::benchmark::State& state,
if (FLAGS_print_result) {
LOG(INFO) << maybeBatch.ValueOrDie()->ToString();
}
- if (!FLAGS_write_file.empty()) {
+ if (!FLAGS_save_output.empty()) {
GLUTEN_THROW_NOT_OK(writer.writeInBatches(maybeBatch.ValueOrDie()));
}
}
state.PauseTiming();
- if (!FLAGS_write_file.empty()) {
+ if (!FLAGS_save_output.empty()) {
GLUTEN_THROW_NOT_OK(writer.closeWriter());
}
state.ResumeTiming();
@@ -412,7 +415,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "iterations: " << FLAGS_iterations;
LOG(INFO) << "cpu: " << FLAGS_cpu;
LOG(INFO) << "print_result: " << FLAGS_print_result;
- LOG(INFO) << "write_file: " << FLAGS_write_file;
+ LOG(INFO) << "save_output: " << FLAGS_save_output;
LOG(INFO) << "batch_size: " << FLAGS_batch_size;
LOG(INFO) << "write_path: " << FLAGS_write_path;
diff --git a/cpp/velox/benchmarks/common/BenchmarkUtils.cc
b/cpp/velox/benchmarks/common/BenchmarkUtils.cc
index 888df451a..efe1fc60a 100644
--- a/cpp/velox/benchmarks/common/BenchmarkUtils.cc
+++ b/cpp/velox/benchmarks/common/BenchmarkUtils.cc
@@ -26,8 +26,6 @@
using namespace facebook;
namespace fs = std::filesystem;
-DEFINE_bool(print_result, true, "Print result for execution");
-DEFINE_string(write_file, "", "Write the output to parquet file, file absolute
path");
DEFINE_int64(batch_size, 4096, "To set
velox::core::QueryConfig::kPreferredOutputBatchSize.");
DEFINE_int32(cpu, -1, "Run benchmark on specific CPU");
DEFINE_int32(threads, 1, "The number of threads to run this benchmark");
diff --git a/cpp/velox/benchmarks/common/BenchmarkUtils.h
b/cpp/velox/benchmarks/common/BenchmarkUtils.h
index f06f5bbd5..79f4a53cb 100644
--- a/cpp/velox/benchmarks/common/BenchmarkUtils.h
+++ b/cpp/velox/benchmarks/common/BenchmarkUtils.h
@@ -36,8 +36,6 @@
#include "utils/exception.h"
#include "velox/common/memory/Memory.h"
-DECLARE_bool(print_result);
-DECLARE_string(write_file);
DECLARE_int64(batch_size);
DECLARE_int32(cpu);
DECLARE_int32(threads);
diff --git a/cpp/velox/compute/VeloxRuntime.cc
b/cpp/velox/compute/VeloxRuntime.cc
index a77ddfbba..d1b1bbda0 100644
--- a/cpp/velox/compute/VeloxRuntime.cc
+++ b/cpp/velox/compute/VeloxRuntime.cc
@@ -51,10 +51,11 @@ void VeloxRuntime::parsePlan(
SparkTaskInfo taskInfo,
std::optional<std::string> dumpFile) {
taskInfo_ = taskInfo;
- if (debugModeEnabled_) {
+ if (debugModeEnabled_ || dumpFile.has_value()) {
try {
auto planJson = substraitFromPbToJson("Plan", data, size, dumpFile);
- LOG(INFO) << std::string(50, '#') << " received substrait::Plan: " <<
taskInfo_ << std::endl << planJson;
+ LOG_IF(INFO, debugModeEnabled_) << std::string(50, '#') << " received
substrait::Plan: " << taskInfo_ << std::endl
+ << planJson;
} catch (const std::exception& e) {
LOG(WARNING) << "Error converting Substrait plan to JSON: " << e.what();
}
@@ -64,11 +65,12 @@ void VeloxRuntime::parsePlan(
}
void VeloxRuntime::parseSplitInfo(const uint8_t* data, int32_t size,
std::optional<std::string> dumpFile) {
- if (debugModeEnabled_) {
+ if (debugModeEnabled_ || dumpFile.has_value()) {
try {
auto splitJson = substraitFromPbToJson("ReadRel.LocalFiles", data, size,
dumpFile);
- LOG(INFO) << std::string(50, '#') << " received
substrait::ReadRel.LocalFiles: " << taskInfo_ << std::endl
- << splitJson;
+ LOG_IF(INFO, debugModeEnabled_) << std::string(50, '#')
+ << " received
substrait::ReadRel.LocalFiles: " << taskInfo_ << std::endl
+ << splitJson;
} catch (const std::exception& e) {
LOG(WARNING) << "Error converting Substrait plan to JSON: " << e.what();
}
@@ -116,9 +118,7 @@ std::shared_ptr<ResultIterator>
VeloxRuntime::createResultIterator(
const std::string& spillDir,
const std::vector<std::shared_ptr<ResultIterator>>& inputs,
const std::unordered_map<std::string, std::string>& sessionConf) {
- if (debugModeEnabled_) {
- LOG(INFO) << "VeloxRuntime session config:" << printConfig(confMap_);
- }
+ LOG_IF(INFO, debugModeEnabled_) << "VeloxRuntime session config:" <<
printConfig(confMap_);
VeloxPlanConverter veloxPlanConverter(
inputs, getLeafVeloxPool(memoryManager).get(), sessionConf,
writeFilesTempPath_);
diff --git a/docs/developers/MicroBenchmarks.md
b/docs/developers/MicroBenchmarks.md
index 7a18430fe..62a2f09e9 100644
--- a/docs/developers/MicroBenchmarks.md
+++ b/docs/developers/MicroBenchmarks.md
@@ -9,26 +9,28 @@ parent: Developer Overview
**This document explains how to use the existing micro benchmark template in
Gluten Cpp.**
-A micro benchmark for Velox backend is provided in Gluten Cpp to simulate the
execution of a first or middle stage in
-Spark.
-It serves as a more convenient alternative to debug in Gluten Cpp comparing
with directly debugging in a Spark job.
-Developers can use it to create their own workloads, debug in native process,
profile the hotspot and do optimizations.
+A micro benchmark for Velox backend is provided in Gluten Cpp to simulate the
execution of a first
+or middle stage in Spark. It serves as a more convenient alternative to debug
in Gluten Cpp
+compared with directly debugging in a Spark job. Developers can use it to
create their own
+workloads, debug in native process, profile the hotspot and do optimizations.
-To simulate a first stage, you need to dump the Substrait plan and input split
info into two JSON files. The input URIs
-of the splits should be exising file locations, which can be either local or
HDFS paths.
+To simulate a first stage, you need to dump the Substrait plan and input split
info into two JSON
+files. The input URIs of the splits should be existing file locations, which
can be either local or
+HDFS paths.
-To simulate a middle stage, in addition to the JSON file, you also need to
save the input data of this stage into
-Parquet files.
-The benchmark will load the data into Arrow format, then add Arrow2Velox to
feed
-the data into Velox pipeline to reproduce the reducer stage. Shuffle exchange
is not included.
+To simulate a middle stage, in addition to the JSON file, you also need to
save the input data of
+this stage into Parquet files. The benchmark will load the data into Arrow
format, then add
+Arrow2Velox to feed the data into Velox pipeline to reproduce the reducer
stage. Shuffle exchange is
+not included.
-Please refer to the sections below to learn how to dump the Substrait plan and
create the input data files.
+Please refer to the sections below to learn how to dump the Substrait plan and
create the input data
+files.
## Try the example
-To run a micro benchmark, user should provide one file that contains the
Substrait plan in JSON format, and optional
-one or more input data files in parquet format.
-The commands below help to generate example input files:
+To run a micro benchmark, the user should provide one file that contains the
Substrait plan in JSON
+format, and optionally one or more input data files in parquet format. The
commands below help to
+generate example input files:
```shell
cd /path/to/gluten/
@@ -53,7 +55,8 @@ gluten/backends-velox/generated-native-benchmark/
└── _SUCCESS
```
-Run micro benchmark with the generated files as input. You need to specify the
**absolute** path to the input files:
+Run micro benchmark with the generated files as input. You need to specify the
**absolute** path to
+the input files:
```shell
cd /path/to/gluten/cpp/build/velox/benchmarks
@@ -115,22 +118,40 @@ cd /path/to/gluten/
First, get the Stage Id from spark UI for the stage you want to simulate.
And then re-run the query with below configurations to dump the inputs to
micro benchmark.
-| Parameters | Description
| Recommend Setting |
-|---------------------------------------------|----------------------------------------------------------------------------------------------------------------|-------------------|
-| spark.gluten.sql.benchmark_task.stageId | Spark task stage id
| target stage id |
-| spark.gluten.sql.benchmark_task.partitionId | Spark task partition id,
default value -1 means all the partition of this stage
| 0 |
-| spark.gluten.sql.benchmark_task.taskId | If not specify partition id,
use spark task attempt id, default value -1 means all the partition of this
stage | target task attemp id |
-| spark.gluten.saveDir | Directory to save the inputs
to micro benchmark, should exist and be empty.
| /path/to/saveDir |
+| Parameters | Description
| Recommended Setting |
+|---------------------------------------------|----------------------------------------------------------------------------------------------------------------|-----------------------|
+| spark.gluten.sql.benchmark_task.stageId | Spark task stage id
| target stage id |
+| spark.gluten.sql.benchmark_task.partitionId | Spark task partition id,
default value -1 means all the partition of this stage
| 0 |
+| spark.gluten.sql.benchmark_task.taskId | If not specify partition id,
use spark task attempt id, default value -1 means all the partition of this
stage | target task attempt id |
+| spark.gluten.saveDir | Directory to save the inputs
to micro benchmark, should exist and be empty.
| /path/to/saveDir |
+
+Check the files in `spark.gluten.saveDir`. If the simulated stage is a first
stage, you will get 3
+or 4 types of dumped file:
+
+- Configuration file: INI formatted, file name
`conf_[stageId]_[partitionId].ini`. Contains the
+ configurations to init Velox backend and runtime session.
+- Plan file: JSON formatted, file name `plan_[stageId]_[partitionId].json`.
Contains the substrait
+ plan to the stage, without input file splits.
+- Split file: JSON formatted, file name
`split_[stageId]_[partitionId]_[splitIndex].json`. There can
+ be more than one split file in a first stage task. Contains the substrait
plan piece to the input
+ file splits.
+- Data file(optional): Parquet formatted, file
+ name `data_[stageId]_[partitionId]_[iteratorIndex].parquet`. If the first
stage contains one or
+ more BHJ operators, there can be one or more input data files. The input
data files of a first
+ stage will be loaded as iterators to serve as the inputs for the pipeline:
+```
+"localFiles": {
+ "items": [
+ {
+ "uriFile": "iterator:0"
+ }
+ ]
+}
+```
-Check the files in `spark.gluten.saveDir`. If the simulated stage is a first
stage, which means mixed of scan operator and optional shuffle read, you will
get 4 types of dumped file:
-
-- Configuration file: INI formatted, file name
`conf_[stageId]_[partitionId].ini`. Contains the configurations to init Velox
backend and runtime session.
-- Plan file: JSON formatted, file name `plan_[stageId]_[partitionId].json`.
Contains the substrait plan to the stage, without input file splits.
-- Split file: JSON formatted, file name
`split_[stageId]_[partitionId]_[splitIndex].json`. There can be more than one
split file used for scan operator, contains the substrait plan piece to the
input file splits.
-- Data file(optional): Parquet formatted, file name
`data_[stageId]_[partitionId]_[iteratorIndex].json`. There can be more than one
input data file used for shuffle read. The input data files of a middle stage
will be loaded as iterators to serve as the inputs for the pipeline:
-
-Run benchmark. By default, the result will be printed to stdout. You can use
`--noprint-result` to suppress this output.
+Run benchmark. By default, the result will be printed to stdout. You can use
`--noprint-result` to
+suppress this output.
Sample command:
@@ -139,23 +160,37 @@ cd /path/to/gluten/cpp/build/velox/benchmarks
./generic_benchmark \
--conf /absolute_path/to/conf_[stageId]_[partitionId].ini \
--plan /absolute_path/to/plan_[stageId]_[partitionId].json \
---split
/absolut_path/to/split_[stageId]_[partitionId]_0.parquet,/absolut_path/to/split_[stageId]_[partitionId]_1.parquet
\
+--split
/absolut_path/to/split_[stageId]_[partitionId]_0.json,/absolut_path/to/split_[stageId]_[partitionId]_1.json
\
+--threads 1 --noprint-result
+
+# If the stage requires data files, use --data-file to specify the absolute
path.
+cd /path/to/gluten/cpp/build/velox/benchmarks
+./generic_benchmark \
+--conf /absolute_path/to/conf_[stageId]_[partitionId].ini \
+--plan /absolute_path/to/plan_[stageId]_[partitionId].json \
+--split
/absolut_path/to/split_[stageId]_[partitionId]_0.json,/absolut_path/to/split_[stageId]_[partitionId]_1.json
\
+--data
/absolut_path/to/data_[stageId]_[partitionId]_0.parquet,/absolut_path/to/data_[stageId]_[partitionId]_1.parquet
\
--threads 1 --noprint-result
```
-If the simulated stage is a middle stage, which means pure shuffle stage, you
will get 3 types of dumped file:
+If the simulated stage is a middle stage, which means pure shuffle stage, you
will get 3 types of
+dumped file:
-- Configuration file: INI formatted, file name
`conf_[stageId]_[partitionId].ini`. Contains the configurations to init Velox
backend and runtime session.
-- Plan file: JSON formatted, file name `plan_[stageId]_[partitionId].json`.
Contains the substrait plan to the stage.
-- Data file: Parquet formatted, file name
`data_[stageId]_[partitionId]_[iteratorIndex].json`. There can be more than one
input data file used for shuffle read. The input data files of a middle stage
will be loaded as iterators to serve as the inputs for the pipeline:
+- Configuration file: INI formatted, file name
`conf_[stageId]_[partitionId].ini`. Contains the
+ configurations to init Velox backend and runtime session.
+- Plan file: JSON formatted, file name `plan_[stageId]_[partitionId].json`.
Contains the substrait
+ plan to the stage.
+- Data file: Parquet formatted, file name
`data_[stageId]_[partitionId]_[iteratorIndex].parquet`.
+ There can be more than one input data file in a middle stage task. The input
data files of a
+ middle stage will be loaded as iterators to serve as the inputs for the
pipeline:
-```json
+```
"localFiles": {
-"items": [
-{
-"uriFile": "iterator:0"
-}
-]
+ "items": [
+ {
+ "uriFile": "iterator:0"
+ }
+ ]
}
```
@@ -170,8 +205,8 @@ cd /path/to/gluten/cpp/build/velox/benchmarks
--threads 1 --noprint-result
```
-For some complex queries, stageId may cannot represent the Substrait plan
input, please get the taskId from spark UI,
-and get your target parquet from saveDir.
+For some complex queries, stageId may not represent the Substrait plan
input, please get the
+taskId from spark UI, and get your target parquet from saveDir.
In this example, only one partition input with partition id 2, taskId is 36,
iterator length is 2.
@@ -183,44 +218,55 @@ cd /path/to/gluten/cpp/build/velox/benchmarks
--threads 1 --noprint-result
```
-## Save ouput to parquet to analyze
+## Save output to parquet for analysis
+
+You can save the output to a parquet file via `--save-output <output>`
-You can save the output to a parquet file to analyze.
+Note: 1. This option cannot be used together with `--with-shuffle`. 2. This
option cannot be used
+for write tasks. Please refer to section [Simulate write
tasks](#simulate-write-tasks) for more
+details.
```shell
cd /path/to/gluten/cpp/build/velox/benchmarks
./generic_benchmark \
--plan /absolute_path/to/plan.json \
--data /absolute_path/to/data.parquet
---threads 1 --noprint-result --write-file=/absolute_path/to/result.parquet
+--threads 1 --noprint-result --save-output /absolute_path/to/result.parquet
```
-## Simulate write task
+## Add shuffle write process
-Write path can be specified by `--write_path` option, default is /tmp.
+You can add the shuffle write process at the end of the pipeline via
`--with-shuffle`
+
+Note: 1. This option cannot be used together with `--save-output`. 2. This
option cannot be used
+for write tasks. Please refer to section [Simulate write
tasks](#simulate-write-tasks) for more
+details.
```shell
cd /path/to/gluten/cpp/build/velox/benchmarks
./generic_benchmark \
--plan /absolute_path/to/plan.json \
--split /absolute_path/to/split.json \
---write_path /absolute_path/<dir>
+--threads 1 --noprint-result --with-shuffle
```
-## Add shuffle write process
+## Simulate write tasks
-You can add the shuffle write process at the end of this stage. Note that this
will ignore the `--write-file` option.
+The last operator for a write task is a file write operator, and the output
from Velox pipeline only
+contains several columns of statistics data. Therefore, specifying
+options `--with-shuffle` and `--save-output` does not take effect. You can
specify the output path
+for the writer via `--write-path` option. Default is /tmp.
```shell
cd /path/to/gluten/cpp/build/velox/benchmarks
./generic_benchmark \
--plan /absolute_path/to/plan.json \
--split /absolute_path/to/split.json \
---threads 1 --noprint-result --with-shuffle
+--write-path /absolute_path/<dir>
```
-By default, the compression codec for shuffle outputs is LZ4. You can switch
to other codecs by adding one of the
-following argument flags to the command:
+By default, the compression codec for shuffle outputs is LZ4. You can switch
to other codecs by
+adding one of the following argument flags to the command:
- --zstd: ZSTD codec, compression level 1
- --qat-gzip: QAT GZIP codec, compression level 1
@@ -228,19 +274,19 @@ following argument flags to the command:
- --iaa-gzip: IAA GZIP codec, compression level 1
Note using QAT or IAA codec requires Gluten cpp is built with these features.
-Please check the corresponding section in [Velox
document](../get-started/Velox.md) first for how to setup, build and
-enable these features in Gluten.
-For QAT support, please
+Please check the corresponding section in [Velox
document](../get-started/Velox.md) first for how to
+setup, build and enable these features in Gluten. For QAT support, please
check [Intel® QuickAssist Technology (QAT)
support](../get-started/Velox.md#intel-quickassist-technology-qat-support).
For IAA support, please
check [Intel® In-memory Analytics Accelerator (IAA/IAX)
support](../get-started/Velox.md#intel-in-memory-analytics-accelerator-iaaiax-support)
## Simulate Spark with multiple processes and threads
-You can use below command to launch several processes and threads to simulate
parallel execution on Spark. Each thread
-in the same process will be pinned to the core number starting from `--cpu`.
+You can use below command to launch several processes and threads to simulate
parallel execution on
+Spark. Each thread in the same process will be pinned to the core number
starting from `--cpu`.
-Suppose running on a baremetal machine with 48C, 2-socket, HT-on, launching
below command will utilize all vcores.
+Suppose running on a baremetal machine with 48C, 2-socket, HT-on, launching
below command will
+utilize all vcores.
```shell
processes=24 # Same value of spark.executor.instances
@@ -251,9 +297,9 @@ for ((i=0; i<${processes}; i++)); do
done
```
-If you want to add the shuffle write process, you can specify multiple
direcotries by setting environment
-variable `GLUTEN_SPARK_LOCAL_DIRS` to a comma-separated string for shuffle
write to spread the I/O pressure to multiple
-disks.
+If you want to add the shuffle write process, you can specify multiple
directories by setting
+environment variable `GLUTEN_SPARK_LOCAL_DIRS` to a comma-separated string for
shuffle write to
+spread the I/O pressure to multiple disks.
```shell
mkdir -p {/data1,/data2,/data3}/tmp # Make sure each directory has been
already created.
@@ -270,29 +316,32 @@ done
### Run Examples
We also provide some example inputs in
[cpp/velox/benchmarks/data](../../cpp/velox/benchmarks/data).
-E.g.
[generic_q5/q5_first_stage_0.json](../../cpp/velox/benchmarks/data/generic_q5/q5_first_stage_0.json)
simulates a
-first-stage in TPCH Q5, which has the the most heaviest table scan. You can
follow below steps to run this example.
+E.g. Files under [generic_q5](../../cpp/velox/benchmarks/data/generic_q5)
simulate a first-stage in
+TPCH Q5, which has a heavy table scan. You can follow below steps to run this
example.
+
+1.
-1. Open
[generic_q5/q5_first_stage_0.json](../../cpp/velox/benchmarks/data/generic_q5/q5_first_stage_0_split.json)
with
- file editor. Search for `"uriFile": "LINEITEM"` and replace `LINEITEM` with
the URI to one partition file in
- lineitem. In the next line, replace the number in `"length": "..."` with
the actual file length. Suppose you are
- using the provided small TPCH table
- in
[cpp/velox/benchmarks/data/tpch_sf10m](../../cpp/velox/benchmarks/data/tpch_sf10m),
the replaced JSON should be
- like:
+Open
[generic_q5/q5_first_stage_0_split.json](../../cpp/velox/benchmarks/data/generic_q5/q5_first_stage_0_split.json)
+with file editor. Search for `"uriFile": "LINEITEM"` and replace `LINEITEM`
with the URI to one
+partition file in lineitem. In the next line, replace the number in `"length":
"..."` with the
+actual file length. Suppose you are using the provided small TPCH table
+in
[cpp/velox/benchmarks/data/tpch_sf10m](../../cpp/velox/benchmarks/data/tpch_sf10m),
the replaced
+JSON should be like:
```
{
- "items": [
- {
- "uriFile":
"file:///path/to/gluten/cpp/velox/benchmarks/data/tpch_sf10m/lineitem/part-00000-6c374e0a-7d76-401b-8458-a8e31f8ab704-c000.snappy.parquet",
- "length": "1863237",
- "parquet": {}
- }
- ]
+ "items": [
+ {
+ "uriFile":
"file:///path/to/gluten/cpp/velox/benchmarks/data/tpch_sf10m/lineitem/part-00000-6c374e0a-7d76-401b-8458-a8e31f8ab704-c000.snappy.parquet",
+ "length": "1863237",
+ "parquet": {}
+ }
+ ]
}
```
-2. Launch multiple processes and multiple threads. Set
`GLUTEN_SPARK_LOCAL_DIRS` and add --with-shuffle to the command.
+2. Launch multiple processes and multiple threads. Set
`GLUTEN_SPARK_LOCAL_DIRS` and add
+ `--with-shuffle` to the command.
```
mkdir -p {/data1,/data2,/data3}/tmp # Make sure each directory has been
already created.
@@ -306,8 +355,9 @@ for ((i=0; i<${processes}; i++)); do
done >stdout.log 2>stderr.log
```
-You can find the "elapsed_time" and other metrics in stdout.log. In below
output, the "elapsed_time" is ~10.75s. If you
-run TPCH Q5 with Gluten on Spark, a single task in the same Spark stage should
take about the same time.
+You can find the "elapsed_time" and other metrics in stdout.log. In below
output, the "elapsed_time"
+is ~10.75s. If you run TPCH Q5 with Gluten on Spark, a single task in the same
Spark stage should
+take about the same time.
```
------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]