This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 06817882b92 [Fix](outfile) Add a configuration for exporting data in
Parquet format using `select into outfile` (#36142)
06817882b92 is described below
commit 06817882b92066ff0c37ff53dcd974d33fc1cee1
Author: Tiewei Fang <[email protected]>
AuthorDate: Wed Jun 12 23:00:22 2024 +0800
[Fix](outfile) Add a configuration for exporting data in Parquet format
using `select into outfile` (#36142)
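Add a BE configuration item, `min_row_group_size` (set in be.conf, in bytes,
default 134217728 = 128 MB), that controls the minimum row group size used
when exporting data to Parquet. It replaces the previously hard-coded
128 MB constant in vparquet_transformer.cpp.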
---
be/src/common/config.cpp | 3 +++
be/src/common/config.h | 3 +++
be/src/vec/runtime/vparquet_transformer.cpp | 6 +++---
3 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index f6e2ca7249d..5290acd01cb 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1298,6 +1298,9 @@ DEFINE_mBool(skip_loading_stale_rowset_meta, "false");
DEFINE_Bool(enable_file_logger, "true");
+// The minimum row group size when exporting Parquet files. default 128MB
+DEFINE_Int64(min_row_group_size, "134217728");
+
// clang-format off
#ifdef BE_TEST
// test s3
diff --git a/be/src/common/config.h b/be/src/common/config.h
index cd7ef3c2f47..785cdfe8be5 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1381,6 +1381,9 @@ DECLARE_mBool(skip_loading_stale_rowset_meta);
// Only works when starting BE with --console.
DECLARE_Bool(enable_file_logger);
+// The minimum row group size when exporting Parquet files.
+DECLARE_Int64(min_row_group_size);
+
#ifdef BE_TEST
// test s3
DECLARE_String(test_s3_resource);
diff --git a/be/src/vec/runtime/vparquet_transformer.cpp
b/be/src/vec/runtime/vparquet_transformer.cpp
index 77068adeebe..fbb24e00968 100644
--- a/be/src/vec/runtime/vparquet_transformer.cpp
+++ b/be/src/vec/runtime/vparquet_transformer.cpp
@@ -35,6 +35,7 @@
#include <ostream>
#include <string>
+#include "common/config.h"
#include "common/status.h"
#include "gutil/endian.h"
#include "io/fs/file_writer.h"
@@ -70,8 +71,6 @@
namespace doris::vectorized {
-const uint64_t min_row_group_size = 128 * 1024 * 1024; // 128MB
-
ParquetOutputStream::ParquetOutputStream(doris::io::FileWriter* file_writer)
: _file_writer(file_writer), _cur_pos(0), _written_len(0) {
set_mode(arrow::io::FileMode::WRITE);
@@ -247,6 +246,7 @@ Status VParquetTransformer::_parse_properties() {
}
builder.created_by(
fmt::format("{}({})", doris::get_short_version(),
parquet::DEFAULT_CREATED_BY));
+ builder.max_row_group_length(std::numeric_limits<int64_t>::max());
_parquet_writer_properties = builder.build();
_arrow_properties = parquet::ArrowWriterProperties::Builder()
.enable_deprecated_int96_timestamps()
@@ -296,7 +296,7 @@ Status VParquetTransformer::write(const Block& block) {
RETURN_DORIS_STATUS_IF_ERROR(_writer->WriteRecordBatch(*result));
_write_size += block.bytes();
- if (_write_size >= min_row_group_size) {
+ if (_write_size >= doris::config::min_row_group_size) {
RETURN_DORIS_STATUS_IF_ERROR(_writer->NewBufferedRowGroup());
_write_size = 0;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]