This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 18b1930e8 KUDU-1261 use --array_cell_max_elem_num for encoder buffer
estimate
18b1930e8 is described below
commit 18b1930e830609248ee2e26d6216d073ff40ac5d
Author: Alexey Serbin <[email protected]>
AuthorDate: Wed Oct 22 19:16:41 2025 -0700
KUDU-1261 use --array_cell_max_elem_num for encoder buffer estimate
This changelist addresses TODO in cfile_writer.cc, using recently
introduced --array_cell_max_elem_num flag instead of hard-coded
constant to estimate the initial buffer size for non-null/validity
RLE encoders.
I also corrected the estimate for the initial size of the memory buffer
used by RLE encoders that build non-null/validity bitmaps.
Change-Id: I99fbe2fcaf68e8a67eebd5d0059571929418a31b
Reviewed-on: http://gerrit.cloudera.org:8080/23581
Reviewed-by: Abhishek Chennaka <[email protected]>
Tested-by: Alexey Serbin <[email protected]>
---
src/kudu/cfile/cfile_writer.cc | 36 +++++++++++++++++-------------------
1 file changed, 17 insertions(+), 19 deletions(-)
diff --git a/src/kudu/cfile/cfile_writer.cc b/src/kudu/cfile/cfile_writer.cc
index 4f5004f50..036fcb51e 100644
--- a/src/kudu/cfile/cfile_writer.cc
+++ b/src/kudu/cfile/cfile_writer.cc
@@ -70,6 +70,8 @@ DEFINE_bool(cfile_support_arrays, true,
"Support encoding/decoding of arrays in CFile data blocks");
TAG_FLAG(cfile_support_arrays, experimental);
+DECLARE_uint32(array_cell_max_elem_num);
+
using google::protobuf::RepeatedPtrField;
using kudu::fs::BlockCreationTransaction;
using kudu::fs::BlockManager;
@@ -89,9 +91,6 @@ const size_t kChecksumSize = sizeof(uint32_t);
static const size_t kMinBlockSize = 512;
-// TODO(aserbin): source this from a flag?
-static const size_t kMaxElementsInArray = 1024;
-
class NonNullBitmapBuilder {
public:
explicit NonNullBitmapBuilder(size_t initial_row_capacity)
@@ -269,13 +268,21 @@ Status CFileWriter::Start() {
data_block_ = type_encoding_info_->CreateBlockBuilder(&options_);
if (is_array_) {
- // Array data blocks allows nullable elements both for array's elements
- // and for arrays cells themselves.
- size_t nrows =
- (options_.storage_attributes.cfile_block_size + typeinfo_->size() - 1)
/
- typeinfo_->size();
- array_non_null_bitmap_builder_.reset(new NonNullBitmapBuilder(nrows));
- non_null_bitmap_builder_.reset(new NonNullBitmapBuilder(nrows *
kMaxElementsInArray));
+ // Array data blocks allows for nullable elements in array cells and
+ // the cells themselves, so both non_null_bitmap_builder_ and
+ // array_non_null_bitmap_builder_ are needed to maintain non-nullness
+ // (a.k.a. validity) metadata.
+ // The initial estimate for the encoders' memory buffers assumes the worst
+ // case of all cells being single element arrays, and also takes into
+ // account the possibility of going over the configured size for the
+ // CFile block (that's why factor 2 appeared below).
+ const size_t cell_max_elem_num = FLAGS_array_cell_max_elem_num + 1;
+ const size_t elem_size = GetArrayElementTypeInfo(*typeinfo_)->size();
+ const size_t nrows =
+ (2 * options_.storage_attributes.cfile_block_size + elem_size - 1) /
elem_size;
+ array_non_null_bitmap_builder_.reset(new NonNullBitmapBuilder(
+ (nrows + cell_max_elem_num - 1) / cell_max_elem_num));
+ non_null_bitmap_builder_.reset(new NonNullBitmapBuilder(nrows));
array_elem_num_builder_.reset(new ArrayElemNumBuilder(nrows));
} else if (is_nullable_) {
size_t nrows =
@@ -505,15 +512,6 @@ Status CFileWriter::AppendNullableArrayEntries(const
uint8_t* bitmap,
array_elem_num_builder_->Add(0);
}
-#if DCHECK_IS_ON()
- // TODO(aserbin): re-enable this extra validation after
SetNull()/set_null()
- // updates the underlying Slice for previously stored
- // non-null cell's data in ContiguousRow and elsewhere
- // if DCHECK_IS_ON()
- //const Slice* cell = cells_ptr;
- //DCHECK(cell->empty());
-#endif // if DCHECK_IS_ON() ...
-
cells_ptr += cells_num;
cur_cell_idx += cells_num;
} else {