This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 18b1930e8 KUDU-1261 use --array_cell_max_elem_num for encoder buffer 
estimate
18b1930e8 is described below

commit 18b1930e830609248ee2e26d6216d073ff40ac5d
Author: Alexey Serbin <[email protected]>
AuthorDate: Wed Oct 22 19:16:41 2025 -0700

    KUDU-1261 use --array_cell_max_elem_num for encoder buffer estimate
    
    This changelist addresses TODO in cfile_writer.cc, using recently
    introduced --array_cell_max_elem_num flag instead of hard-coded
    constant to estimate the initial buffer size for non-null/validity
    RLE encoders.
    
    I also corrected the estimate for the initial size of the memory buffer
    used by RLE encoders that build non-null/validity bitmaps.
    
    Change-Id: I99fbe2fcaf68e8a67eebd5d0059571929418a31b
    Reviewed-on: http://gerrit.cloudera.org:8080/23581
    Reviewed-by: Abhishek Chennaka <[email protected]>
    Tested-by: Alexey Serbin <[email protected]>
---
 src/kudu/cfile/cfile_writer.cc | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/src/kudu/cfile/cfile_writer.cc b/src/kudu/cfile/cfile_writer.cc
index 4f5004f50..036fcb51e 100644
--- a/src/kudu/cfile/cfile_writer.cc
+++ b/src/kudu/cfile/cfile_writer.cc
@@ -70,6 +70,8 @@ DEFINE_bool(cfile_support_arrays, true,
             "Support encoding/decoding of arrays in CFile data blocks");
 TAG_FLAG(cfile_support_arrays, experimental);
 
+DECLARE_uint32(array_cell_max_elem_num);
+
 using google::protobuf::RepeatedPtrField;
 using kudu::fs::BlockCreationTransaction;
 using kudu::fs::BlockManager;
@@ -89,9 +91,6 @@ const size_t kChecksumSize = sizeof(uint32_t);
 
 static const size_t kMinBlockSize = 512;
 
-// TODO(aserbin): source this from a flag?
-static const size_t kMaxElementsInArray = 1024;
-
 class NonNullBitmapBuilder {
  public:
   explicit NonNullBitmapBuilder(size_t initial_row_capacity)
@@ -269,13 +268,21 @@ Status CFileWriter::Start() {
   data_block_ = type_encoding_info_->CreateBlockBuilder(&options_);
 
   if (is_array_) {
-    // Array data blocks allows nullable elements both for array's elements
-    // and for arrays cells themselves.
-    size_t nrows =
-        (options_.storage_attributes.cfile_block_size + typeinfo_->size() - 1) 
/
-        typeinfo_->size();
-    array_non_null_bitmap_builder_.reset(new NonNullBitmapBuilder(nrows));
-    non_null_bitmap_builder_.reset(new NonNullBitmapBuilder(nrows * 
kMaxElementsInArray));
+    // Array data blocks allows for nullable elements in array cells and
+    // the cells themselves, so both non_null_bitmap_builder_ and
+    // array_non_null_bitmap_builder_ are needed to maintain non-nullness
+    // (a.k.a. validity) metadata.
+    // The initial estimate for the encoders' memory buffers assumes the worst
+    // case of all cells being single element arrays, and also takes into
+    // account the possibility of going over the configured size for the
+    // CFile block (that's why factor 2 appeared below).
+    const size_t cell_max_elem_num = FLAGS_array_cell_max_elem_num + 1;
+    const size_t elem_size = GetArrayElementTypeInfo(*typeinfo_)->size();
+    const size_t nrows =
+        (2 * options_.storage_attributes.cfile_block_size + elem_size - 1) / 
elem_size;
+    array_non_null_bitmap_builder_.reset(new NonNullBitmapBuilder(
+        (nrows + cell_max_elem_num - 1) / cell_max_elem_num));
+    non_null_bitmap_builder_.reset(new NonNullBitmapBuilder(nrows));
     array_elem_num_builder_.reset(new ArrayElemNumBuilder(nrows));
   } else if (is_nullable_) {
     size_t nrows =
@@ -505,15 +512,6 @@ Status CFileWriter::AppendNullableArrayEntries(const 
uint8_t* bitmap,
         array_elem_num_builder_->Add(0);
       }
 
-#if DCHECK_IS_ON()
-      // TODO(aserbin): re-enable this extra validation after 
SetNull()/set_null()
-      //                updates the underlying Slice for previously stored
-      //                non-null cell's data in ContiguousRow and elsewhere
-      //                if DCHECK_IS_ON()
-      //const Slice* cell = cells_ptr;
-      //DCHECK(cell->empty());
-#endif // if DCHECK_IS_ON() ...
-
       cells_ptr += cells_num;
       cur_cell_idx += cells_num;
     } else {

Reply via email to