This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 90ccc17e8 KUDU-1261 update CFile {en,de}coders for array data type
90ccc17e8 is described below

commit 90ccc17e861d74bb625cbb1d9c52ac9073f21d40
Author: Alexey Serbin <ale...@apache.org>
AuthorDate: Tue Jun 10 10:33:44 2025 -0700

    KUDU-1261 update CFile {en,de}coders for array data type
    
    This changelist introduces a new 'CFileFooterPB.is_type_array' field
    to distinguish CFiles containing array data blocks [1].  When writing
    a CFile with array data blocks, CFile writer implementations must also
    set the corresponding bit in the 'CFileFooterPB.incompatible_features'
    field (the new bit will be introduced in a follow-up changelist)
    along with setting the new 'is_type_array' field to 'true'.
    
    This changelist also updates the encoder/decoder registry logic
    to register and look up encoders/decoders for array data blocks.
    
    [1] https://gerrit.cloudera.org/#/c/22058
    
    Change-Id: Ie84debb6dd8f81ce013cb3330b07c6abe2534c0b
    Reviewed-on: http://gerrit.cloudera.org:8080/23145
    Tested-by: Alexey Serbin <ale...@apache.org>
    Reviewed-by: Abhishek Chennaka <achenn...@cloudera.com>
---
 src/kudu/cfile/cfile.proto       |  8 +++++
 src/kudu/cfile/cfile_reader.cc   |  3 +-
 src/kudu/cfile/encoding-test.cc  | 72 +++++++++++++++++++++++++++++++++++++---
 src/kudu/cfile/type_encodings.cc | 26 +++++++++++++--
 src/kudu/common/types.cc         | 14 ++++++++
 src/kudu/common/types.h          |  4 +++
 6 files changed, 119 insertions(+), 8 deletions(-)

diff --git a/src/kudu/cfile/cfile.proto b/src/kudu/cfile/cfile.proto
index b76759b42..02a1ea3b2 100644
--- a/src/kudu/cfile/cfile.proto
+++ b/src/kudu/cfile/cfile.proto
@@ -79,6 +79,14 @@ message CFileFooterPB {
 
   optional bool is_type_nullable = 8 [default=false];
 
+  // Whether this CFile contains array data blocks: that's the way to represent
+  // one-dimensional arrays of scalar types. If set to 'true', the 'data_type'
+  // field corresponds to the array element type, and there must be
+  // corresponding bit set in the 'incompatible_features' field to prevent
+  // misinterpreting the data in this CFile by readers that do not recognize
+  // array data blocks.
+  optional bool is_type_array = 12 [default=false];
+
   // Block pointer for dictionary block if the cfile is dictionary encoded.
   // Only for dictionary encoding.
   optional BlockPointerPB dict_block_ptr = 9;
diff --git a/src/kudu/cfile/cfile_reader.cc b/src/kudu/cfile/cfile_reader.cc
index 93c8d72fb..ffd24bd5f 100644
--- a/src/kudu/cfile/cfile_reader.cc
+++ b/src/kudu/cfile/cfile_reader.cc
@@ -189,7 +189,8 @@ Status CFileReader::InitOnce(const IOContext* io_context) {
         IncompatibleFeatures::SUPPORTED));
   }
 
-  type_info_ = GetTypeInfo(footer_->data_type());
+  type_info_ = footer_->is_type_array() ? 
GetArrayTypeInfo(footer_->data_type())
+                                        : GetTypeInfo(footer_->data_type());
 
   RETURN_NOT_OK(TypeEncodingInfo::Get(type_info_,
                                       footer_->encoding(),
diff --git a/src/kudu/cfile/encoding-test.cc b/src/kudu/cfile/encoding-test.cc
index 93b8e631a..03fd3bc38 100644
--- a/src/kudu/cfile/encoding-test.cc
+++ b/src/kudu/cfile/encoding-test.cc
@@ -68,6 +68,7 @@ using std::shared_ptr;
 using std::string;
 using std::unique_ptr;
 using std::vector;
+using strings::Substitute;
 
 namespace kudu {
 namespace cfile {
@@ -95,21 +96,32 @@ class TestEncoding : public KuduTest {
     ASSERT_EQ(1, n);
   }
 
-  unique_ptr<BlockBuilder> CreateBlockBuilderOrDie(DataType type,
+  unique_ptr<BlockBuilder> CreateBlockBuilderOrDie(const TypeInfo* type_info,
                                                    EncodingType encoding) {
     const TypeEncodingInfo* tei;
-    CHECK_OK(TypeEncodingInfo::Get(GetTypeInfo(type), encoding, &tei));
+    CHECK_OK(TypeEncodingInfo::Get(type_info, encoding, &tei));
     return tei->CreateBlockBuilder(&default_write_options_);
   }
 
-  static unique_ptr<BlockDecoder> CreateBlockDecoderOrDie(DataType type,
+  unique_ptr<BlockBuilder> CreateBlockBuilderOrDie(DataType type,
+                                                   EncodingType encoding) {
+    return CreateBlockBuilderOrDie(GetTypeInfo(type), encoding);
+  }
+
+  static unique_ptr<BlockDecoder> CreateBlockDecoderOrDie(const TypeInfo* 
type_info,
                                                           EncodingType 
encoding,
                                                           
scoped_refptr<BlockHandle> block) {
     const TypeEncodingInfo* tei;
-    CHECK_OK(TypeEncodingInfo::Get(GetTypeInfo(type), encoding, &tei));
+    CHECK_OK(TypeEncodingInfo::Get(type_info, encoding, &tei));
     return tei->CreateBlockDecoder(std::move(block), 
/*parent_cfile_iter=*/nullptr);
   }
 
+  static unique_ptr<BlockDecoder> CreateBlockDecoderOrDie(DataType type,
+                                                          EncodingType 
encoding,
+                                                          
scoped_refptr<BlockHandle> block) {
+    return CreateBlockDecoderOrDie(GetTypeInfo(type), encoding, 
std::move(block));
+  }
+
   // Insert a given number of strings into the provided BlockBuilder.
   //
   // The strings are generated using the provided 'formatter' function.
@@ -390,7 +402,7 @@ class TestEncoding : public KuduTest {
     ASSERT_OK(ibd->ParseHeader());
 
     // Benchmark seeking
-    LOG_TIMING(INFO, strings::Substitute("Seeking in $0 block", 
TypeTraits<IntType>::name())) {
+    LOG_TIMING(INFO, Substitute("Seeking in $0 block", 
TypeTraits<IntType>::name())) {
       for (int i = 0; i < num_queries; i++) {
         bool exact = false;
         // Seek to a random value which falls between data[0] and 
max_seek_target
@@ -438,6 +450,22 @@ class TestEncoding : public KuduTest {
     ASSERT_FALSE(bd->HasNext());
   }
 
+  void TestEmptyArrayBlockEncodeDecode(DataType type, EncodingType encoding) {
+    const TypeInfo* ti = GetArrayTypeInfo(type);
+    ASSERT_NE(nullptr, ti);
+    auto bb = CreateBlockBuilderOrDie(ti, encoding);
+    scoped_refptr<BlockHandle> block = FinishAndMakeContiguous(bb.get(), 0);
+    ASSERT_GT(block->data().size(), 0);
+    LOG(INFO) << Substitute(
+        "$0 array $1: encoded size for 0 items: $2",
+        DataType_Name(type), EncodingType_Name(encoding), 
block->data().size());
+
+    auto bd = CreateBlockDecoderOrDie(ti, encoding, std::move(block));
+    ASSERT_OK(bd->ParseHeader());
+    ASSERT_EQ(0, bd->Count());
+    ASSERT_FALSE(bd->HasNext());
+  }
+
   template <DataType Type>
   void TestEncodeDecodeTemplateBlockEncoder(const typename 
TypeTraits<Type>::cpp_type* src,
                                             uint32_t size,
@@ -846,6 +874,40 @@ TEST_F(TestEncoding, 
TestBinaryPrefixEmptyBlockEncodeDecode) {
   TestEmptyBlockEncodeDecode(BINARY, PREFIX_ENCODING);
 }
 
+// Test plain encode/decode for blocks of empty 1D arrays. This is to make
+// sure the corresponding encoders/decoders can be resolved as expected.
+TEST_F(TestEncoding, PlainEmptyArrayBlockEncodeDecode) {
+  for (auto elem_type : { UINT8,
+                          INT8,
+                          UINT16,
+                          INT16,
+                          UINT32,
+                          INT32,
+                          UINT64,
+                          INT64,
+                          INT128,
+                          STRING,
+                          BOOL,
+                          FLOAT,
+                          DOUBLE,
+                          BINARY,
+                          UNIXTIME_MICROS,
+                          DECIMAL32,
+                          DECIMAL64,
+                          DECIMAL128,
+                          VARCHAR,
+                          DATE }) {
+    SCOPED_TRACE(Substitute("PLAIN block encoding/decoding for $0",
+                            DataType_Name(elem_type)));
+    NO_FATALS(TestEmptyArrayBlockEncodeDecode(elem_type, PLAIN_ENCODING));
+  }
+}
+
+TEST_F(TestEncoding, BinaryPrefixEmptyArrayBlockEncodeDecode) {
+  NO_FATALS(TestEmptyArrayBlockEncodeDecode(STRING, PREFIX_ENCODING));
+  NO_FATALS(TestEmptyArrayBlockEncodeDecode(BINARY, PREFIX_ENCODING));
+}
+
 // Test encode/decode of a binary block with various-sized truncations.
 TEST_F(TestEncoding, TestBinaryPlainBlockBuilderTruncation) {
   TestBinaryBlockTruncation<BinaryPlainBlockDecoder>(PLAIN_ENCODING);
diff --git a/src/kudu/cfile/type_encodings.cc b/src/kudu/cfile/type_encodings.cc
index 25d7f2922..574a5fd86 100644
--- a/src/kudu/cfile/type_encodings.cc
+++ b/src/kudu/cfile/type_encodings.cc
@@ -247,13 +247,35 @@ class TypeEncodingResolver {
 Status TypeEncodingInfo::Get(const TypeInfo* typeinfo,
                              EncodingType encoding,
                              const TypeEncodingInfo** out) {
+  if (!typeinfo->is_array()) {
+    return Singleton<TypeEncodingResolver>::get()->GetTypeEncodingInfo(
+        typeinfo->physical_type(), encoding, out);
+  }
+
+  // For 1D arrays, the encoder is chosen based on the type of the array's
+  // elements. This logic works only for 1D arrays of scalar types.
+  const auto* elem_type_info = GetArrayElementTypeInfo(*typeinfo);
+  DCHECK(elem_type_info);
+  DCHECK_NE(NESTED, elem_type_info->type());
+
   return Singleton<TypeEncodingResolver>::get()->GetTypeEncodingInfo(
-      typeinfo->physical_type(), encoding, out);
+      elem_type_info->physical_type(), encoding, out);
 }
 
 EncodingType TypeEncodingInfo::GetDefaultEncoding(const TypeInfo* typeinfo) {
+  if (!typeinfo->is_array()) {
+    return Singleton<TypeEncodingResolver>::get()->GetDefaultEncoding(
+        typeinfo->physical_type());
+  }
+
+  // For 1D arrays, the encoder is chosen based on the type of the array's
+  // elements. This logic works only for 1D arrays of scalar types.
+  const auto* elem_type_info = GetArrayElementTypeInfo(*typeinfo);
+  DCHECK(elem_type_info);
+  DCHECK_NE(NESTED, elem_type_info->type());
+
   return Singleton<TypeEncodingResolver>::get()->GetDefaultEncoding(
-      typeinfo->physical_type());
+      elem_type_info->physical_type());
 }
 
 }  // namespace cfile
diff --git a/src/kudu/common/types.cc b/src/kudu/common/types.cc
index 97d4b3b58..be540da41 100644
--- a/src/kudu/common/types.cc
+++ b/src/kudu/common/types.cc
@@ -22,6 +22,7 @@
 #include <unordered_map>
 #include <utility>
 
+#include "kudu/gutil/port.h"
 #include "kudu/gutil/singleton.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/gutil/walltime.h"
@@ -148,6 +149,19 @@ const TypeInfo* GetArrayTypeInfo(DataType element_type) {
                                                          /*is_nested=*/true);
 }
 
+const TypeInfo* GetArrayElementTypeInfo(const TypeInfo& typeinfo) {
+  const auto* nested_type_desc = typeinfo.nested_type_info();
+  if (PREDICT_FALSE(!nested_type_desc)) {
+    // Not a NESTED type.
+    return nullptr;
+  }
+  if (PREDICT_FALSE(!nested_type_desc->is_array())) {
+    // Non an array.
+    return nullptr;
+  }
+  return DCHECK_NOTNULL(nested_type_desc->array().elem_type_info());
+}
+
 void DataTypeTraits<DATE>::AppendDebugStringForValue(const void* val, string* 
str) {
   constexpr static const char* const kDateFormat = "%F"; // the ISO 8601 date 
format
   static constexpr time_t kSecondsInDay = 24 * 60 * 60;
diff --git a/src/kudu/common/types.h b/src/kudu/common/types.h
index bd8d58d1d..e0e7cd0ac 100644
--- a/src/kudu/common/types.h
+++ b/src/kudu/common/types.h
@@ -189,6 +189,10 @@ class TypeInfo {
   const AreConsecutiveFunc are_consecutive_func_;
 };
 
+// Given TypeInfo of an array, return pointer to the TypeInfo for the array's
+// elements or nullptr if the specified TypeInfo isn't of a NESTED array type.
+const TypeInfo* GetArrayElementTypeInfo(const TypeInfo& typeinfo);
+
 template<DataType Type>
 struct DataTypeTraits {};
 

Reply via email to