This is an automated email from the ASF dual-hosted git repository. alexey pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push: new 90ccc17e8 KUDU-1261 update CFile {en,de}coders for array data type 90ccc17e8 is described below commit 90ccc17e861d74bb625cbb1d9c52ac9073f21d40 Author: Alexey Serbin <ale...@apache.org> AuthorDate: Tue Jun 10 10:33:44 2025 -0700 KUDU-1261 update CFile {en,de}coders for array data type This changelist introduces a new 'CFileFooterPB.is_type_array' field to distinguish CFiles containing array data blocks [1]. When writing a CFile with array data blocks, CFile writer implementations must also set the corresponding bit in the 'CFileFooterPB.incompatible_features' field (the new bit will be introduced in a follow-up changelist) along with setting the new 'is_type_array' field to 'true'. This changelist also updates the encoder/decoder registry logic to register and look up encoders/decoders for array data blocks. [1] https://gerrit.cloudera.org/#/c/22058 Change-Id: Ie84debb6dd8f81ce013cb3330b07c6abe2534c0b Reviewed-on: http://gerrit.cloudera.org:8080/23145 Tested-by: Alexey Serbin <ale...@apache.org> Reviewed-by: Abhishek Chennaka <achenn...@cloudera.com> --- src/kudu/cfile/cfile.proto | 8 +++++ src/kudu/cfile/cfile_reader.cc | 3 +- src/kudu/cfile/encoding-test.cc | 72 +++++++++++++++++++++++++++++++++++++--- src/kudu/cfile/type_encodings.cc | 26 +++++++++++++-- src/kudu/common/types.cc | 14 ++++++++ src/kudu/common/types.h | 4 +++ 6 files changed, 119 insertions(+), 8 deletions(-) diff --git a/src/kudu/cfile/cfile.proto b/src/kudu/cfile/cfile.proto index b76759b42..02a1ea3b2 100644 --- a/src/kudu/cfile/cfile.proto +++ b/src/kudu/cfile/cfile.proto @@ -79,6 +79,14 @@ message CFileFooterPB { optional bool is_type_nullable = 8 [default=false]; + // Whether this CFile contains array data blocks: that's the way to represent + // one-dimensional arrays of scalar types. If set to 'true', the 'data_type' + // field corresponds to the array element type, and there must be + // corresponding bit set in the 'incompatible_features' field to prevent + // misinterpreting the data in this CFile by readers that do not recognize + // array data blocks. + optional bool is_type_array = 12 [default=false]; + // Block pointer for dictionary block if the cfile is dictionary encoded. // Only for dictionary encoding. optional BlockPointerPB dict_block_ptr = 9; diff --git a/src/kudu/cfile/cfile_reader.cc b/src/kudu/cfile/cfile_reader.cc index 93c8d72fb..ffd24bd5f 100644 --- a/src/kudu/cfile/cfile_reader.cc +++ b/src/kudu/cfile/cfile_reader.cc @@ -189,7 +189,8 @@ Status CFileReader::InitOnce(const IOContext* io_context) { IncompatibleFeatures::SUPPORTED)); } - type_info_ = GetTypeInfo(footer_->data_type()); + type_info_ = footer_->is_type_array() ? GetArrayTypeInfo(footer_->data_type()) + : GetTypeInfo(footer_->data_type()); RETURN_NOT_OK(TypeEncodingInfo::Get(type_info_, footer_->encoding(), diff --git a/src/kudu/cfile/encoding-test.cc b/src/kudu/cfile/encoding-test.cc index 93b8e631a..03fd3bc38 100644 --- a/src/kudu/cfile/encoding-test.cc +++ b/src/kudu/cfile/encoding-test.cc @@ -68,6 +68,7 @@ using std::shared_ptr; using std::string; using std::unique_ptr; using std::vector; +using strings::Substitute; namespace kudu { namespace cfile { @@ -95,21 +96,32 @@ class TestEncoding : public KuduTest { ASSERT_EQ(1, n); } - unique_ptr<BlockBuilder> CreateBlockBuilderOrDie(DataType type, + unique_ptr<BlockBuilder> CreateBlockBuilderOrDie(const TypeInfo* type_info, EncodingType encoding) { const TypeEncodingInfo* tei; - CHECK_OK(TypeEncodingInfo::Get(GetTypeInfo(type), encoding, &tei)); + CHECK_OK(TypeEncodingInfo::Get(type_info, encoding, &tei)); return tei->CreateBlockBuilder(&default_write_options_); } - static unique_ptr<BlockDecoder> CreateBlockDecoderOrDie(DataType type, + unique_ptr<BlockBuilder> CreateBlockBuilderOrDie(DataType type, + EncodingType encoding) { + return CreateBlockBuilderOrDie(GetTypeInfo(type), encoding); + } + + static unique_ptr<BlockDecoder> CreateBlockDecoderOrDie(const TypeInfo* type_info, EncodingType encoding, scoped_refptr<BlockHandle> block) { const TypeEncodingInfo* tei; - CHECK_OK(TypeEncodingInfo::Get(GetTypeInfo(type), encoding, &tei)); + CHECK_OK(TypeEncodingInfo::Get(type_info, encoding, &tei)); return tei->CreateBlockDecoder(std::move(block), /*parent_cfile_iter=*/nullptr); } + static unique_ptr<BlockDecoder> CreateBlockDecoderOrDie(DataType type, + EncodingType encoding, + scoped_refptr<BlockHandle> block) { + return CreateBlockDecoderOrDie(GetTypeInfo(type), encoding, std::move(block)); + } + // Insert a given number of strings into the provided BlockBuilder. // // The strings are generated using the provided 'formatter' function. @@ -390,7 +402,7 @@ class TestEncoding : public KuduTest { ASSERT_OK(ibd->ParseHeader()); // Benchmark seeking - LOG_TIMING(INFO, strings::Substitute("Seeking in $0 block", TypeTraits<IntType>::name())) { + LOG_TIMING(INFO, Substitute("Seeking in $0 block", TypeTraits<IntType>::name())) { for (int i = 0; i < num_queries; i++) { bool exact = false; // Seek to a random value which falls between data[0] and max_seek_target @@ -438,6 +450,22 @@ class TestEncoding : public KuduTest { ASSERT_FALSE(bd->HasNext()); } + void TestEmptyArrayBlockEncodeDecode(DataType type, EncodingType encoding) { + const TypeInfo* ti = GetArrayTypeInfo(type); + ASSERT_NE(nullptr, ti); + auto bb = CreateBlockBuilderOrDie(ti, encoding); + scoped_refptr<BlockHandle> block = FinishAndMakeContiguous(bb.get(), 0); + ASSERT_GT(block->data().size(), 0); + LOG(INFO) << Substitute( + "$0 array $1: encoded size for 0 items: $2", + DataType_Name(type), EncodingType_Name(encoding), block->data().size()); + + auto bd = CreateBlockDecoderOrDie(ti, encoding, std::move(block)); + ASSERT_OK(bd->ParseHeader()); + ASSERT_EQ(0, bd->Count()); + ASSERT_FALSE(bd->HasNext()); + } + template <DataType Type> void TestEncodeDecodeTemplateBlockEncoder(const typename TypeTraits<Type>::cpp_type* src, uint32_t size, @@ -846,6 +874,40 @@ TEST_F(TestEncoding, TestBinaryPrefixEmptyBlockEncodeDecode) { TestEmptyBlockEncodeDecode(BINARY, PREFIX_ENCODING); } +// Test plain encode/decode for blocks of empty 1D arrays. This is to make +// sure the corresponding encoders/decoders can be resolved as expected. +TEST_F(TestEncoding, PlainEmptyArrayBlockEncodeDecode) { + for (auto elem_type : { UINT8, + INT8, + UINT16, + INT16, + UINT32, + INT32, + UINT64, + INT64, + INT128, + STRING, + BOOL, + FLOAT, + DOUBLE, + BINARY, + UNIXTIME_MICROS, + DECIMAL32, + DECIMAL64, + DECIMAL128, + VARCHAR, + DATE }) { + SCOPED_TRACE(Substitute("PLAIN block encoding/decoding for $0", + DataType_Name(elem_type))); + NO_FATALS(TestEmptyArrayBlockEncodeDecode(elem_type, PLAIN_ENCODING)); + } +} + +TEST_F(TestEncoding, BinaryPrefixEmptyArrayBlockEncodeDecode) { + NO_FATALS(TestEmptyArrayBlockEncodeDecode(STRING, PREFIX_ENCODING)); + NO_FATALS(TestEmptyArrayBlockEncodeDecode(BINARY, PREFIX_ENCODING)); +} + // Test encode/decode of a binary block with various-sized truncations. TEST_F(TestEncoding, TestBinaryPlainBlockBuilderTruncation) { TestBinaryBlockTruncation<BinaryPlainBlockDecoder>(PLAIN_ENCODING); diff --git a/src/kudu/cfile/type_encodings.cc b/src/kudu/cfile/type_encodings.cc index 25d7f2922..574a5fd86 100644 --- a/src/kudu/cfile/type_encodings.cc +++ b/src/kudu/cfile/type_encodings.cc @@ -247,13 +247,35 @@ class TypeEncodingResolver { Status TypeEncodingInfo::Get(const TypeInfo* typeinfo, EncodingType encoding, const TypeEncodingInfo** out) { + if (!typeinfo->is_array()) { + return Singleton<TypeEncodingResolver>::get()->GetTypeEncodingInfo( + typeinfo->physical_type(), encoding, out); + } + + // For 1D arrays, the encoder is chosen based on the type of the array's + // elements. This logic works only for 1D arrays of scalar types. + const auto* elem_type_info = GetArrayElementTypeInfo(*typeinfo); + DCHECK(elem_type_info); + DCHECK_NE(NESTED, elem_type_info->type()); + return Singleton<TypeEncodingResolver>::get()->GetTypeEncodingInfo( - typeinfo->physical_type(), encoding, out); + elem_type_info->physical_type(), encoding, out); } EncodingType TypeEncodingInfo::GetDefaultEncoding(const TypeInfo* typeinfo) { + if (!typeinfo->is_array()) { + return Singleton<TypeEncodingResolver>::get()->GetDefaultEncoding( + typeinfo->physical_type()); + } + + // For 1D arrays, the encoder is chosen based on the type of the array's + // elements. This logic works only for 1D arrays of scalar types. + const auto* elem_type_info = GetArrayElementTypeInfo(*typeinfo); + DCHECK(elem_type_info); + DCHECK_NE(NESTED, elem_type_info->type()); + return Singleton<TypeEncodingResolver>::get()->GetDefaultEncoding( - typeinfo->physical_type()); + elem_type_info->physical_type()); } } // namespace cfile diff --git a/src/kudu/common/types.cc b/src/kudu/common/types.cc index 97d4b3b58..be540da41 100644 --- a/src/kudu/common/types.cc +++ b/src/kudu/common/types.cc @@ -22,6 +22,7 @@ #include <unordered_map> #include <utility> +#include "kudu/gutil/port.h" #include "kudu/gutil/singleton.h" #include "kudu/gutil/strings/substitute.h" #include "kudu/gutil/walltime.h" @@ -148,6 +149,19 @@ const TypeInfo* GetArrayTypeInfo(DataType element_type) { /*is_nested=*/true); } +const TypeInfo* GetArrayElementTypeInfo(const TypeInfo& typeinfo) { + const auto* nested_type_desc = typeinfo.nested_type_info(); + if (PREDICT_FALSE(!nested_type_desc)) { + // Not a NESTED type. + return nullptr; + } + if (PREDICT_FALSE(!nested_type_desc->is_array())) { + // Non an array. + return nullptr; + } + return DCHECK_NOTNULL(nested_type_desc->array().elem_type_info()); +} + void DataTypeTraits<DATE>::AppendDebugStringForValue(const void* val, string* str) { constexpr static const char* const kDateFormat = "%F"; // the ISO 8601 date format static constexpr time_t kSecondsInDay = 24 * 60 * 60; diff --git a/src/kudu/common/types.h b/src/kudu/common/types.h index bd8d58d1d..e0e7cd0ac 100644 --- a/src/kudu/common/types.h +++ b/src/kudu/common/types.h @@ -189,6 +189,10 @@ class TypeInfo { const AreConsecutiveFunc are_consecutive_func_; }; +// Given TypeInfo of an array, return pointer to the TypeInfo for the array's +// elements or nullptr if the specified TypeInfo isn't of a NESTED array type. +const TypeInfo* GetArrayElementTypeInfo(const TypeInfo& typeinfo); + template<DataType Type> struct DataTypeTraits {};