wgtmac commented on code in PR #14341: URL: https://github.com/apache/arrow/pull/14341#discussion_r1257268192
########## cpp/src/parquet/encoding_test.cc: ########## @@ -1910,5 +1909,310 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) { CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); } +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY encode/decode tests. + +template <typename Type> +class TestDeltaByteArrayEncoding : public TestEncodingBase<Type> { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + void InitData(int nvalues, double null_probability) { + const int seed = 42; + auto rand = ::arrow::random::RandomArrayGenerator(seed); + const int min_prefix_length = 0; + const int max_prefix_length = 100; + const int max_element_length = 1000; + const double prefixed_probability = 0.5; Review Comment: ```suggestion constexpr int kMinPrefixLength = 0; constexpr int kMaxPrefixLength = 100; constexpr int kMaxElementLength = 1000; constexpr double kPrefixedProbability = 0.5; constexpr int kSeed = 42; auto rand = ::arrow::random::RandomArrayGenerator(kSeed); ``` Better to follow the convention. ########## cpp/src/parquet/encoding_test.cc: ########## @@ -1910,5 +1909,310 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) { CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); } +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY encode/decode tests. 
+ +template <typename Type> +class TestDeltaByteArrayEncoding : public TestEncodingBase<Type> { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + void InitData(int nvalues, double null_probability) { + const int seed = 42; + auto rand = ::arrow::random::RandomArrayGenerator(seed); + const int min_prefix_length = 0; + const int max_prefix_length = 100; + const int max_element_length = 1000; + const double prefixed_probability = 0.5; + + const auto prefix_array = std::dynamic_pointer_cast<::arrow::StringArray>( + rand.String(nvalues, min_prefix_length, max_prefix_length, null_probability)); + const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( + rand.Boolean(nvalues, + /*true_probability=*/prefixed_probability, + /*null_probability=*/0.0)); + ::arrow::StringBuilder builder(::arrow::default_memory_pool()); + + std::string prefix = ""; + for (int i = 0; i < nvalues; i++) { + if (prefix_array->IsNull(i)) { + PARQUET_THROW_NOT_OK(builder.AppendNull()); + } else { + const std::string element = prefix_array->GetString(i); + if (do_prefix->Value(i) && prefix.length() < max_element_length) { + prefix = prefix.append(element); + } else { + prefix = element; + } + PARQUET_THROW_NOT_OK(builder.Append(prefix)); + } + } + + std::shared_ptr<::arrow::StringArray> array; + ASSERT_OK(builder.Finish(&array)); + num_values_ = static_cast<int>(array->length() - array->null_count()); + draws_ = reinterpret_cast<c_type*>(array->value_data()->mutable_data()); + } + + void Execute(int nvalues, double null_probability) { + InitData(nvalues, null_probability); + CheckRoundtrip(); + } + + void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, Review Comment: `int repeats` seems to be unused? 
########## cpp/src/parquet/encoding_test.cc: ########## @@ -1910,5 +1909,310 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) { CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); } +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY encode/decode tests. + +template <typename Type> +class TestDeltaByteArrayEncoding : public TestEncodingBase<Type> { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + void InitData(int nvalues, double null_probability) { + const int seed = 42; + auto rand = ::arrow::random::RandomArrayGenerator(seed); + const int min_prefix_length = 0; + const int max_prefix_length = 100; + const int max_element_length = 1000; + const double prefixed_probability = 0.5; + + const auto prefix_array = std::dynamic_pointer_cast<::arrow::StringArray>( + rand.String(nvalues, min_prefix_length, max_prefix_length, null_probability)); + const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( + rand.Boolean(nvalues, + /*true_probability=*/prefixed_probability, + /*null_probability=*/0.0)); + ::arrow::StringBuilder builder(::arrow::default_memory_pool()); + + std::string prefix = ""; + for (int i = 0; i < nvalues; i++) { + if (prefix_array->IsNull(i)) { + PARQUET_THROW_NOT_OK(builder.AppendNull()); + } else { + const std::string element = prefix_array->GetString(i); + if (do_prefix->Value(i) && prefix.length() < max_element_length) { + prefix = prefix.append(element); + } else { + prefix = element; + } + PARQUET_THROW_NOT_OK(builder.Append(prefix)); + } + } + + std::shared_ptr<::arrow::StringArray> array; + ASSERT_OK(builder.Finish(&array)); + num_values_ = static_cast<int>(array->length() - array->null_count()); + draws_ = reinterpret_cast<c_type*>(array->value_data()->mutable_data()); + } + + void Execute(int nvalues, double null_probability) { + InitData(nvalues, null_probability); + CheckRoundtrip(); + } + + void 
ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, + double null_probability) { + InitData(nvalues, null_probability); + + int64_t size = num_values_ + valid_bits_offset; + auto rand = ::arrow::random::RandomArrayGenerator(1923); + const auto array = rand.UInt8(size, 0, 100, null_probability); + const auto valid_bits = array->null_bitmap_data(); + if (valid_bits) { + CheckRoundtripSpaced(valid_bits, valid_bits_offset); + } + } + + void CheckRoundtrip() override { + auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, descr_.get()); + + encoder->Put(draws_, num_values_); + encode_buffer_ = encoder->FlushValues(); + + decoder->SetData(num_values_, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + int values_decoded = decoder->Decode(decode_buf_, num_values_); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_, num_values_)); + } + + void CheckRoundtripSpaced(const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, descr_.get()); + int null_count = 0; + for (auto i = 0; i < num_values_; i++) { + if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) { + null_count++; + } + } + + encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset); + encode_buffer_ = encoder->FlushValues(); + decoder->SetData(num_values_ - null_count, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count, + valid_bits, valid_bits_offset); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced<c_type>(decode_buf_, draws_, num_values_, + valid_bits, 
valid_bits_offset)); + } + + protected: + USING_BASE_MEMBERS(); +}; + +using TestDeltaByteArrayEncodingTypes = ::testing::Types<ByteArrayType, FLBAType>; +TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); + +TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { + ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); + // TODO + + // ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ + // 0)); + // + // ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, + // /*null_probability*/ 0.5)); +} + +template <typename Type> +class DeltaByteArrayEncodingDirectPut : public TestEncodingBase<Type> { + public: + std::unique_ptr<TypedEncoder<Type>> encoder = + MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY); + std::unique_ptr<TypedDecoder<Type>> decoder = + MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY); + + void CheckDirectPut(std::shared_ptr<::arrow::Array> array) { + ASSERT_NO_THROW(encoder->Put(*array)); + auto buf = encoder->FlushValues(); + + int num_values = static_cast<int>(array->length() - array->null_count()); + decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size())); + + typename EncodingTraits<Type>::Accumulator acc; + if (::arrow::is_string(array->type()->id())) { + acc.builder = std::make_unique<::arrow::StringBuilder>(); + } else { + acc.builder = std::make_unique<::arrow::BinaryBuilder>(); + } + + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast<int>(array->length()), + static_cast<int>(array->null_count()), + array->null_bitmap_data(), array->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.builder->Finish(&result)); + ASSERT_EQ(array->length(), result->length()); + 
ASSERT_OK(result->ValidateFull()); + + ::arrow::AssertArraysEqual(*array, *result); + } + + void CheckRoundtrip() override { + const int64_t size = 50; + const int32_t min_length = 0; + const int32_t max_length = 10; + const int32_t num_unique = 10; + const double null_probability = 0.25; + + ::arrow::random::RandomArrayGenerator rag{42}; + std::shared_ptr<::arrow::Array> values = + rag.String(0, min_length, max_length, null_probability); + CheckDirectPut(values); + + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + rag = ::arrow::random::RandomArrayGenerator(seed); + values = rag.String(size, min_length, max_length, null_probability); + CheckDirectPut(values); + + values = rag.BinaryWithRepeats(size, num_unique, min_length, max_length, + null_probability); + CheckDirectPut(values); + } + } + + void Execute() { CheckRoundtrip(); } + + protected: + USING_BASE_MEMBERS(); +}; + +using DeltaByteArrayEncodingDirectPutTypes = + ::testing::Types<ByteArrayType>; // TODO: FLBAType +TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, DeltaByteArrayEncodingDirectPutTypes); + +TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) { + ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip()); +} + +TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { + auto CheckEncode = [](std::shared_ptr<::arrow::Array> values, + std::shared_ptr<Buffer> encoded) { Review Comment: Same for below. ########## cpp/src/parquet/encoding_test.cc: ########## @@ -1910,5 +1909,310 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) { CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); } +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY encode/decode tests. 
+ +template <typename Type> +class TestDeltaByteArrayEncoding : public TestEncodingBase<Type> { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + void InitData(int nvalues, double null_probability) { + const int seed = 42; + auto rand = ::arrow::random::RandomArrayGenerator(seed); + const int min_prefix_length = 0; + const int max_prefix_length = 100; + const int max_element_length = 1000; + const double prefixed_probability = 0.5; + + const auto prefix_array = std::dynamic_pointer_cast<::arrow::StringArray>( + rand.String(nvalues, min_prefix_length, max_prefix_length, null_probability)); + const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( + rand.Boolean(nvalues, + /*true_probability=*/prefixed_probability, + /*null_probability=*/0.0)); + ::arrow::StringBuilder builder(::arrow::default_memory_pool()); + + std::string prefix = ""; + for (int i = 0; i < nvalues; i++) { + if (prefix_array->IsNull(i)) { + PARQUET_THROW_NOT_OK(builder.AppendNull()); + } else { + const std::string element = prefix_array->GetString(i); + if (do_prefix->Value(i) && prefix.length() < max_element_length) { + prefix = prefix.append(element); + } else { + prefix = element; + } + PARQUET_THROW_NOT_OK(builder.Append(prefix)); + } + } + + std::shared_ptr<::arrow::StringArray> array; + ASSERT_OK(builder.Finish(&array)); + num_values_ = static_cast<int>(array->length() - array->null_count()); + draws_ = reinterpret_cast<c_type*>(array->value_data()->mutable_data()); + } + + void Execute(int nvalues, double null_probability) { + InitData(nvalues, null_probability); + CheckRoundtrip(); + } + + void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, + double null_probability) { + InitData(nvalues, null_probability); + + int64_t size = num_values_ + valid_bits_offset; + auto rand = ::arrow::random::RandomArrayGenerator(1923); + const auto array = rand.UInt8(size, 0, 100, null_probability); Review Comment: 
```suggestion const auto array = rand.UInt8(size, /*min=*/0, /*max=*/100, null_probability); ``` ########## cpp/src/parquet/encoding_test.cc: ########## @@ -1910,5 +1909,310 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) { CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); } +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY encode/decode tests. + +template <typename Type> +class TestDeltaByteArrayEncoding : public TestEncodingBase<Type> { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + void InitData(int nvalues, double null_probability) { + const int seed = 42; + auto rand = ::arrow::random::RandomArrayGenerator(seed); + const int min_prefix_length = 0; + const int max_prefix_length = 100; + const int max_element_length = 1000; + const double prefixed_probability = 0.5; + + const auto prefix_array = std::dynamic_pointer_cast<::arrow::StringArray>( + rand.String(nvalues, min_prefix_length, max_prefix_length, null_probability)); + const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( + rand.Boolean(nvalues, + /*true_probability=*/prefixed_probability, + /*null_probability=*/0.0)); + ::arrow::StringBuilder builder(::arrow::default_memory_pool()); + + std::string prefix = ""; + for (int i = 0; i < nvalues; i++) { + if (prefix_array->IsNull(i)) { + PARQUET_THROW_NOT_OK(builder.AppendNull()); + } else { + const std::string element = prefix_array->GetString(i); + if (do_prefix->Value(i) && prefix.length() < max_element_length) { + prefix = prefix.append(element); + } else { + prefix = element; + } + PARQUET_THROW_NOT_OK(builder.Append(prefix)); + } + } + + std::shared_ptr<::arrow::StringArray> array; + ASSERT_OK(builder.Finish(&array)); + num_values_ = static_cast<int>(array->length() - array->null_count()); + draws_ = reinterpret_cast<c_type*>(array->value_data()->mutable_data()); + } + + void Execute(int nvalues, 
double null_probability) { + InitData(nvalues, null_probability); + CheckRoundtrip(); + } + + void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, + double null_probability) { + InitData(nvalues, null_probability); + + int64_t size = num_values_ + valid_bits_offset; + auto rand = ::arrow::random::RandomArrayGenerator(1923); + const auto array = rand.UInt8(size, 0, 100, null_probability); + const auto valid_bits = array->null_bitmap_data(); + if (valid_bits) { + CheckRoundtripSpaced(valid_bits, valid_bits_offset); + } + } + + void CheckRoundtrip() override { + auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, descr_.get()); + + encoder->Put(draws_, num_values_); + encode_buffer_ = encoder->FlushValues(); + + decoder->SetData(num_values_, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + int values_decoded = decoder->Decode(decode_buf_, num_values_); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_, num_values_)); + } + + void CheckRoundtripSpaced(const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, descr_.get()); + int null_count = 0; + for (auto i = 0; i < num_values_; i++) { + if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) { + null_count++; + } + } + + encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset); + encode_buffer_ = encoder->FlushValues(); + decoder->SetData(num_values_ - null_count, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count, + valid_bits, valid_bits_offset); + ASSERT_EQ(num_values_, values_decoded); + 
ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced<c_type>(decode_buf_, draws_, num_values_, + valid_bits, valid_bits_offset)); + } + + protected: + USING_BASE_MEMBERS(); +}; + +using TestDeltaByteArrayEncodingTypes = ::testing::Types<ByteArrayType, FLBAType>; +TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); + +TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { + ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); + // TODO + + // ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ + // 0)); + // + // ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, + // /*null_probability*/ 0.5)); +} + +template <typename Type> +class DeltaByteArrayEncodingDirectPut : public TestEncodingBase<Type> { + public: + std::unique_ptr<TypedEncoder<Type>> encoder = + MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY); + std::unique_ptr<TypedDecoder<Type>> decoder = + MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY); + + void CheckDirectPut(std::shared_ptr<::arrow::Array> array) { + ASSERT_NO_THROW(encoder->Put(*array)); + auto buf = encoder->FlushValues(); + + int num_values = static_cast<int>(array->length() - array->null_count()); + decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size())); + + typename EncodingTraits<Type>::Accumulator acc; + if (::arrow::is_string(array->type()->id())) { + acc.builder = std::make_unique<::arrow::StringBuilder>(); + } else { + acc.builder = std::make_unique<::arrow::BinaryBuilder>(); + } + + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast<int>(array->length()), + static_cast<int>(array->null_count()), + array->null_bitmap_data(), array->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + 
ASSERT_OK(acc.builder->Finish(&result)); + ASSERT_EQ(array->length(), result->length()); + ASSERT_OK(result->ValidateFull()); + + ::arrow::AssertArraysEqual(*array, *result); + } + + void CheckRoundtrip() override { + const int64_t size = 50; + const int32_t min_length = 0; + const int32_t max_length = 10; + const int32_t num_unique = 10; + const double null_probability = 0.25; + + ::arrow::random::RandomArrayGenerator rag{42}; Review Comment: ```suggestion constexpr int64_t kSize = 50; constexpr int32_t kMinLength = 0; constexpr int32_t kMaxLength = 10; constexpr int32_t kNumUnique = 10; constexpr double kNullProbability = 0.25; constexpr int kSeed = 42; ::arrow::random::RandomArrayGenerator rag{kSeed}; ``` ########## cpp/src/parquet/encoding_test.cc: ########## @@ -1910,5 +1909,310 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) { CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); } +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY encode/decode tests. 
+ +template <typename Type> +class TestDeltaByteArrayEncoding : public TestEncodingBase<Type> { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + void InitData(int nvalues, double null_probability) { + const int seed = 42; + auto rand = ::arrow::random::RandomArrayGenerator(seed); + const int min_prefix_length = 0; + const int max_prefix_length = 100; + const int max_element_length = 1000; + const double prefixed_probability = 0.5; + + const auto prefix_array = std::dynamic_pointer_cast<::arrow::StringArray>( + rand.String(nvalues, min_prefix_length, max_prefix_length, null_probability)); + const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( + rand.Boolean(nvalues, + /*true_probability=*/prefixed_probability, + /*null_probability=*/0.0)); + ::arrow::StringBuilder builder(::arrow::default_memory_pool()); + + std::string prefix = ""; + for (int i = 0; i < nvalues; i++) { + if (prefix_array->IsNull(i)) { + PARQUET_THROW_NOT_OK(builder.AppendNull()); + } else { + const std::string element = prefix_array->GetString(i); + if (do_prefix->Value(i) && prefix.length() < max_element_length) { + prefix = prefix.append(element); + } else { + prefix = element; + } + PARQUET_THROW_NOT_OK(builder.Append(prefix)); + } + } + + std::shared_ptr<::arrow::StringArray> array; + ASSERT_OK(builder.Finish(&array)); + num_values_ = static_cast<int>(array->length() - array->null_count()); + draws_ = reinterpret_cast<c_type*>(array->value_data()->mutable_data()); + } + + void Execute(int nvalues, double null_probability) { + InitData(nvalues, null_probability); + CheckRoundtrip(); + } + + void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, + double null_probability) { + InitData(nvalues, null_probability); + + int64_t size = num_values_ + valid_bits_offset; + auto rand = ::arrow::random::RandomArrayGenerator(1923); + const auto array = rand.UInt8(size, 0, 100, null_probability); + const auto valid_bits = 
array->null_bitmap_data(); + if (valid_bits) { + CheckRoundtripSpaced(valid_bits, valid_bits_offset); + } + } + + void CheckRoundtrip() override { + auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, descr_.get()); + + encoder->Put(draws_, num_values_); + encode_buffer_ = encoder->FlushValues(); + + decoder->SetData(num_values_, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + int values_decoded = decoder->Decode(decode_buf_, num_values_); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_, num_values_)); + } + + void CheckRoundtripSpaced(const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, descr_.get()); + int null_count = 0; + for (auto i = 0; i < num_values_; i++) { + if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) { + null_count++; + } + } + + encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset); + encode_buffer_ = encoder->FlushValues(); + decoder->SetData(num_values_ - null_count, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count, + valid_bits, valid_bits_offset); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced<c_type>(decode_buf_, draws_, num_values_, + valid_bits, valid_bits_offset)); + } + + protected: + USING_BASE_MEMBERS(); +}; + +using TestDeltaByteArrayEncodingTypes = ::testing::Types<ByteArrayType, FLBAType>; +TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); + +TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { + ASSERT_NO_FATAL_FAILURE(this->Execute(0, 
0)); + // TODO + + // ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ + // 0)); + // + // ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, + // /*null_probability*/ 0.5)); Review Comment: Could we uncomment these lines now? ########## cpp/src/parquet/encoding_test.cc: ########## @@ -1910,5 +1909,310 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) { CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); } +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY encode/decode tests. + +template <typename Type> +class TestDeltaByteArrayEncoding : public TestEncodingBase<Type> { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + void InitData(int nvalues, double null_probability) { + const int seed = 42; + auto rand = ::arrow::random::RandomArrayGenerator(seed); + const int min_prefix_length = 0; + const int max_prefix_length = 100; + const int max_element_length = 1000; + const double prefixed_probability = 0.5; + + const auto prefix_array = std::dynamic_pointer_cast<::arrow::StringArray>( + rand.String(nvalues, min_prefix_length, max_prefix_length, null_probability)); + const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( + rand.Boolean(nvalues, + /*true_probability=*/prefixed_probability, + /*null_probability=*/0.0)); + ::arrow::StringBuilder builder(::arrow::default_memory_pool()); + + std::string prefix = ""; + for (int i = 0; i < nvalues; i++) { + if (prefix_array->IsNull(i)) { + PARQUET_THROW_NOT_OK(builder.AppendNull()); + } else { + const std::string element = prefix_array->GetString(i); + if (do_prefix->Value(i) && prefix.length() 
< max_element_length) { + prefix = prefix.append(element); + } else { + prefix = element; + } + PARQUET_THROW_NOT_OK(builder.Append(prefix)); + } + } + + std::shared_ptr<::arrow::StringArray> array; + ASSERT_OK(builder.Finish(&array)); + num_values_ = static_cast<int>(array->length() - array->null_count()); + draws_ = reinterpret_cast<c_type*>(array->value_data()->mutable_data()); + } + + void Execute(int nvalues, double null_probability) { + InitData(nvalues, null_probability); + CheckRoundtrip(); + } + + void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, + double null_probability) { + InitData(nvalues, null_probability); + + int64_t size = num_values_ + valid_bits_offset; + auto rand = ::arrow::random::RandomArrayGenerator(1923); + const auto array = rand.UInt8(size, 0, 100, null_probability); + const auto valid_bits = array->null_bitmap_data(); + if (valid_bits) { + CheckRoundtripSpaced(valid_bits, valid_bits_offset); + } + } + + void CheckRoundtrip() override { + auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, descr_.get()); + + encoder->Put(draws_, num_values_); + encode_buffer_ = encoder->FlushValues(); + + decoder->SetData(num_values_, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + int values_decoded = decoder->Decode(decode_buf_, num_values_); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_, num_values_)); + } + + void CheckRoundtripSpaced(const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, descr_.get()); + int null_count = 0; + for (auto i = 0; i < num_values_; i++) { + if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) { + 
null_count++; + } + } + + encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset); + encode_buffer_ = encoder->FlushValues(); + decoder->SetData(num_values_ - null_count, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count, + valid_bits, valid_bits_offset); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced<c_type>(decode_buf_, draws_, num_values_, + valid_bits, valid_bits_offset)); + } + + protected: + USING_BASE_MEMBERS(); +}; + +using TestDeltaByteArrayEncodingTypes = ::testing::Types<ByteArrayType, FLBAType>; +TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); + +TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { + ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); + // TODO + + // ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ + // 0)); + // + // ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, + // /*null_probability*/ 0.5)); +} + +template <typename Type> +class DeltaByteArrayEncodingDirectPut : public TestEncodingBase<Type> { + public: + std::unique_ptr<TypedEncoder<Type>> encoder = + MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY); + std::unique_ptr<TypedDecoder<Type>> decoder = + MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY); + + void CheckDirectPut(std::shared_ptr<::arrow::Array> array) { + ASSERT_NO_THROW(encoder->Put(*array)); + auto buf = encoder->FlushValues(); + + int num_values = static_cast<int>(array->length() - array->null_count()); + decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size())); + + typename EncodingTraits<Type>::Accumulator acc; + if 
(::arrow::is_string(array->type()->id())) { + acc.builder = std::make_unique<::arrow::StringBuilder>(); + } else { + acc.builder = std::make_unique<::arrow::BinaryBuilder>(); + } + + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast<int>(array->length()), + static_cast<int>(array->null_count()), + array->null_bitmap_data(), array->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.builder->Finish(&result)); + ASSERT_EQ(array->length(), result->length()); + ASSERT_OK(result->ValidateFull()); + + ::arrow::AssertArraysEqual(*array, *result); + } + + void CheckRoundtrip() override { + const int64_t size = 50; + const int32_t min_length = 0; + const int32_t max_length = 10; + const int32_t num_unique = 10; + const double null_probability = 0.25; + + ::arrow::random::RandomArrayGenerator rag{42}; + std::shared_ptr<::arrow::Array> values = + rag.String(0, min_length, max_length, null_probability); + CheckDirectPut(values); + + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + rag = ::arrow::random::RandomArrayGenerator(seed); + values = rag.String(size, min_length, max_length, null_probability); + CheckDirectPut(values); + + values = rag.BinaryWithRepeats(size, num_unique, min_length, max_length, + null_probability); + CheckDirectPut(values); + } + } + + void Execute() { CheckRoundtrip(); } + + protected: + USING_BASE_MEMBERS(); +}; + +using DeltaByteArrayEncodingDirectPutTypes = + ::testing::Types<ByteArrayType>; // TODO: FLBAType +TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, DeltaByteArrayEncodingDirectPutTypes); + +TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) { + ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip()); +} + +TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { + auto CheckEncode = [](std::shared_ptr<::arrow::Array> values, + std::shared_ptr<Buffer> encoded) { + auto encoder = MakeTypedEncoder<ByteArrayType>(Encoding::DELTA_BYTE_ARRAY); + ASSERT_NO_THROW(encoder->Put(*values)); + auto buf = 
encoder->FlushValues(); + ASSERT_TRUE(encoded->Equals(*buf)); + }; + + auto arrayToI32 = [](const std::shared_ptr<::arrow::Array>& lengths) { Review Comment: ```suggestion auto ArrayToInt32Vector = [](const std::shared_ptr<::arrow::Array>& lengths) { ``` ########## cpp/src/parquet/encoding_test.cc: ########## @@ -1910,5 +1909,310 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) { CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); } +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY encode/decode tests. + +template <typename Type> +class TestDeltaByteArrayEncoding : public TestEncodingBase<Type> { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + void InitData(int nvalues, double null_probability) { + const int seed = 42; + auto rand = ::arrow::random::RandomArrayGenerator(seed); + const int min_prefix_length = 0; + const int max_prefix_length = 100; + const int max_element_length = 1000; + const double prefixed_probability = 0.5; + + const auto prefix_array = std::dynamic_pointer_cast<::arrow::StringArray>( + rand.String(nvalues, min_prefix_length, max_prefix_length, null_probability)); + const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( + rand.Boolean(nvalues, + /*true_probability=*/prefixed_probability, + /*null_probability=*/0.0)); + ::arrow::StringBuilder builder(::arrow::default_memory_pool()); + + std::string prefix = ""; + for (int i = 0; i < nvalues; i++) { + if (prefix_array->IsNull(i)) { + PARQUET_THROW_NOT_OK(builder.AppendNull()); + } else { + const std::string element = prefix_array->GetString(i); + if (do_prefix->Value(i) && prefix.length() < max_element_length) { + prefix = prefix.append(element); + } else { + prefix = element; + } + PARQUET_THROW_NOT_OK(builder.Append(prefix)); + } + } + + std::shared_ptr<::arrow::StringArray> array; + ASSERT_OK(builder.Finish(&array)); + num_values_ = 
static_cast<int>(array->length() - array->null_count()); + draws_ = reinterpret_cast<c_type*>(array->value_data()->mutable_data()); + } + + void Execute(int nvalues, double null_probability) { + InitData(nvalues, null_probability); + CheckRoundtrip(); + } + + void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, + double null_probability) { + InitData(nvalues, null_probability); + + int64_t size = num_values_ + valid_bits_offset; + auto rand = ::arrow::random::RandomArrayGenerator(1923); + const auto array = rand.UInt8(size, 0, 100, null_probability); + const auto valid_bits = array->null_bitmap_data(); + if (valid_bits) { + CheckRoundtripSpaced(valid_bits, valid_bits_offset); + } + } + + void CheckRoundtrip() override { + auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, descr_.get()); + + encoder->Put(draws_, num_values_); + encode_buffer_ = encoder->FlushValues(); + + decoder->SetData(num_values_, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + int values_decoded = decoder->Decode(decode_buf_, num_values_); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_, num_values_)); + } + + void CheckRoundtripSpaced(const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, descr_.get()); + int null_count = 0; + for (auto i = 0; i < num_values_; i++) { + if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) { + null_count++; + } + } + + encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset); + encode_buffer_ = encoder->FlushValues(); + decoder->SetData(num_values_ - null_count, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); 
+ auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count, + valid_bits, valid_bits_offset); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced<c_type>(decode_buf_, draws_, num_values_, + valid_bits, valid_bits_offset)); + } + + protected: + USING_BASE_MEMBERS(); +}; + +using TestDeltaByteArrayEncodingTypes = ::testing::Types<ByteArrayType, FLBAType>; +TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); + +TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { + ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); + // TODO + + // ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ + // 0)); + // + // ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, + // /*null_probability*/ 0.5)); +} + +template <typename Type> +class DeltaByteArrayEncodingDirectPut : public TestEncodingBase<Type> { + public: + std::unique_ptr<TypedEncoder<Type>> encoder = + MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY); + std::unique_ptr<TypedDecoder<Type>> decoder = + MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY); + + void CheckDirectPut(std::shared_ptr<::arrow::Array> array) { + ASSERT_NO_THROW(encoder->Put(*array)); + auto buf = encoder->FlushValues(); + + int num_values = static_cast<int>(array->length() - array->null_count()); + decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size())); + + typename EncodingTraits<Type>::Accumulator acc; + if (::arrow::is_string(array->type()->id())) { + acc.builder = std::make_unique<::arrow::StringBuilder>(); + } else { + acc.builder = std::make_unique<::arrow::BinaryBuilder>(); + } + + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast<int>(array->length()), 
+ static_cast<int>(array->null_count()), + array->null_bitmap_data(), array->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.builder->Finish(&result)); + ASSERT_EQ(array->length(), result->length()); + ASSERT_OK(result->ValidateFull()); + + ::arrow::AssertArraysEqual(*array, *result); + } + + void CheckRoundtrip() override { + const int64_t size = 50; + const int32_t min_length = 0; + const int32_t max_length = 10; + const int32_t num_unique = 10; + const double null_probability = 0.25; + + ::arrow::random::RandomArrayGenerator rag{42}; + std::shared_ptr<::arrow::Array> values = + rag.String(0, min_length, max_length, null_probability); + CheckDirectPut(values); + + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + rag = ::arrow::random::RandomArrayGenerator(seed); + values = rag.String(size, min_length, max_length, null_probability); + CheckDirectPut(values); + + values = rag.BinaryWithRepeats(size, num_unique, min_length, max_length, + null_probability); + CheckDirectPut(values); + } + } + + void Execute() { CheckRoundtrip(); } + + protected: + USING_BASE_MEMBERS(); +}; + +using DeltaByteArrayEncodingDirectPutTypes = + ::testing::Types<ByteArrayType>; // TODO: FLBAType +TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, DeltaByteArrayEncodingDirectPutTypes); + +TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) { + ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip()); +} + +TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { + auto CheckEncode = [](std::shared_ptr<::arrow::Array> values, + std::shared_ptr<Buffer> encoded) { Review Comment: ```suggestion auto CheckEncode = [](const std::shared_ptr<::arrow::Array>& values, const std::shared_ptr<Buffer>& encoded) { ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org