rok commented on code in PR #14341:
URL: https://github.com/apache/arrow/pull/14341#discussion_r1269606187


##########
cpp/src/parquet/encoding_test.cc:
##########
@@ -1908,4 +1907,304 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) {
   CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), 
values));
 }
 
+// ----------------------------------------------------------------------
+// DELTA_BYTE_ARRAY encode/decode tests.
+
+template <typename Type>
+class TestDeltaByteArrayEncoding : public TestEncodingBase<Type> {
+ public:
+  using c_type = typename Type::c_type;
+  static constexpr int TYPE = Type::type_num;
+
+  void InitData(int nvalues, int repeats, double prefixed_probability) {
+    num_values_ = nvalues * repeats;
+    input_bytes_.resize(num_values_ * sizeof(c_type));
+    output_bytes_.resize(num_values_ * sizeof(c_type));
+    draws_ = reinterpret_cast<c_type*>(input_bytes_.data());
+    decode_buf_ = reinterpret_cast<c_type*>(output_bytes_.data());
+    GeneratePrefixedData<c_type>(nvalues, draws_, &data_buffer_, 
prefixed_probability);
+
+    // add some repeated values
+    for (int j = 1; j < repeats; ++j) {
+      for (int i = 0; i < nvalues; ++i) {
+        draws_[nvalues * j + i] = draws_[i];
+      }
+    }
+  }
+
+  void Execute(int nvalues, int repeats, double prefixed_probability) {
+    InitData(nvalues, repeats, prefixed_probability);
+    CheckRoundtrip();
+  }
+
+  void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset,
+                     double null_probability, double prefixed_probability) {
+    InitData(nvalues, repeats, prefixed_probability);
+
+    int64_t size = num_values_ + valid_bits_offset;
+    auto rand = ::arrow::random::RandomArrayGenerator(1923);
+    const auto array = rand.UInt8(size, /*min=*/0, /*max=*/100, 
null_probability);
+    const auto valid_bits = array->null_bitmap_data();
+    if (valid_bits) {
+      CheckRoundtripSpaced(valid_bits, valid_bits_offset);
+    }
+  }
+
+  void CheckRoundtrip() override {
+    auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY,
+                                          /*use_dictionary=*/false, 
descr_.get());
+    auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, 
descr_.get());
+
+    encoder->Put(draws_, num_values_);
+    encode_buffer_ = encoder->FlushValues();
+
+    decoder->SetData(num_values_, encode_buffer_->data(),
+                     static_cast<int>(encode_buffer_->size()));
+    int values_decoded = decoder->Decode(decode_buf_, num_values_);
+    ASSERT_EQ(num_values_, values_decoded);
+    ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_, 
num_values_));
+  }
+
+  void CheckRoundtripSpaced(const uint8_t* valid_bits,
+                            int64_t valid_bits_offset) override {
+    auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY,
+                                          /*use_dictionary=*/false, 
descr_.get());
+    auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, 
descr_.get());
+    int null_count = 0;
+    for (auto i = 0; i < num_values_; i++) {
+      if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) {
+        null_count++;
+      }
+    }
+
+    encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset);
+    encode_buffer_ = encoder->FlushValues();
+    decoder->SetData(num_values_ - null_count, encode_buffer_->data(),
+                     static_cast<int>(encode_buffer_->size()));
+    auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, 
null_count,
+                                                valid_bits, valid_bits_offset);
+    ASSERT_EQ(num_values_, values_decoded);
+    ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced<c_type>(decode_buf_, draws_, 
num_values_,
+                                                        valid_bits, 
valid_bits_offset));
+  }
+
+ protected:
+  USING_BASE_MEMBERS();
+  std::vector<uint8_t> input_bytes_;
+  std::vector<uint8_t> output_bytes_;
+};
+
+using TestDeltaByteArrayEncodingTypes = ::testing::Types<ByteArrayType, 
FLBAType>;
+TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes);
+
+TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) {
+  ASSERT_NO_FATAL_FAILURE(this->Execute(0, /*repeats=*/0, 
/*prefixed_probability=*/0.1));
+  ASSERT_NO_FATAL_FAILURE(this->Execute(250, 5, 0.2));
+  ASSERT_NO_FATAL_FAILURE(this->Execute(2000, 1, 0.3));
+  ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(
+      /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, 
/*null_probability*/
+      0, 0.4));
+  ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(
+      /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64,
+      /*null_probability*/ 0.5, 0.5));
+}
+
+template <typename Type>
+class DeltaByteArrayEncodingDirectPut : public TestEncodingBase<Type> {
+ public:
+  std::unique_ptr<TypedEncoder<Type>> encoder =
+      MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY);
+  std::unique_ptr<TypedDecoder<Type>> decoder =
+      MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY);
+
+  void CheckDirectPut(std::shared_ptr<::arrow::Array> array) {
+    ASSERT_NO_THROW(encoder->Put(*array));
+    auto buf = encoder->FlushValues();
+
+    int num_values = static_cast<int>(array->length() - array->null_count());
+    decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size()));
+
+    typename EncodingTraits<Type>::Accumulator acc;
+    using BuilderType = typename EncodingTraits<Type>::BuilderType;
+    acc.builder = std::make_unique<BuilderType>(array->type(), 
default_memory_pool());
+
+    ASSERT_EQ(num_values,
+              decoder->DecodeArrow(static_cast<int>(array->length()),
+                                   static_cast<int>(array->null_count()),
+                                   array->null_bitmap_data(), array->offset(), 
&acc));
+
+    std::shared_ptr<::arrow::Array> result;
+    ASSERT_OK(acc.builder->Finish(&result));
+    ASSERT_EQ(array->length(), result->length());
+    ASSERT_OK(result->ValidateFull());
+
+    ::arrow::AssertArraysEqual(*array, *result);
+  }
+
+  void CheckRoundtripFLBA() {
+    constexpr int64_t kSize = 50;
+    constexpr int kSeed = 42;
+    constexpr int kByteWidth = 4;
+    ::arrow::random::RandomArrayGenerator rag{kSeed};
+    std::shared_ptr<::arrow::Array> values =
+        rag.FixedSizeBinary(/*size=*/0, /*byte_width=*/kByteWidth);
+    CheckDirectPut(values);
+
+    for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) {
+      rag = ::arrow::random::RandomArrayGenerator(seed);
+      values = rag.FixedSizeBinary(kSize + seed, kByteWidth);
+      CheckDirectPut(values);
+    }

Review Comment:
   Changed.



##########
cpp/src/parquet/encoding_test.cc:
##########
@@ -1908,4 +1907,304 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) {
   CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), 
values));
 }
 
+// ----------------------------------------------------------------------
+// DELTA_BYTE_ARRAY encode/decode tests.
+
+template <typename Type>
+class TestDeltaByteArrayEncoding : public TestEncodingBase<Type> {
+ public:
+  using c_type = typename Type::c_type;
+  static constexpr int TYPE = Type::type_num;
+
+  void InitData(int nvalues, int repeats, double prefixed_probability) {
+    num_values_ = nvalues * repeats;
+    input_bytes_.resize(num_values_ * sizeof(c_type));
+    output_bytes_.resize(num_values_ * sizeof(c_type));
+    draws_ = reinterpret_cast<c_type*>(input_bytes_.data());
+    decode_buf_ = reinterpret_cast<c_type*>(output_bytes_.data());
+    GeneratePrefixedData<c_type>(nvalues, draws_, &data_buffer_, 
prefixed_probability);
+
+    // add some repeated values
+    for (int j = 1; j < repeats; ++j) {
+      for (int i = 0; i < nvalues; ++i) {
+        draws_[nvalues * j + i] = draws_[i];
+      }
+    }
+  }
+
+  void Execute(int nvalues, int repeats, double prefixed_probability) {
+    InitData(nvalues, repeats, prefixed_probability);
+    CheckRoundtrip();
+  }
+
+  void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset,
+                     double null_probability, double prefixed_probability) {
+    InitData(nvalues, repeats, prefixed_probability);
+
+    int64_t size = num_values_ + valid_bits_offset;
+    auto rand = ::arrow::random::RandomArrayGenerator(1923);
+    const auto array = rand.UInt8(size, /*min=*/0, /*max=*/100, 
null_probability);
+    const auto valid_bits = array->null_bitmap_data();
+    if (valid_bits) {
+      CheckRoundtripSpaced(valid_bits, valid_bits_offset);
+    }
+  }
+
+  void CheckRoundtrip() override {
+    auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY,
+                                          /*use_dictionary=*/false, 
descr_.get());
+    auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, 
descr_.get());
+
+    encoder->Put(draws_, num_values_);
+    encode_buffer_ = encoder->FlushValues();
+
+    decoder->SetData(num_values_, encode_buffer_->data(),
+                     static_cast<int>(encode_buffer_->size()));
+    int values_decoded = decoder->Decode(decode_buf_, num_values_);
+    ASSERT_EQ(num_values_, values_decoded);
+    ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_, 
num_values_));
+  }
+
+  void CheckRoundtripSpaced(const uint8_t* valid_bits,
+                            int64_t valid_bits_offset) override {
+    auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY,
+                                          /*use_dictionary=*/false, 
descr_.get());
+    auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, 
descr_.get());
+    int null_count = 0;
+    for (auto i = 0; i < num_values_; i++) {
+      if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) {
+        null_count++;
+      }
+    }
+
+    encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset);
+    encode_buffer_ = encoder->FlushValues();
+    decoder->SetData(num_values_ - null_count, encode_buffer_->data(),
+                     static_cast<int>(encode_buffer_->size()));
+    auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, 
null_count,
+                                                valid_bits, valid_bits_offset);
+    ASSERT_EQ(num_values_, values_decoded);
+    ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced<c_type>(decode_buf_, draws_, 
num_values_,
+                                                        valid_bits, 
valid_bits_offset));
+  }
+
+ protected:
+  USING_BASE_MEMBERS();
+  std::vector<uint8_t> input_bytes_;
+  std::vector<uint8_t> output_bytes_;
+};
+
+using TestDeltaByteArrayEncodingTypes = ::testing::Types<ByteArrayType, 
FLBAType>;
+TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes);
+
+TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) {
+  ASSERT_NO_FATAL_FAILURE(this->Execute(0, /*repeats=*/0, 
/*prefixed_probability=*/0.1));
+  ASSERT_NO_FATAL_FAILURE(this->Execute(250, 5, 0.2));
+  ASSERT_NO_FATAL_FAILURE(this->Execute(2000, 1, 0.3));
+  ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(
+      /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, 
/*null_probability*/
+      0, 0.4));
+  ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(
+      /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64,
+      /*null_probability*/ 0.5, 0.5));
+}
+
+template <typename Type>
+class DeltaByteArrayEncodingDirectPut : public TestEncodingBase<Type> {
+ public:
+  std::unique_ptr<TypedEncoder<Type>> encoder =
+      MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY);
+  std::unique_ptr<TypedDecoder<Type>> decoder =
+      MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY);
+
+  void CheckDirectPut(std::shared_ptr<::arrow::Array> array) {
+    ASSERT_NO_THROW(encoder->Put(*array));
+    auto buf = encoder->FlushValues();
+
+    int num_values = static_cast<int>(array->length() - array->null_count());
+    decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size()));
+
+    typename EncodingTraits<Type>::Accumulator acc;
+    using BuilderType = typename EncodingTraits<Type>::BuilderType;
+    acc.builder = std::make_unique<BuilderType>(array->type(), 
default_memory_pool());
+
+    ASSERT_EQ(num_values,
+              decoder->DecodeArrow(static_cast<int>(array->length()),
+                                   static_cast<int>(array->null_count()),
+                                   array->null_bitmap_data(), array->offset(), 
&acc));
+
+    std::shared_ptr<::arrow::Array> result;
+    ASSERT_OK(acc.builder->Finish(&result));
+    ASSERT_EQ(array->length(), result->length());
+    ASSERT_OK(result->ValidateFull());
+
+    ::arrow::AssertArraysEqual(*array, *result);
+  }
+
+  void CheckRoundtripFLBA() {
+    constexpr int64_t kSize = 50;
+    constexpr int kSeed = 42;
+    constexpr int kByteWidth = 4;
+    ::arrow::random::RandomArrayGenerator rag{kSeed};
+    std::shared_ptr<::arrow::Array> values =
+        rag.FixedSizeBinary(/*size=*/0, /*byte_width=*/kByteWidth);
+    CheckDirectPut(values);
+
+    for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) {
+      rag = ::arrow::random::RandomArrayGenerator(seed);
+      values = rag.FixedSizeBinary(kSize + seed, kByteWidth);
+      CheckDirectPut(values);
+    }
+  }
+
+  void CheckRoundtripByteArray() {
+    constexpr int64_t kSize = 500;
+    constexpr int32_t kMinLength = 0;
+    constexpr int32_t kMaxLength = 10;
+    constexpr int32_t kNumUnique = 10;
+    constexpr double kNullProbability = 0.25;
+    constexpr int kSeed = 42;
+    ::arrow::random::RandomArrayGenerator rag{kSeed};
+    std::shared_ptr<::arrow::Array> values = rag.BinaryWithRepeats(
+        /*size=*/1, /*unique=*/1, kMinLength, kMaxLength, kNullProbability);
+    CheckDirectPut(values);
+
+    for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) {
+      rag = ::arrow::random::RandomArrayGenerator(seed);
+      values = rag.BinaryWithRepeats(kSize, kNumUnique, kMinLength, kMaxLength,
+                                     kNullProbability);
+      CheckDirectPut(values);
+    }
+  }
+
+  void CheckRoundtrip() override {
+    using ArrowType = typename EncodingTraits<Type>::ArrowType;
+    using IsFixedSizeBinary = ::arrow::is_fixed_size_binary_type<ArrowType>;
+
+    if constexpr (IsFixedSizeBinary::value) {
+      CheckRoundtripFLBA();
+    } else {
+      CheckRoundtripByteArray();
+    }
+  }
+
+ protected:
+  USING_BASE_MEMBERS();
+};
+
+TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, 
TestDeltaByteArrayEncodingTypes);
+
+TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) {
+  ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip());
+}
+
+TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) {
+  auto CheckEncode = [](const std::shared_ptr<::arrow::Array>& values,
+                        const std::shared_ptr<Buffer>& encoded) {
+    auto encoder = MakeTypedEncoder<ByteArrayType>(Encoding::DELTA_BYTE_ARRAY);
+    ASSERT_NO_THROW(encoder->Put(*values));
+    auto buf = encoder->FlushValues();
+    ASSERT_TRUE(encoded->Equals(*buf));
+  };
+
+  auto ArrayToInt32Vector = [](const std::shared_ptr<::arrow::Array>& lengths) 
{
+    std::vector<int32_t> arrays;

Review Comment:
   Changed to vector.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to