rok commented on code in PR #14191:
URL: https://github.com/apache/arrow/pull/14191#discussion_r1042303944
##########
cpp/src/parquet/encoding_test.cc:
##########
@@ -1276,5 +1282,108 @@ TEST(ByteStreamSplitEncodeDecode, InvalidDataTypes) {
ASSERT_THROW(MakeTypedDecoder<FLBAType>(Encoding::BYTE_STREAM_SPLIT),
ParquetException);
}
+// ----------------------------------------------------------------------
+// DELTA_BINARY_PACKED encode/decode tests.
+
+template <typename Type>
+class TestDeltaBitPackEncoding : public TestEncodingBase<Type> {
+ public:
+ using c_type = typename Type::c_type;
+ static constexpr int TYPE = Type::type_num;
+
+ void InitBoundData(int nvalues, int repeats) {
+ num_values_ = nvalues * repeats;
+ input_bytes_.resize(num_values_ * sizeof(c_type));
+ output_bytes_.resize(num_values_ * sizeof(c_type));
+ draws_ = reinterpret_cast<c_type*>(input_bytes_.data());
+ decode_buf_ = reinterpret_cast<c_type*>(output_bytes_.data());
+ GenerateBoundData<c_type>(nvalues, draws_, -10, 10, &data_buffer_);
+
+ // add some repeated values
+ for (int j = 1; j < repeats; ++j) {
+ for (int i = 0; i < nvalues; ++i) {
+ draws_[nvalues * j + i] = draws_[i];
+ }
+ }
+ }
+
+ void ExecuteBound(int nvalues, int repeats) {
+ InitBoundData(nvalues, repeats);
+ CheckRoundtrip();
+ }
+
+ void ExecuteSpacedBound(int nvalues, int repeats, int64_t valid_bits_offset,
+ double null_probability) {
+ InitBoundData(nvalues, repeats);
+
+ int64_t size = num_values_ + valid_bits_offset;
+ auto rand = ::arrow::random::RandomArrayGenerator(1923);
+ const auto array = rand.UInt8(size, 0, 100, null_probability);
+ const auto valid_bits = array->null_bitmap_data();
+ if (valid_bits) {
+ CheckRoundtripSpaced(valid_bits, valid_bits_offset);
+ }
+ }
+
+ void CheckRoundtrip() override {
+ auto encoder =
+ MakeTypedEncoder<Type>(Encoding::DELTA_BINARY_PACKED, false,
descr_.get());
+ auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BINARY_PACKED,
descr_.get());
+
+ encoder->Put(draws_, num_values_);
+ encode_buffer_ = encoder->FlushValues();
+
+ decoder->SetData(num_values_, encode_buffer_->data(),
+ static_cast<int>(encode_buffer_->size()));
+ int values_decoded = decoder->Decode(decode_buf_, num_values_);
+ ASSERT_EQ(num_values_, values_decoded);
+ ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_,
num_values_));
+ }
+
+ void CheckRoundtripSpaced(const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ auto encoder =
+ MakeTypedEncoder<Type>(Encoding::DELTA_BINARY_PACKED, false,
descr_.get());
+ auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BINARY_PACKED,
descr_.get());
+ int null_count = 0;
+ for (auto i = 0; i < num_values_; i++) {
+ if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) {
+ null_count++;
+ }
+ }
+
+ encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset);
+ encode_buffer_ = encoder->FlushValues();
+ decoder->SetData(num_values_ - null_count, encode_buffer_->data(),
+ static_cast<int>(encode_buffer_->size()));
+ auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_,
null_count,
+ valid_bits, valid_bits_offset);
+ ASSERT_EQ(num_values_, values_decoded);
+ ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced<c_type>(decode_buf_, draws_,
num_values_,
+ valid_bits,
valid_bits_offset));
+ }
+
+ protected:
+ USING_BASE_MEMBERS();
+ std::vector<uint8_t> input_bytes_;
+ std::vector<uint8_t> output_bytes_;
+};
+
+using TestDeltaBitPackEncodingTypes = ::testing::Types<Int32Type, Int64Type>;
+TYPED_TEST_SUITE(TestDeltaBitPackEncoding, TestDeltaBitPackEncodingTypes);
+
+TYPED_TEST(TestDeltaBitPackEncoding, BasicRoundTrip) {
+ ASSERT_NO_FATAL_FAILURE(this->Execute(25000, 200));
+ ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0));
+ ASSERT_NO_FATAL_FAILURE(this->Execute(2000, 2000));
+ ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(
+ /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_prob*/
0.1));
+ ASSERT_NO_FATAL_FAILURE(this->ExecuteBound(25000, 200));
+ ASSERT_NO_FATAL_FAILURE(this->ExecuteBound(0, 0));
+ ASSERT_NO_FATAL_FAILURE(this->ExecuteBound(2000, 2000));
Review Comment:
This was due to the non-filled miniblocks not having declared bitwidths.
Changed the writer to write 0s. This shouldn't be an issue [according to
spec](https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-encoding-delta_binary_packed--5):
> If, in the last block, less than <number of miniblocks in a block>
miniblocks are needed to store the values, the bytes storing the bit widths of
the unneeded miniblocks are still present, their value should be zero, but
readers must accept arbitrary values as well. There are no additional padding
bytes for the miniblock bodies though, as if their bit widths were 0
(regardless of the actual byte values). The reader knows when to stop reading
by keeping track of the number of values read.
Might be a reader issue.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]