pitrou commented on code in PR #50217:
URL: https://github.com/apache/arrow/pull/50217#discussion_r3452904387
##########
cpp/src/arrow/util/bit_util.h:
##########
@@ -176,6 +179,38 @@ static constexpr bool GetBitFromByte(uint8_t byte, uint8_t
i) {
return byte & kBitmask[i];
}
+template <typename Uint>
+struct CopyBitsParams {
+ Uint src = {};
+ Uint dst = {};
+ Uint start = {};
+ Uint end = {};
Review Comment:
Not sure why it's useful to template `start` and `end`.
##########
cpp/src/arrow/util/rle_bitmap_test.cc:
##########
@@ -0,0 +1,598 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <array>
+#include <cstdint>
+#include <random>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/rle_bitmap_internal.h"
+#include "arrow/util/rle_encoding_internal.h"
+
+namespace arrow::util {
+
+namespace {
+
+/// Make a vector of `size` pseudo-random bytes, deterministic for a given
`seed`.
+std::vector<uint8_t> MakeRandomBytes(size_t size, uint32_t seed = 56) {
+ std::vector<uint8_t> bytes(size);
+ std::minstd_rand gen(seed);
Review Comment:
Hmm, the `default_random_engine` might give better randomness, though not
sure it's useful here.
##########
cpp/src/arrow/util/rle_bitmap_test.cc:
##########
@@ -0,0 +1,598 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <array>
+#include <cstdint>
+#include <random>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/rle_bitmap_internal.h"
+#include "arrow/util/rle_encoding_internal.h"
+
+namespace arrow::util {
+
+namespace {
+
+/// Make a vector of `size` pseudo-random bytes, deterministic for a given
`seed`.
+std::vector<uint8_t> MakeRandomBytes(size_t size, uint32_t seed = 56) {
+ std::vector<uint8_t> bytes(size);
+ std::minstd_rand gen(seed);
+ std::uniform_int_distribution<uint8_t> dist(0, 255);
+ for (auto& byte : bytes) {
+ byte = dist(gen);
+ }
+ return bytes;
+}
+
+/// Read the first `count` bits of `bytes` (LSB first) into a vector of
booleans.
+std::vector<bool> BitsFromBytes(const std::vector<uint8_t>& bytes, rle_size_t
count) {
+ std::vector<bool> bits(count);
+ for (rle_size_t i = 0; i < count; ++i) {
+ bits[i] = bit_util::GetBit(bytes.data(), i);
+ }
+ return bits;
+}
+
+struct CheckDecodedBitsParams {
+ const std::vector<uint8_t>& actual;
+ const std::vector<bool>& expected;
+ rle_size_t count;
+ rle_size_t actual_start_bit = 0;
+ rle_size_t expected_start_idx = 0;
+};
+
+/// Check the decoded output in `out` against `expected`.
+void CheckDecodedBits(const CheckDecodedBitsParams& params) {
+ ARROW_SCOPED_TRACE("out_start_bit = ", params.actual_start_bit,
+ ", expected_start_idx = ", params.expected_start_idx);
+ for (rle_size_t i = 0; i < params.count; ++i) {
+ ASSERT_EQ(bit_util::GetBit(params.actual.data(), params.actual_start_bit +
i),
+ params.expected[params.expected_start_idx + i])
+ << "first difference at bit " << i;
+ }
+}
+
+struct CheckBitsEqualParams {
+ const std::vector<uint8_t>& actual;
+ const std::vector<uint8_t>& expected;
+ rle_size_t count;
+ rle_size_t actual_start_bit = 0;
+ rle_size_t expected_start_bit = 0;
+};
+
+/// Check that two bit ranges, stored in `actual` and `expected`, are equal.
+void CheckBitsEqual(const CheckBitsEqualParams& params) {
+ ARROW_SCOPED_TRACE("actual_start_bit = ", params.actual_start_bit,
+ ", expected_start_bit = ", params.expected_start_bit);
+ for (rle_size_t i = 0; i < params.count; ++i) {
+ ASSERT_EQ(bit_util::GetBit(params.actual.data(), params.actual_start_bit +
i),
+ bit_util::GetBit(params.expected.data(),
params.expected_start_bit + i))
+ << "first difference at bit " << i;
+ }
+}
+
+/// Skip the first `expected_skip` values with Advance(), then decode the rest
of the run
+/// into one output bitmap, `chunk_size` values at a time. Compare against
`expected`.
+///
+/// `chunk_size` controls output bit alignment. When `chunk_size` is not a
multiple of 8,
+/// later calls start at a non-zero output bit offset.
+///
+/// `expected_skip` shifts the decoder's read offset relative to the output
offset.
+/// A non-zero `expected_skip` makes the two differ, which exercises the
bit-unaligned
+/// read path of BitPackedRunToBitmapDecoder. With `expected_skip == 0` they
stay in sync
+/// and only the aligned path runs.
+template <typename Decoder>
+void CheckDecoderValuesChunked(const typename Decoder::RunType& run,
+ const std::vector<bool>& expected,
+ rle_size_t chunk_size = 1, rle_size_t
expected_skip = 0) {
+ ARROW_SCOPED_TRACE("chunk_size = ", chunk_size, ", expected_skip = ",
expected_skip);
+
+ const auto n_vals = static_cast<rle_size_t>(expected.size());
+ ASSERT_LE(expected_skip, n_vals);
+
+ Decoder decoder(run);
+ const auto advanced = decoder.Advance(expected_skip);
+ ASSERT_EQ(advanced, expected_skip);
+ const auto n_vals_to_decode = n_vals - expected_skip;
+
+ // Output buffer
+ const auto n_bytes =
static_cast<size_t>(bit_util::BytesForBits(n_vals_to_decode));
+ std::vector<uint8_t> out(n_bytes, 0);
+
+ rle_size_t n_val_read = 0;
+ while (n_val_read < n_vals_to_decode) {
+ const auto want = std::min(chunk_size, n_vals_to_decode - n_val_read);
+ const auto got =
+ decoder.GetBatch(BitmapSpanMut(out.data(), /*bit_start=*/n_val_read),
want);
+ EXPECT_EQ(got, want) << "at pos " << n_val_read;
+ ASSERT_GT(got, 0) << "at pos " << n_val_read; // break on failure
+ n_val_read += got;
+ EXPECT_EQ(decoder.remaining(), n_vals_to_decode - n_val_read);
+ }
+
+ EXPECT_EQ(decoder.remaining(), 0);
+ CheckDecodedBits({
+ .actual = out,
+ .expected = expected,
+ .count = n_vals_to_decode,
+ .actual_start_bit = 0,
+ .expected_start_idx = expected_skip,
+ });
+}
+
+/// Decode a chunk of data into a known output to check for out of bounds
write.
+///
+/// @see CheckDecoderValuesChunked
+template <typename Decoder>
+void CheckDecoderClobber(const typename Decoder::RunType& run,
+ const std::vector<bool>& expected, rle_size_t
chunk_size = 1,
+ rle_size_t expected_skip = 0) {
+ ARROW_SCOPED_TRACE("chunk_size = ", chunk_size, ", expected_skip = ",
expected_skip);
+
+ const auto n_vals = static_cast<rle_size_t>(expected.size());
+ ASSERT_LE(expected_skip, n_vals);
+
+ Decoder decoder(run);
+ const auto advanced = decoder.Advance(expected_skip);
+ ASSERT_EQ(advanced, expected_skip);
+ const auto n_vals_to_decode = n_vals - expected_skip;
+
+ // Output buffer with enough capacity to store a full chunk plus extra bytes
as
+ // clobbers/guard to check for out of bounds write.
+ const auto n_bytes = static_cast<size_t>(bit_util::BytesForBits(chunk_size) +
+ bit_util::CeilDiv(n_vals,
chunk_size) + 2);
+ // This seed is arbitrary and of little importance. We are simply trying to
avoid an
+ // unlikely case where guards have the same pattern in all invocations.
+ const auto out_pattern =
+ MakeRandomBytes(n_bytes, /* seed= */ (chunk_size << 16) ^ expected_skip);
+ auto out = out_pattern;
+
+ rle_size_t n_val_read = 0;
+ rle_size_t out_bit_start = 0;
+ while (n_val_read < n_vals_to_decode) {
+ // Clean output buffer
+ out = out_pattern;
+ const auto want = std::min(chunk_size, n_vals_to_decode - n_val_read);
+ const auto got = decoder.GetBatch(BitmapSpanMut(out.data(),
out_bit_start), want);
+ ASSERT_GT(got, 0) << "at pos " << n_val_read; // break on failure
+ EXPECT_EQ(got, want) << "at pos " << n_val_read;
+ // Check that the leading bits have not been modified
+ CheckBitsEqual({.actual = out, .expected = out_pattern, .count =
out_bit_start});
+ // Check that the trailing bits have not been modified
+ CheckBitsEqual({
+ .actual = out,
+ .expected = out_pattern,
+ .count = static_cast<rle_size_t>(8 * n_bytes) - (out_bit_start + want),
+ .actual_start_bit = out_bit_start + want,
+ .expected_start_bit = out_bit_start + want,
+ });
+ // Check decoded bits are also correct
+ CheckDecodedBits({
+ .actual = out,
+ .expected = expected,
+ .count = want,
+ .actual_start_bit = out_bit_start,
+ .expected_start_idx = expected_skip + n_val_read,
+ });
+
+ n_val_read += got;
+ ++out_bit_start;
+ EXPECT_EQ(decoder.remaining(), n_vals_to_decode - n_val_read);
+ }
+}
+
+/// All the checks shared by both decoder types.
+///
+/// `expected` is the full sequence of booleans the run should decode to.
+template <typename Decoder>
+void CheckBitmapDecoder(const typename Decoder::RunType& run,
+ const std::vector<bool>& expected) {
+ const auto n_vals = static_cast<rle_size_t>(expected.size());
+
+ // remaining() reflects the run size before any value is read.
+ {
+ Decoder decoder(run);
+ EXPECT_EQ(decoder.remaining(), n_vals);
+ }
+
+ // Empty requests are a no-op.
+ {
+ Decoder decoder(run);
+ uint8_t out = 0;
+ const auto got = decoder.GetBatch(BitmapSpanMut(&out), /*batch_size=*/0);
+ EXPECT_EQ(got, 0);
+ EXPECT_EQ(decoder.remaining(), n_vals);
+ }
+
+ // Decode the whole run in several chunks.
+ for (const rle_size_t chunk_size : {rle_size_t{1}, rle_size_t{3},
rle_size_t{7},
+ rle_size_t{8}, rle_size_t{9}, n_vals,
n_vals + 1}) {
+ CheckDecoderValuesChunked<Decoder>(run, expected, chunk_size);
+ }
+
+ // Decode the whole run in several chunks, after an initial Advance that
shifts
+ // the run and output bit alignment.
+ for (const rle_size_t chunk_size : {rle_size_t{1}, rle_size_t{3},
rle_size_t{7},
+ rle_size_t{8}, rle_size_t{9}, n_vals,
n_vals + 1}) {
+ for (rle_size_t expected_skip = 1; expected_skip < 8 && expected_skip <
n_vals;
+ ++expected_skip) {
+ // Check the decoding happens as expected
+ CheckDecoderValuesChunked<Decoder>(run, expected, chunk_size,
expected_skip);
+ // Check the decoding does not write out of bounds
+ CheckDecoderClobber<Decoder>(run, expected, chunk_size, expected_skip);
+ }
+ }
+
+ // Get() one value at a time, then read past the end.
+ {
+ Decoder decoder(run);
+ std::vector<uint8_t>
out(static_cast<size_t>(bit_util::BytesForBits(n_vals)) + 1, 0);
+ for (rle_size_t i = 0; i < n_vals; ++i) {
+ const bool ok = decoder.Get(BitmapSpanMut(out.data(), /*bit_start=*/i));
+ EXPECT_TRUE(ok);
+ EXPECT_EQ(decoder.remaining(), n_vals - i - 1);
+ }
+ // Exhausted: nothing more can be read or advanced.
+ const bool ok = decoder.Get(BitmapSpanMut(out.data()));
+ EXPECT_FALSE(ok);
+ const auto advanced = decoder.Advance(1);
+ EXPECT_EQ(advanced, 0);
+ EXPECT_EQ(decoder.remaining(), 0);
+ CheckDecodedBits({.actual = out, .expected = expected, .count = n_vals});
+ }
+
+ // Advancing more than available stops at the run boundary.
+ {
+ Decoder decoder(run);
+ const auto advanced = decoder.Advance(n_vals + 100);
+ EXPECT_EQ(advanced, n_vals);
+ EXPECT_EQ(decoder.remaining(), 0);
+ }
+
+ // Reset() rewinds the decoder so the run can be decoded again.
+ {
+ Decoder decoder(run);
+ std::vector<uint8_t>
out_1(static_cast<size_t>(bit_util::BytesForBits(n_vals)), 0);
+ const auto scratch_got = decoder.GetBatch(BitmapSpanMut(out_1.data()),
n_vals);
+ EXPECT_EQ(scratch_got, n_vals);
+ EXPECT_EQ(decoder.remaining(), 0);
+
+ decoder.Reset(run);
+ EXPECT_EQ(decoder.remaining(), n_vals);
+ std::vector<uint8_t>
out_2(static_cast<size_t>(bit_util::BytesForBits(n_vals)), 0);
+ const auto got = decoder.GetBatch(BitmapSpanMut(out_2.data()), n_vals);
+ EXPECT_EQ(got, n_vals);
+ CheckDecodedBits({.actual = out_2, .expected = expected, .count = n_vals});
+ }
+}
+
+} // namespace
+
+/***************************
+ * RleRunToBitmapDecoder *
+ ***************************/
+
+class RleRunToBitmapDecoderTest : public ::testing::TestWithParam<rle_size_t>
{};
+
+TEST_P(RleRunToBitmapDecoderTest, Decode) {
+ const auto& count = GetParam();
+
+ // Only two possible repeated value
+ for (bool value : {true, false}) {
+ ARROW_SCOPED_TRACE("value = ", value);
+
+ // A boolean RLE run stores its value in a single (1-bit-wide) byte.
+ const uint8_t data = value ? 1 : 0;
+ const auto run = RleRun(&data, count, /*value_bit_width=*/1);
+
+ // value() reports the repeated boolean.
+ RleRunToBitmapDecoder decoder(run);
+ EXPECT_EQ(decoder.value(), value);
+
+ const std::vector<bool> expected(count, value);
+ CheckBitmapDecoder<RleRunToBitmapDecoder>(run, expected);
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P( //
+ RleBitmap, RleRunToBitmapDecoderTest,
+ ::testing::Values(0, 1, 3, 8, 9, 13, 64, 100, 100, 1000));
Review Comment:
Also with `1000` the test is very long:
```
[ RUN ] RleBitmap/RleRunToBitmapDecoderTest.Decode/8
[ OK ] RleBitmap/RleRunToBitmapDecoderTest.Decode/8 (28 ms)
[ RUN ] RleBitmap/RleRunToBitmapDecoderTest.Decode/9
[ OK ] RleBitmap/RleRunToBitmapDecoderTest.Decode/9 (2824 ms)
[----------] 10 tests from RleBitmap/RleRunToBitmapDecoderTest (2899 ms
total)
```
##########
cpp/src/arrow/util/rle_bitmap_test.cc:
##########
@@ -0,0 +1,598 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <array>
+#include <cstdint>
+#include <random>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/rle_bitmap_internal.h"
+#include "arrow/util/rle_encoding_internal.h"
+
+namespace arrow::util {
+
+namespace {
+
+/// Make a vector of `size` pseudo-random bytes, deterministic for a given
`seed`.
+std::vector<uint8_t> MakeRandomBytes(size_t size, uint32_t seed = 56) {
+ std::vector<uint8_t> bytes(size);
+ std::minstd_rand gen(seed);
+ std::uniform_int_distribution<uint8_t> dist(0, 255);
+ for (auto& byte : bytes) {
+ byte = dist(gen);
+ }
+ return bytes;
+}
+
+/// Read the first `count` bits of `bytes` (LSB first) into a vector of
booleans.
+std::vector<bool> BitsFromBytes(const std::vector<uint8_t>& bytes, rle_size_t
count) {
+ std::vector<bool> bits(count);
+ for (rle_size_t i = 0; i < count; ++i) {
+ bits[i] = bit_util::GetBit(bytes.data(), i);
+ }
+ return bits;
+}
+
+struct CheckDecodedBitsParams {
+ const std::vector<uint8_t>& actual;
+ const std::vector<bool>& expected;
+ rle_size_t count;
+ rle_size_t actual_start_bit = 0;
+ rle_size_t expected_start_idx = 0;
+};
+
+/// Check the decoded output in `out` against `expected`.
+void CheckDecodedBits(const CheckDecodedBitsParams& params) {
+ ARROW_SCOPED_TRACE("out_start_bit = ", params.actual_start_bit,
+ ", expected_start_idx = ", params.expected_start_idx);
+ for (rle_size_t i = 0; i < params.count; ++i) {
+ ASSERT_EQ(bit_util::GetBit(params.actual.data(), params.actual_start_bit +
i),
+ params.expected[params.expected_start_idx + i])
+ << "first difference at bit " << i;
+ }
+}
+
+struct CheckBitsEqualParams {
+ const std::vector<uint8_t>& actual;
+ const std::vector<uint8_t>& expected;
+ rle_size_t count;
+ rle_size_t actual_start_bit = 0;
+ rle_size_t expected_start_bit = 0;
+};
+
+/// Check that two bit ranges, stored in `actual` and `expected`, are equal.
+void CheckBitsEqual(const CheckBitsEqualParams& params) {
+ ARROW_SCOPED_TRACE("actual_start_bit = ", params.actual_start_bit,
+ ", expected_start_bit = ", params.expected_start_bit);
+ for (rle_size_t i = 0; i < params.count; ++i) {
+ ASSERT_EQ(bit_util::GetBit(params.actual.data(), params.actual_start_bit +
i),
+ bit_util::GetBit(params.expected.data(),
params.expected_start_bit + i))
+ << "first difference at bit " << i;
+ }
+}
+
+/// Skip the first `expected_skip` values with Advance(), then decode the rest
of the run
+/// into one output bitmap, `chunk_size` values at a time. Compare against
`expected`.
+///
+/// `chunk_size` controls output bit alignment. When `chunk_size` is not a
multiple of 8,
+/// later calls start at a non-zero output bit offset.
+///
+/// `expected_skip` shifts the decoder's read offset relative to the output
offset.
+/// A non-zero `expected_skip` makes the two differ, which exercises the
bit-unaligned
+/// read path of BitPackedRunToBitmapDecoder. With `expected_skip == 0` they
stay in sync
+/// and only the aligned path runs.
+template <typename Decoder>
+void CheckDecoderValuesChunked(const typename Decoder::RunType& run,
+ const std::vector<bool>& expected,
+ rle_size_t chunk_size = 1, rle_size_t
expected_skip = 0) {
+ ARROW_SCOPED_TRACE("chunk_size = ", chunk_size, ", expected_skip = ",
expected_skip);
+
+ const auto n_vals = static_cast<rle_size_t>(expected.size());
+ ASSERT_LE(expected_skip, n_vals);
+
+ Decoder decoder(run);
+ const auto advanced = decoder.Advance(expected_skip);
+ ASSERT_EQ(advanced, expected_skip);
+ const auto n_vals_to_decode = n_vals - expected_skip;
+
+ // Output buffer
+ const auto n_bytes =
static_cast<size_t>(bit_util::BytesForBits(n_vals_to_decode));
+ std::vector<uint8_t> out(n_bytes, 0);
+
+ rle_size_t n_val_read = 0;
+ while (n_val_read < n_vals_to_decode) {
+ const auto want = std::min(chunk_size, n_vals_to_decode - n_val_read);
+ const auto got =
+ decoder.GetBatch(BitmapSpanMut(out.data(), /*bit_start=*/n_val_read),
want);
+ EXPECT_EQ(got, want) << "at pos " << n_val_read;
+ ASSERT_GT(got, 0) << "at pos " << n_val_read; // break on failure
+ n_val_read += got;
+ EXPECT_EQ(decoder.remaining(), n_vals_to_decode - n_val_read);
+ }
+
+ EXPECT_EQ(decoder.remaining(), 0);
+ CheckDecodedBits({
+ .actual = out,
+ .expected = expected,
+ .count = n_vals_to_decode,
+ .actual_start_bit = 0,
+ .expected_start_idx = expected_skip,
+ });
+}
+
+/// Decode a chunk of data into a known output to check for out of bounds
write.
+///
+/// @see CheckDecoderValuesChunked
+template <typename Decoder>
+void CheckDecoderClobber(const typename Decoder::RunType& run,
+ const std::vector<bool>& expected, rle_size_t
chunk_size = 1,
+ rle_size_t expected_skip = 0) {
+ ARROW_SCOPED_TRACE("chunk_size = ", chunk_size, ", expected_skip = ",
expected_skip);
+
+ const auto n_vals = static_cast<rle_size_t>(expected.size());
+ ASSERT_LE(expected_skip, n_vals);
+
+ Decoder decoder(run);
+ const auto advanced = decoder.Advance(expected_skip);
+ ASSERT_EQ(advanced, expected_skip);
+ const auto n_vals_to_decode = n_vals - expected_skip;
+
+ // Output buffer with enough capacity to store a full chunk plus extra bytes
as
+ // clobbers/guard to check for out of bounds write.
+ const auto n_bytes = static_cast<size_t>(bit_util::BytesForBits(chunk_size) +
+ bit_util::CeilDiv(n_vals,
chunk_size) + 2);
+ // This seed is arbitrary and of little importance. We are simply trying to
avoid an
+ // unlikely case where guards have the same pattern in all invocations.
+ const auto out_pattern =
+ MakeRandomBytes(n_bytes, /* seed= */ (chunk_size << 16) ^ expected_skip);
+ auto out = out_pattern;
+
+ rle_size_t n_val_read = 0;
+ rle_size_t out_bit_start = 0;
+ while (n_val_read < n_vals_to_decode) {
+ // Clean output buffer
+ out = out_pattern;
+ const auto want = std::min(chunk_size, n_vals_to_decode - n_val_read);
+ const auto got = decoder.GetBatch(BitmapSpanMut(out.data(),
out_bit_start), want);
+ ASSERT_GT(got, 0) << "at pos " << n_val_read; // break on failure
+ EXPECT_EQ(got, want) << "at pos " << n_val_read;
+ // Check that the leading bits have not been modified
+ CheckBitsEqual({.actual = out, .expected = out_pattern, .count =
out_bit_start});
+ // Check that the trailing bits have not been modified
+ CheckBitsEqual({
+ .actual = out,
+ .expected = out_pattern,
+ .count = static_cast<rle_size_t>(8 * n_bytes) - (out_bit_start + want),
+ .actual_start_bit = out_bit_start + want,
+ .expected_start_bit = out_bit_start + want,
+ });
+ // Check decoded bits are also correct
+ CheckDecodedBits({
+ .actual = out,
+ .expected = expected,
+ .count = want,
+ .actual_start_bit = out_bit_start,
+ .expected_start_idx = expected_skip + n_val_read,
+ });
+
+ n_val_read += got;
+ ++out_bit_start;
+ EXPECT_EQ(decoder.remaining(), n_vals_to_decode - n_val_read);
+ }
+}
+
+/// All the checks shared by both decoder types.
+///
+/// `expected` is the full sequence of booleans the run should decode to.
+template <typename Decoder>
+void CheckBitmapDecoder(const typename Decoder::RunType& run,
+ const std::vector<bool>& expected) {
+ const auto n_vals = static_cast<rle_size_t>(expected.size());
+
+ // remaining() reflects the run size before any value is read.
+ {
+ Decoder decoder(run);
+ EXPECT_EQ(decoder.remaining(), n_vals);
+ }
+
+ // Empty requests are a no-op.
+ {
+ Decoder decoder(run);
+ uint8_t out = 0;
+ const auto got = decoder.GetBatch(BitmapSpanMut(&out), /*batch_size=*/0);
+ EXPECT_EQ(got, 0);
+ EXPECT_EQ(decoder.remaining(), n_vals);
+ }
+
+ // Decode the whole run in several chunks.
+ for (const rle_size_t chunk_size : {rle_size_t{1}, rle_size_t{3},
rle_size_t{7},
+ rle_size_t{8}, rle_size_t{9}, n_vals,
n_vals + 1}) {
+ CheckDecoderValuesChunked<Decoder>(run, expected, chunk_size);
+ }
+
+ // Decode the whole run in several chunks, after an initial Advance that
shifts
+ // the run and output bit alignment.
+ for (const rle_size_t chunk_size : {rle_size_t{1}, rle_size_t{3},
rle_size_t{7},
+ rle_size_t{8}, rle_size_t{9}, n_vals,
n_vals + 1}) {
+ for (rle_size_t expected_skip = 1; expected_skip < 8 && expected_skip <
n_vals;
+ ++expected_skip) {
+ // Check the decoding happens as expected
+ CheckDecoderValuesChunked<Decoder>(run, expected, chunk_size,
expected_skip);
+ // Check the decoding does not write out of bounds
+ CheckDecoderClobber<Decoder>(run, expected, chunk_size, expected_skip);
+ }
+ }
+
+ // Get() one value at a time, then read past the end.
+ {
+ Decoder decoder(run);
+ std::vector<uint8_t>
out(static_cast<size_t>(bit_util::BytesForBits(n_vals)) + 1, 0);
+ for (rle_size_t i = 0; i < n_vals; ++i) {
+ const bool ok = decoder.Get(BitmapSpanMut(out.data(), /*bit_start=*/i));
+ EXPECT_TRUE(ok);
+ EXPECT_EQ(decoder.remaining(), n_vals - i - 1);
+ }
+ // Exhausted: nothing more can be read or advanced.
+ const bool ok = decoder.Get(BitmapSpanMut(out.data()));
+ EXPECT_FALSE(ok);
+ const auto advanced = decoder.Advance(1);
+ EXPECT_EQ(advanced, 0);
+ EXPECT_EQ(decoder.remaining(), 0);
+ CheckDecodedBits({.actual = out, .expected = expected, .count = n_vals});
+ }
+
+ // Advancing more than available stops at the run boundary.
+ {
+ Decoder decoder(run);
+ const auto advanced = decoder.Advance(n_vals + 100);
+ EXPECT_EQ(advanced, n_vals);
+ EXPECT_EQ(decoder.remaining(), 0);
+ }
+
+ // Reset() rewinds the decoder so the run can be decoded again.
+ {
+ Decoder decoder(run);
+ std::vector<uint8_t>
out_1(static_cast<size_t>(bit_util::BytesForBits(n_vals)), 0);
+ const auto scratch_got = decoder.GetBatch(BitmapSpanMut(out_1.data()),
n_vals);
+ EXPECT_EQ(scratch_got, n_vals);
+ EXPECT_EQ(decoder.remaining(), 0);
+
+ decoder.Reset(run);
+ EXPECT_EQ(decoder.remaining(), n_vals);
+ std::vector<uint8_t>
out_2(static_cast<size_t>(bit_util::BytesForBits(n_vals)), 0);
+ const auto got = decoder.GetBatch(BitmapSpanMut(out_2.data()), n_vals);
+ EXPECT_EQ(got, n_vals);
+ CheckDecodedBits({.actual = out_2, .expected = expected, .count = n_vals});
+ }
+}
+
+} // namespace
+
+/***************************
+ * RleRunToBitmapDecoder *
+ ***************************/
+
+class RleRunToBitmapDecoderTest : public ::testing::TestWithParam<rle_size_t>
{};
+
+TEST_P(RleRunToBitmapDecoderTest, Decode) {
+ const auto& count = GetParam();
+
+ // Only two possible repeated value
+ for (bool value : {true, false}) {
+ ARROW_SCOPED_TRACE("value = ", value);
+
+ // A boolean RLE run stores its value in a single (1-bit-wide) byte.
+ const uint8_t data = value ? 1 : 0;
+ const auto run = RleRun(&data, count, /*value_bit_width=*/1);
+
+ // value() reports the repeated boolean.
+ RleRunToBitmapDecoder decoder(run);
+ EXPECT_EQ(decoder.value(), value);
+
+ const std::vector<bool> expected(count, value);
+ CheckBitmapDecoder<RleRunToBitmapDecoder>(run, expected);
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P( //
+ RleBitmap, RleRunToBitmapDecoderTest,
+ ::testing::Values(0, 1, 3, 8, 9, 13, 64, 100, 100, 1000));
+
+/*********************************
+ * BitPackedRunToBitmapDecoder *
+ *********************************/
+
+struct BitPackedBitmapCase {
+ std::string name;
+ // The raw bit-packed bytes (LSB first). Must hold at least `count` bits.
+ std::vector<uint8_t> bytes;
+ // The number of values in the run.
+ rle_size_t count;
+};
+
+class BitPackedRunToBitmapDecoderTest
+ : public ::testing::TestWithParam<BitPackedBitmapCase> {};
+
+TEST_P(BitPackedRunToBitmapDecoderTest, Decode) {
+ const auto& param = GetParam();
+ ASSERT_GE(param.bytes.size(),
static_cast<size_t>(bit_util::BytesForBits(param.count)));
+
+ const auto run = BitPackedRun(param.bytes.data(), param.count,
/*value_bit_width=*/1,
+ /*max_read_bytes=*/-1);
+
+ const std::vector<bool> expected = BitsFromBytes(param.bytes, param.count);
+ CheckBitmapDecoder<BitPackedRunToBitmapDecoder>(run, expected);
+}
+
+INSTANTIATE_TEST_SUITE_P( //
+ RleBitmap, BitPackedRunToBitmapDecoderTest,
+ ::testing::Values( //
+ BitPackedBitmapCase{.name = "empty", .bytes = {0b10110010}, .count =
0},
+ BitPackedBitmapCase{.name = "single", .bytes = {0b00000001}, .count =
1},
+ BitPackedBitmapCase{.name = "three", .bytes = {0b00000101}, .count =
3},
+ BitPackedBitmapCase{.name = "eight", .bytes = {0b11010010}, .count =
8},
+ BitPackedBitmapCase{
+ .name = "alternating", .bytes = {0b10101010, 0b10101010}, .count =
13},
+ BitPackedBitmapCase{.name = "all_zeros", .bytes = {0x00, 0x00}, .count
= 16},
+ BitPackedBitmapCase{.name = "all_ones", .bytes = {0xFF, 0xFF}, .count
= 16},
+ BitPackedBitmapCase{
+ .name = "mixed", .bytes = {0b11001010, 0b00001111, 0b10110001},
.count = 24},
+ BitPackedBitmapCase{
+ .name = "unaligned_count", .bytes = {0b00110101, 0b11100100},
.count = 11},
+ BitPackedBitmapCase{
+ .name = "large",
+ .bytes = std::vector<uint8_t>(16, 0b01101001),
+ .count = 128,
+ }),
+ [](const ::testing::TestParamInfo<BitPackedBitmapCase>& info) {
+ return info.param.name;
+ });
+
+/*********************************
+ * RleBitPackedToBitmapDecoder *
+ *********************************/
+
+namespace {
+
+/// Append the LEB128 (unsigned, little-endian base-128) encoding of `value`.
+void AppendLeb128(std::vector<uint8_t>& out, uint32_t value) {
+ std::array<uint8_t, bit_util::kMaxLEB128ByteLenFor<uint32_t>> buf;
+ const auto n_bytes =
+ bit_util::WriteLEB128(value, buf.data(),
static_cast<int32_t>(buf.size()));
+ ASSERT_GT(n_bytes, 0);
+ out.insert(out.end(), buf.data(), buf.data() + n_bytes);
+}
+
+void AppendRleRun(std::vector<uint8_t>& bytes, std::vector<bool>& expected,
bool value,
+ rle_size_t count) {
+ AppendLeb128(bytes, static_cast<uint32_t>(count) << 1); // low bit 0 => RLE
+ bytes.push_back(value ? 1 : 0);
+ expected.insert(expected.end(), count, value);
+}
+
+void AppendBitPackedRun(std::vector<uint8_t>& bytes, std::vector<bool>&
expected,
+ const std::vector<uint8_t>& packed) {
+ const auto groups = static_cast<rle_size_t>(packed.size());
+ AppendLeb128(bytes, (static_cast<uint32_t>(groups) << 1) | 1); // low bit 1
=> packed
+ bytes.insert(bytes.end(), packed.begin(), packed.end());
+ for (rle_size_t i = 0; i < groups * 8; ++i) {
+ expected.push_back(bit_util::GetBit(packed.data(), i));
+ }
+}
+
+/// Decode the whole `bytes` into a bitmap and check it against `expected`.
+///
+/// Decode `chunk_size` values per GetBatch call to check the decoder state
between
+/// calls. The output starts at bit offset `out_offset`. A non-zero offset
makes
+/// the output and the encoded `bytes` use different bit alignment.
+void CheckRleBitPackedDecode(const std::vector<uint8_t>& bytes,
+ const std::vector<bool>& expected, rle_size_t
chunk_size,
+ rle_size_t out_offset = 0) {
+ ARROW_SCOPED_TRACE("chunk_size = ", chunk_size, ", out_offset = ",
out_offset);
+ const auto n_vals = static_cast<rle_size_t>(expected.size());
+
+ RleBitPackedToBitmapDecoder decoder(bytes.data(),
+ static_cast<rle_size_t>(bytes.size()));
+ EXPECT_EQ(decoder.exhausted(), n_vals == 0);
+
+ // Output buffer with one guard byte to catch out-of-bounds writes.
+ std::vector<uint8_t> out(
+ static_cast<size_t>(bit_util::BytesForBits(out_offset + n_vals)) + 1, 0);
+ const uint8_t guard = 0xA5;
+ out.back() = guard;
+
+ rle_size_t read = 0;
+ while (read < n_vals) {
+ const auto want = std::min(chunk_size, n_vals - read);
+ const auto got = decoder.GetBatch(
+ BitmapSpanMut(out.data(), /*bit_start=*/out_offset + read), want);
+ EXPECT_EQ(got, want) << "at pos " << read;
+ ASSERT_GT(got, 0) << "at pos " << read; // break on failure
+ read += got;
+ }
+
+ EXPECT_EQ(read, n_vals);
+ EXPECT_TRUE(decoder.exhausted());
+ // Reading past the end yields nothing and leaves the decoder exhausted.
+ uint8_t scratch = 0;
+ const auto past_end = decoder.GetBatch(BitmapSpanMut(&scratch), 8);
+ EXPECT_EQ(past_end, 0);
+ EXPECT_TRUE(decoder.exhausted());
+
+ EXPECT_EQ(out.back(), guard) << "decoder wrote past the end of the output";
+ CheckDecodedBits({
+ .actual = out,
+ .expected = expected,
+ .count = n_vals,
+ .actual_start_bit = out_offset,
+ });
+}
+
+/// Run the decode check over a battery of chunk sizes and output offsets.
+void CheckRleBitPackedToBitmap(const std::vector<uint8_t>& bytes,
+ const std::vector<bool>& expected) {
+ const auto n_vals = static_cast<rle_size_t>(expected.size());
+ ASSERT_GT(n_vals, 0);
+ for (const rle_size_t chunk_size :
+ {rle_size_t{1}, rle_size_t{3}, rle_size_t{7}, rle_size_t{8},
rle_size_t{9},
+ rle_size_t{33}, n_vals}) {
Review Comment:
Test `n_vals + 1` as well?
##########
cpp/src/arrow/util/rle_bitmap_internal.h:
##########
@@ -0,0 +1,369 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstring>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/rle_encoding_internal.h"
+
+namespace arrow::util {
+
+/// A lightweight view over a bitmap.
+template <typename B = uint8_t>
+class BitmapSpan {
+ public:
+ using size_type = rle_size_t;
+ using byte_type = B;
+
+ explicit constexpr BitmapSpan(byte_type* data, size_type bit_start = 0)
noexcept
+ : data_{data}, bit_start_{bit_start} {
+ Normalize();
+ }
+
+ /// Pointer to the byte where the first value is stored.
+ constexpr byte_type* data() const noexcept { return data_; }
+
+ /// Bit offset of the first value in the first byte.
+ constexpr size_type bit_start() const noexcept { return bit_start_; }
+
+ /// Return a new span starting at the given position.
+ constexpr BitmapSpan NewStartingAt(size_type bit_start) const noexcept {
+ auto out = *this;
+ out.bit_start_ += bit_start;
+ out.Normalize();
+ return out;
+ }
+
+ private:
+ byte_type* data_;
+ size_type bit_start_ = 0;
+
+ /// Ensure `bit_start` always ends up in [0, 8[, even when it starts
negative.
+ constexpr void Normalize() {
+ // On two's-complement targets an arithmetic right shift floors, and the
AND mask
+ // yields the non-negative remainder (e.g. -1 -> byte -1, offset 7),
unlike `/` and
+ // `%` which truncate toward zero.
+ data_ += bit_start_ >> 3;
+ bit_start_ &= 0b0111;
+ }
+};
+
+using BitmapSpanMut = BitmapSpan<uint8_t>;
+using BitmapSpanConst = BitmapSpan<const uint8_t>;
+
+class RleRunToBitmapDecoder {
+ public:
+ /// The type of run that can be decoded.
+ using RunType = RleRun;
+
+ constexpr RleRunToBitmapDecoder() noexcept = default;
+
+ explicit RleRunToBitmapDecoder(const RunType& run) noexcept { Reset(run); }
+
+ void Reset(const RunType& run) noexcept {
+ values_left_ = run.values_count();
+ if (run.value_little_endian() == 0) {
+ value_pattern_ = uint8_t{0};
+ } else {
+ value_pattern_ = uint8_t{0xFF};
+ }
+ }
+
+ /// Return the number of values that can be advanced.
+ rle_size_t remaining() const { return values_left_; }
+
+ /// Return the repeated value of this decoder.
+ constexpr bool value() const { return value_pattern_ != 0; }
+
+ /// Return how much the decoder would advance if asked to.
+ ///
+ /// Does not modify input.
+ rle_size_t AdvanceCapacity(rle_size_t batch_size) const noexcept {
+ const auto n_vals = std::min(batch_size, remaining());
+ return n_vals;
+ }
+
+ /// Advance by as many values as provided or until exhaustion of the decoder.
+ /// Return the number of values skipped.
+ [[nodiscard]] rle_size_t Advance(rle_size_t batch_size) {
+ const auto n_vals = AdvanceCapacity(batch_size);
+ values_left_ -= n_vals;
+ ARROW_DCHECK_GE(remaining(), 0);
+ return n_vals;
+ }
+
+ /// Get the next value and return false if there are no more.
+ [[nodiscard]] bool Get(BitmapSpanMut out) { return GetBatch(out, 1) == 1; }
+
+ /// Get a batch of values return the number of decoded elements.
+ ///
+ /// May write fewer elements to the output than requested if there are not
+ /// enough values left.
+ [[nodiscard]] rle_size_t GetBatch(BitmapSpanMut out, rle_size_t batch_size) {
+ const auto out_bit_offset = out.bit_start();
+ ARROW_DCHECK_GE(out_bit_offset, 0);
+ ARROW_DCHECK_LT(out_bit_offset, 8);
+
+ if (ARROW_PREDICT_FALSE(remaining() == 0 || batch_size == 0)) {
+ return 0;
+ }
+
+ rle_size_t n_vals = 0;
+
+ // HEADER: Writing inside the first byte if caller gives a non-aligned
input
+ if (out_bit_offset != 0) {
+ n_vals = GetBatchInByte(out, batch_size);
+ }
+
+ // Writing full bytes
+ n_vals += GetBatchFullBytes(out.NewStartingAt(n_vals), batch_size -
n_vals);
+
+ // TRAILER: Writing inside the last byte if caller asked for non multiple
of 8 values
+ const auto n_last_vals = std::min(batch_size - n_vals, remaining());
+ if (ARROW_PREDICT_FALSE(n_last_vals > 0)) {
Review Comment:
Why the false prediction here? This is going to be likely, isn't it?
##########
cpp/src/arrow/util/rle_bitmap_internal.h:
##########
@@ -0,0 +1,369 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstring>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/rle_encoding_internal.h"
+
+namespace arrow::util {
+
+/// A lightweight view over a bitmap.
+template <typename B = uint8_t>
+class BitmapSpan {
+ public:
+ using size_type = rle_size_t;
+ using byte_type = B;
+
+ explicit constexpr BitmapSpan(byte_type* data, size_type bit_start = 0)
noexcept
+ : data_{data}, bit_start_{bit_start} {
+ Normalize();
+ }
+
+ /// Pointer to the byte where the first value is stored.
+ constexpr byte_type* data() const noexcept { return data_; }
+
+ /// Bit offset of the first value in the first byte.
+ constexpr size_type bit_start() const noexcept { return bit_start_; }
+
+ /// Return a new span starting at the given position.
+ constexpr BitmapSpan NewStartingAt(size_type bit_start) const noexcept {
+ auto out = *this;
+ out.bit_start_ += bit_start;
+ out.Normalize();
+ return out;
+ }
+
+ private:
+ byte_type* data_;
+ size_type bit_start_ = 0;
+
+ /// Ensure `bit_start` always ends up in [0, 8[, even when it starts
negative.
+ constexpr void Normalize() {
+ // On two's-complement targets an arithmetic right shift floors, and the
AND mask
+ // yields the non-negative remainder (e.g. -1 -> byte -1, offset 7),
unlike `/` and
+ // `%` which truncate toward zero.
+ data_ += bit_start_ >> 3;
+ bit_start_ &= 0b0111;
+ }
+};
+
+using BitmapSpanMut = BitmapSpan<uint8_t>;
+using BitmapSpanConst = BitmapSpan<const uint8_t>;
+
+class RleRunToBitmapDecoder {
+ public:
+ /// The type of run that can be decoded.
+ using RunType = RleRun;
+
+ constexpr RleRunToBitmapDecoder() noexcept = default;
+
+ explicit RleRunToBitmapDecoder(const RunType& run) noexcept { Reset(run); }
+
+ void Reset(const RunType& run) noexcept {
+ values_left_ = run.values_count();
+ if (run.value_little_endian() == 0) {
+ value_pattern_ = uint8_t{0};
+ } else {
+ value_pattern_ = uint8_t{0xFF};
+ }
+ }
+
+ /// Return the number of values that can be advanced.
+ rle_size_t remaining() const { return values_left_; }
+
+ /// Return the repeated value of this decoder.
+ constexpr bool value() const { return value_pattern_ != 0; }
+
+ /// Return how much the decoder would advance if asked to.
+ ///
+ /// Does not modify input.
+ rle_size_t AdvanceCapacity(rle_size_t batch_size) const noexcept {
Review Comment:
Nit: `GetAdvanceCapacity`? (we're not advancing the capacity, whatever that
means :-))
##########
cpp/src/arrow/util/rle_bitmap_internal.h:
##########
@@ -0,0 +1,369 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstring>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/rle_encoding_internal.h"
+
+namespace arrow::util {
+
+/// A lightweight view over a bitmap.
+template <typename B = uint8_t>
+class BitmapSpan {
+ public:
+ using size_type = rle_size_t;
+ using byte_type = B;
+
+ explicit constexpr BitmapSpan(byte_type* data, size_type bit_start = 0)
noexcept
+ : data_{data}, bit_start_{bit_start} {
+ Normalize();
+ }
+
+ /// Pointer to the byte where the first value is stored.
+ constexpr byte_type* data() const noexcept { return data_; }
+
+ /// Bit offset of the first value in the first byte.
+ constexpr size_type bit_start() const noexcept { return bit_start_; }
+
+ /// Return a new span starting at the given position.
+ constexpr BitmapSpan NewStartingAt(size_type bit_start) const noexcept {
+ auto out = *this;
+ out.bit_start_ += bit_start;
+ out.Normalize();
+ return out;
+ }
+
+ private:
+ byte_type* data_;
+ size_type bit_start_ = 0;
+
+ /// Ensure `bit_start` always ends up in [0, 8[, even when it starts
negative.
+ constexpr void Normalize() {
+ // On two's-complement targets an arithmetic right shift floors, and the
AND mask
+ // yields the non-negative remainder (e.g. -1 -> byte -1, offset 7),
unlike `/` and
+ // `%` which truncate toward zero.
+ data_ += bit_start_ >> 3;
+ bit_start_ &= 0b0111;
+ }
+};
+
+using BitmapSpanMut = BitmapSpan<uint8_t>;
+using BitmapSpanConst = BitmapSpan<const uint8_t>;
+
+class RleRunToBitmapDecoder {
+ public:
+ /// The type of run that can be decoded.
+ using RunType = RleRun;
+
+ constexpr RleRunToBitmapDecoder() noexcept = default;
+
+ explicit RleRunToBitmapDecoder(const RunType& run) noexcept { Reset(run); }
+
+ void Reset(const RunType& run) noexcept {
+ values_left_ = run.values_count();
+ if (run.value_little_endian() == 0) {
+ value_pattern_ = uint8_t{0};
+ } else {
+ value_pattern_ = uint8_t{0xFF};
+ }
+ }
+
+ /// Return the number of values that can be advanced.
+ rle_size_t remaining() const { return values_left_; }
+
+ /// Return the repeated value of this decoder.
+ constexpr bool value() const { return value_pattern_ != 0; }
+
+ /// Return how much the decoder would advance if asked to.
+ ///
+ /// Does not modify input.
+ rle_size_t AdvanceCapacity(rle_size_t batch_size) const noexcept {
+ const auto n_vals = std::min(batch_size, remaining());
+ return n_vals;
+ }
+
+ /// Advance by as many values as provided or until exhaustion of the decoder.
+ /// Return the number of values skipped.
+ [[nodiscard]] rle_size_t Advance(rle_size_t batch_size) {
+ const auto n_vals = AdvanceCapacity(batch_size);
+ values_left_ -= n_vals;
+ ARROW_DCHECK_GE(remaining(), 0);
+ return n_vals;
+ }
+
+ /// Get the next value and return false if there are no more.
Review Comment:
```suggestion
/// Read the next value into `out` and return false if there are no more.
```
##########
cpp/src/arrow/util/rle_bitmap_internal.h:
##########
@@ -0,0 +1,369 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstring>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/rle_encoding_internal.h"
+
+namespace arrow::util {
+
+/// A lightweight view over a bitmap.
+template <typename B = uint8_t>
+class BitmapSpan {
+ public:
+ using size_type = rle_size_t;
+ using byte_type = B;
+
+ explicit constexpr BitmapSpan(byte_type* data, size_type bit_start = 0)
noexcept
+ : data_{data}, bit_start_{bit_start} {
+ Normalize();
+ }
+
+ /// Pointer to the byte where the first value is stored.
+ constexpr byte_type* data() const noexcept { return data_; }
+
+ /// Bit offset of the first value in the first byte.
+ constexpr size_type bit_start() const noexcept { return bit_start_; }
+
+ /// Return a new span starting at the given position.
+ constexpr BitmapSpan NewStartingAt(size_type bit_start) const noexcept {
+ auto out = *this;
+ out.bit_start_ += bit_start;
+ out.Normalize();
+ return out;
+ }
+
+ private:
+ byte_type* data_;
+ size_type bit_start_ = 0;
+
+ /// Ensure `bit_start` always ends up in [0, 8[, even when it starts
negative.
+ constexpr void Normalize() {
+ // On two's-complement targets an arithmetic right shift floors, and the
AND mask
+ // yields the non-negative remainder (e.g. -1 -> byte -1, offset 7),
unlike `/` and
+ // `%` which truncate toward zero.
+ data_ += bit_start_ >> 3;
+ bit_start_ &= 0b0111;
+ }
+};
+
+using BitmapSpanMut = BitmapSpan<uint8_t>;
+using BitmapSpanConst = BitmapSpan<const uint8_t>;
+
+class RleRunToBitmapDecoder {
+ public:
+ /// The type of run that can be decoded.
+ using RunType = RleRun;
+
+ constexpr RleRunToBitmapDecoder() noexcept = default;
+
+ explicit RleRunToBitmapDecoder(const RunType& run) noexcept { Reset(run); }
+
+ void Reset(const RunType& run) noexcept {
+ values_left_ = run.values_count();
+ if (run.value_little_endian() == 0) {
+ value_pattern_ = uint8_t{0};
+ } else {
+ value_pattern_ = uint8_t{0xFF};
+ }
+ }
+
+ /// Return the number of values that can be advanced.
+ rle_size_t remaining() const { return values_left_; }
+
+ /// Return the repeated value of this decoder.
+ constexpr bool value() const { return value_pattern_ != 0; }
+
+ /// Return how much the decoder would advance if asked to.
+ ///
+ /// Does not modify input.
+ rle_size_t AdvanceCapacity(rle_size_t batch_size) const noexcept {
+ const auto n_vals = std::min(batch_size, remaining());
+ return n_vals;
+ }
+
+ /// Advance by as many values as provided or until exhaustion of the decoder.
+ /// Return the number of values skipped.
+ [[nodiscard]] rle_size_t Advance(rle_size_t batch_size) {
+ const auto n_vals = AdvanceCapacity(batch_size);
+ values_left_ -= n_vals;
+ ARROW_DCHECK_GE(remaining(), 0);
+ return n_vals;
+ }
+
+ /// Get the next value and return false if there are no more.
+ [[nodiscard]] bool Get(BitmapSpanMut out) { return GetBatch(out, 1) == 1; }
+
+ /// Get a batch of values return the number of decoded elements.
Review Comment:
```suggestion
/// Get a batch of values into `out` and return the number of decoded
elements.
```
##########
cpp/src/arrow/util/rle_bitmap_test.cc:
##########
@@ -0,0 +1,598 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <array>
+#include <cstdint>
+#include <random>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/rle_bitmap_internal.h"
+#include "arrow/util/rle_encoding_internal.h"
+
+namespace arrow::util {
+
+namespace {
+
+/// Make a vector of `size` pseudo-random bytes, deterministic for a given
`seed`.
+std::vector<uint8_t> MakeRandomBytes(size_t size, uint32_t seed = 56) {
+ std::vector<uint8_t> bytes(size);
+ std::minstd_rand gen(seed);
+ std::uniform_int_distribution<uint8_t> dist(0, 255);
+ for (auto& byte : bytes) {
+ byte = dist(gen);
+ }
+ return bytes;
+}
+
+/// Read the first `count` bits of `bytes` (LSB first) into a vector of
booleans.
+std::vector<bool> BitsFromBytes(const std::vector<uint8_t>& bytes, rle_size_t
count) {
+ std::vector<bool> bits(count);
+ for (rle_size_t i = 0; i < count; ++i) {
+ bits[i] = bit_util::GetBit(bytes.data(), i);
+ }
+ return bits;
+}
+
+struct CheckDecodedBitsParams {
+ const std::vector<uint8_t>& actual;
+ const std::vector<bool>& expected;
+ rle_size_t count;
+ rle_size_t actual_start_bit = 0;
+ rle_size_t expected_start_idx = 0;
+};
+
+/// Check the decoded output in `out` against `expected`.
+void CheckDecodedBits(const CheckDecodedBitsParams& params) {
+ ARROW_SCOPED_TRACE("out_start_bit = ", params.actual_start_bit,
+ ", expected_start_idx = ", params.expected_start_idx);
+ for (rle_size_t i = 0; i < params.count; ++i) {
+ ASSERT_EQ(bit_util::GetBit(params.actual.data(), params.actual_start_bit +
i),
+ params.expected[params.expected_start_idx + i])
+ << "first difference at bit " << i;
+ }
+}
+
+struct CheckBitsEqualParams {
+ const std::vector<uint8_t>& actual;
+ const std::vector<uint8_t>& expected;
+ rle_size_t count;
+ rle_size_t actual_start_bit = 0;
+ rle_size_t expected_start_bit = 0;
+};
+
+/// Check that two bit ranges, stored in `actual` and `expected`, are equal.
+void CheckBitsEqual(const CheckBitsEqualParams& params) {
+ ARROW_SCOPED_TRACE("actual_start_bit = ", params.actual_start_bit,
+ ", expected_start_bit = ", params.expected_start_bit);
+ for (rle_size_t i = 0; i < params.count; ++i) {
+ ASSERT_EQ(bit_util::GetBit(params.actual.data(), params.actual_start_bit +
i),
+ bit_util::GetBit(params.expected.data(),
params.expected_start_bit + i))
+ << "first difference at bit " << i;
+ }
+}
+
+/// Skip the first `expected_skip` values with Advance(), then decode the rest
of the run
+/// into one output bitmap, `chunk_size` values at a time. Compare against
`expected`.
+///
+/// `chunk_size` controls output bit alignment. When `chunk_size` is not a
multiple of 8,
+/// later calls start at a non-zero output bit offset.
+///
+/// `expected_skip` shifts the decoder's read offset relative to the output
offset.
+/// A non-zero `expected_skip` makes the two differ, which exercises the
bit-unaligned
+/// read path of BitPackedRunToBitmapDecoder. With `expected_skip == 0` they
stay in sync
+/// and only the aligned path runs.
+template <typename Decoder>
+void CheckDecoderValuesChunked(const typename Decoder::RunType& run,
+ const std::vector<bool>& expected,
+ rle_size_t chunk_size = 1, rle_size_t
expected_skip = 0) {
+ ARROW_SCOPED_TRACE("chunk_size = ", chunk_size, ", expected_skip = ",
expected_skip);
+
+ const auto n_vals = static_cast<rle_size_t>(expected.size());
+ ASSERT_LE(expected_skip, n_vals);
+
+ Decoder decoder(run);
+ const auto advanced = decoder.Advance(expected_skip);
+ ASSERT_EQ(advanced, expected_skip);
+ const auto n_vals_to_decode = n_vals - expected_skip;
+
+ // Output buffer
+ const auto n_bytes =
static_cast<size_t>(bit_util::BytesForBits(n_vals_to_decode));
+ std::vector<uint8_t> out(n_bytes, 0);
+
+ rle_size_t n_val_read = 0;
+ while (n_val_read < n_vals_to_decode) {
+ const auto want = std::min(chunk_size, n_vals_to_decode - n_val_read);
+ const auto got =
+ decoder.GetBatch(BitmapSpanMut(out.data(), /*bit_start=*/n_val_read),
want);
+ EXPECT_EQ(got, want) << "at pos " << n_val_read;
+ ASSERT_GT(got, 0) << "at pos " << n_val_read; // break on failure
+ n_val_read += got;
+ EXPECT_EQ(decoder.remaining(), n_vals_to_decode - n_val_read);
+ }
+
+ EXPECT_EQ(decoder.remaining(), 0);
+ CheckDecodedBits({
+ .actual = out,
+ .expected = expected,
+ .count = n_vals_to_decode,
+ .actual_start_bit = 0,
+ .expected_start_idx = expected_skip,
+ });
+}
+
+/// Decode a chunk of data into a known output to check for out of bounds
write.
+///
+/// @see CheckDecoderValuesChunked
+template <typename Decoder>
+void CheckDecoderClobber(const typename Decoder::RunType& run,
+ const std::vector<bool>& expected, rle_size_t
chunk_size = 1,
+ rle_size_t expected_skip = 0) {
+ ARROW_SCOPED_TRACE("chunk_size = ", chunk_size, ", expected_skip = ",
expected_skip);
+
+ const auto n_vals = static_cast<rle_size_t>(expected.size());
+ ASSERT_LE(expected_skip, n_vals);
+
+ Decoder decoder(run);
+ const auto advanced = decoder.Advance(expected_skip);
+ ASSERT_EQ(advanced, expected_skip);
+ const auto n_vals_to_decode = n_vals - expected_skip;
+
+ // Output buffer with enough capacity to store a full chunk plus extra bytes
as
+ // clobbers/guard to check for out of bounds write.
+ const auto n_bytes = static_cast<size_t>(bit_util::BytesForBits(chunk_size) +
+ bit_util::CeilDiv(n_vals,
chunk_size) + 2);
+ // This seed is arbitrary and of little importance. We are simply trying to
avoid an
+ // unlikely case where guards have the same pattern in all invocations.
+ const auto out_pattern =
+ MakeRandomBytes(n_bytes, /* seed= */ (chunk_size << 16) ^ expected_skip);
+ auto out = out_pattern;
+
+ rle_size_t n_val_read = 0;
+ rle_size_t out_bit_start = 0;
+ while (n_val_read < n_vals_to_decode) {
+ // Clean output buffer
+ out = out_pattern;
+ const auto want = std::min(chunk_size, n_vals_to_decode - n_val_read);
+ const auto got = decoder.GetBatch(BitmapSpanMut(out.data(),
out_bit_start), want);
+ ASSERT_GT(got, 0) << "at pos " << n_val_read; // break on failure
+ EXPECT_EQ(got, want) << "at pos " << n_val_read;
+ // Check that the leading bits have not been modified
+ CheckBitsEqual({.actual = out, .expected = out_pattern, .count =
out_bit_start});
+ // Check that the trailing bits have not been modified
+ CheckBitsEqual({
+ .actual = out,
+ .expected = out_pattern,
+ .count = static_cast<rle_size_t>(8 * n_bytes) - (out_bit_start + want),
+ .actual_start_bit = out_bit_start + want,
+ .expected_start_bit = out_bit_start + want,
+ });
+ // Check decoded bits are also correct
+ CheckDecodedBits({
+ .actual = out,
+ .expected = expected,
+ .count = want,
+ .actual_start_bit = out_bit_start,
+ .expected_start_idx = expected_skip + n_val_read,
+ });
+
+ n_val_read += got;
+ ++out_bit_start;
+ EXPECT_EQ(decoder.remaining(), n_vals_to_decode - n_val_read);
+ }
+}
+
+/// All the checks shared by both decoder types.
+///
+/// `expected` is the full sequence of booleans the run should decode to.
+template <typename Decoder>
+void CheckBitmapDecoder(const typename Decoder::RunType& run,
+ const std::vector<bool>& expected) {
+ const auto n_vals = static_cast<rle_size_t>(expected.size());
+
+ // remaining() reflects the run size before any value is read.
+ {
+ Decoder decoder(run);
+ EXPECT_EQ(decoder.remaining(), n_vals);
+ }
+
+ // Empty requests are a no-op.
+ {
+ Decoder decoder(run);
+ uint8_t out = 0;
+ const auto got = decoder.GetBatch(BitmapSpanMut(&out), /*batch_size=*/0);
+ EXPECT_EQ(got, 0);
+ EXPECT_EQ(decoder.remaining(), n_vals);
+ }
+
+ // Decode the whole run in several chunks.
+ for (const rle_size_t chunk_size : {rle_size_t{1}, rle_size_t{3},
rle_size_t{7},
+ rle_size_t{8}, rle_size_t{9}, n_vals,
n_vals + 1}) {
+ CheckDecoderValuesChunked<Decoder>(run, expected, chunk_size);
+ }
+
+ // Decode the whole run in several chunks, after an initial Advance that
shifts
+ // the run and output bit alignment.
+ for (const rle_size_t chunk_size : {rle_size_t{1}, rle_size_t{3},
rle_size_t{7},
+ rle_size_t{8}, rle_size_t{9}, n_vals,
n_vals + 1}) {
+ for (rle_size_t expected_skip = 1; expected_skip < 8 && expected_skip <
n_vals;
+ ++expected_skip) {
+ // Check the decoding happens as expected
+ CheckDecoderValuesChunked<Decoder>(run, expected, chunk_size,
expected_skip);
+ // Check the decoding does not write out of bounds
+ CheckDecoderClobber<Decoder>(run, expected, chunk_size, expected_skip);
+ }
+ }
+
+ // Get() one value at a time, then read past the end.
+ {
+ Decoder decoder(run);
+ std::vector<uint8_t>
out(static_cast<size_t>(bit_util::BytesForBits(n_vals)) + 1, 0);
+ for (rle_size_t i = 0; i < n_vals; ++i) {
+ const bool ok = decoder.Get(BitmapSpanMut(out.data(), /*bit_start=*/i));
+ EXPECT_TRUE(ok);
+ EXPECT_EQ(decoder.remaining(), n_vals - i - 1);
+ }
+ // Exhausted: nothing more can be read or advanced.
+ const bool ok = decoder.Get(BitmapSpanMut(out.data()));
+ EXPECT_FALSE(ok);
+ const auto advanced = decoder.Advance(1);
+ EXPECT_EQ(advanced, 0);
+ EXPECT_EQ(decoder.remaining(), 0);
+ CheckDecodedBits({.actual = out, .expected = expected, .count = n_vals});
+ }
+
+ // Advancing more than available stops at the run boundary.
+ {
+ Decoder decoder(run);
+ const auto advanced = decoder.Advance(n_vals + 100);
+ EXPECT_EQ(advanced, n_vals);
+ EXPECT_EQ(decoder.remaining(), 0);
+ }
+
+ // Reset() rewinds the decoder so the run can be decoded again.
+ {
+ Decoder decoder(run);
+ std::vector<uint8_t>
out_1(static_cast<size_t>(bit_util::BytesForBits(n_vals)), 0);
+ const auto scratch_got = decoder.GetBatch(BitmapSpanMut(out_1.data()),
n_vals);
+ EXPECT_EQ(scratch_got, n_vals);
+ EXPECT_EQ(decoder.remaining(), 0);
+
+ decoder.Reset(run);
+ EXPECT_EQ(decoder.remaining(), n_vals);
+ std::vector<uint8_t>
out_2(static_cast<size_t>(bit_util::BytesForBits(n_vals)), 0);
+ const auto got = decoder.GetBatch(BitmapSpanMut(out_2.data()), n_vals);
+ EXPECT_EQ(got, n_vals);
+ CheckDecodedBits({.actual = out_2, .expected = expected, .count = n_vals});
+ }
+}
+
+} // namespace
+
+/***************************
+ * RleRunToBitmapDecoder *
+ ***************************/
+
+class RleRunToBitmapDecoderTest : public ::testing::TestWithParam<rle_size_t>
{};
+
+TEST_P(RleRunToBitmapDecoderTest, Decode) {
+ const auto& count = GetParam();
+
+ // Only two possible repeated value
+ for (bool value : {true, false}) {
+ ARROW_SCOPED_TRACE("value = ", value);
+
+ // A boolean RLE run stores its value in a single (1-bit-wide) byte.
+ const uint8_t data = value ? 1 : 0;
+ const auto run = RleRun(&data, count, /*value_bit_width=*/1);
+
+ // value() reports the repeated boolean.
+ RleRunToBitmapDecoder decoder(run);
+ EXPECT_EQ(decoder.value(), value);
+
+ const std::vector<bool> expected(count, value);
+ CheckBitmapDecoder<RleRunToBitmapDecoder>(run, expected);
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P( //
+ RleBitmap, RleRunToBitmapDecoderTest,
+ ::testing::Values(0, 1, 3, 8, 9, 13, 64, 100, 100, 1000));
Review Comment:
Why twice `100`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]