AntoinePrv commented on code in PR #50217:
URL: https://github.com/apache/arrow/pull/50217#discussion_r3442881221


##########
cpp/src/arrow/util/rle_bitmap_test.cc:
##########
@@ -0,0 +1,508 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <array>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/rle_bitmap_internal.h"
+#include "arrow/util/rle_encoding_internal.h"
+
+namespace arrow::util {
+
+namespace {
+
+/// Read the first `count` bits of `bytes` (LSB first) into a vector of booleans.
+std::vector<bool> BitsFromBytes(const std::vector<uint8_t>& bytes, rle_size_t count) {
+  std::vector<bool> bits(count);
+  for (rle_size_t i = 0; i < count; ++i) {
+    bits[i] = bit_util::GetBit(bytes.data(), i);
+  }
+  return bits;
+}
+
+/// Check the decoded output in `out` against `expected`.
+/// Bits `out[out_offset..out_offset + count]` must equal `expected[skip..skip + count]`.
+/// The `out_offset` bits before them must still be zero.
+void CheckDecodedBits(const std::vector<uint8_t>& out, const std::vector<bool>& expected,
+                      rle_size_t count, rle_size_t out_offset = 0, rle_size_t skip = 0) {
+  ARROW_SCOPED_TRACE("out_offset = ", out_offset, ", skip = ", skip);
+  for (rle_size_t i = 0; i < out_offset; ++i) {
+    EXPECT_FALSE(bit_util::GetBit(out.data(), i)) << "clobbered bit " << i;
+  }
+  for (rle_size_t i = 0; i < count; ++i) {
+    EXPECT_EQ(bit_util::GetBit(out.data(), out_offset + i), expected[skip + i])
+        << "at bit " << i;
+  }
+}
+
+/// Skip the first `skip` values with Advance(), then decode the rest of the run
+/// into one output bitmap, `chunk` values at a time. Compare against `expected`.
+///
+/// `chunk` controls output bit alignment. When `chunk` is not a multiple of 8,
+/// later calls start at a non-zero output bit offset.
+///
+/// `skip` shifts the decoder's read offset relative to the output offset.
+/// A non-zero `skip` makes the two differ, which exercises the bit-unaligned read
+/// path of BitPackedRunToBitmapDecoder. With `skip == 0` they stay in sync and
+/// only the aligned path runs.
+template <typename Decoder>
+void CheckChunkedDecode(const typename Decoder::RunType& run,
+                        const std::vector<bool>& expected, rle_size_t chunk = 1,
+                        rle_size_t skip = 0) {
+  ARROW_SCOPED_TRACE("chunk = ", chunk, ", skip = ", skip);
+  const auto n_vals = static_cast<rle_size_t>(expected.size());
+  ASSERT_LE(skip, n_vals);
+
+  Decoder decoder(run);
+  const auto advanced = decoder.Advance(skip);
+  ASSERT_EQ(advanced, skip);
+  const auto rest = n_vals - skip;
+
+  // Output buffer with one guard byte to catch out-of-bounds writes.
+  std::vector<uint8_t> out(static_cast<size_t>(bit_util::BytesForBits(rest)) + 1, 0);
+  const uint8_t guard = 0xA5;
+  out.back() = guard;
+
+  rle_size_t read = 0;
+  while (read < rest) {
+    const auto want = std::min(chunk, rest - read);
+    const auto got =
+        decoder.GetBatch(BitmapSpanMut(out.data(), /*bit_start=*/read), want);
+    EXPECT_EQ(got, want) << "at pos " << read;
+    ASSERT_GT(got, 0) << "at pos " << read;  // break on failure
+    read += got;
+    EXPECT_EQ(decoder.remaining(), rest - read);
+  }
+
+  EXPECT_EQ(decoder.remaining(), 0);
+  EXPECT_EQ(out.back(), guard) << "decoder wrote past the end of the output";
+  CheckDecodedBits(out, expected, /*count=*/rest, /*out_offset=*/0, skip);
+}
+
+/// All the checks shared by both decoder types.
+///
+/// `expected` is the full sequence of booleans the run should decode to.
+template <typename Decoder>
+void CheckBitmapDecoder(const typename Decoder::RunType& run,
+                        const std::vector<bool>& expected) {
+  const auto n_vals = static_cast<rle_size_t>(expected.size());
+
+  // remaining() reflects the run size before any value is read.
+  {
+    Decoder decoder(run);
+    EXPECT_EQ(decoder.remaining(), n_vals);
+  }
+
+  // Empty requests are a no-op.
+  {
+    Decoder decoder(run);
+    uint8_t out = 0;
+    const auto got = decoder.GetBatch(BitmapSpanMut(&out), /*batch_size=*/0);
+    EXPECT_EQ(got, 0);
+    EXPECT_EQ(decoder.remaining(), n_vals);
+  }
+
+  // Decode the whole run in several chunks.
+  for (const rle_size_t chunk : {rle_size_t{1}, rle_size_t{3}, rle_size_t{7},
+                                 rle_size_t{8}, rle_size_t{9}, n_vals}) {
+    CheckChunkedDecode<Decoder>(run, expected, chunk);
+  }
+
+  // Decode the whole run in several chunks, after an initial Advance that shifts
+  // the run and output bit alignment.
+  for (const rle_size_t chunk : {rle_size_t{1}, rle_size_t{3}, rle_size_t{7},
+                                 rle_size_t{8}, rle_size_t{9}, n_vals}) {
+    for (rle_size_t skip = 1; skip < 8 && skip < n_vals; ++skip) {
+      CheckChunkedDecode<Decoder>(run, expected, chunk, skip);
+    }
+  }
+
+  // Get() one value at a time, then read past the end.
+  {
+    Decoder decoder(run);
+    std::vector<uint8_t> out(static_cast<size_t>(bit_util::BytesForBits(n_vals)) + 1, 0);
+    for (rle_size_t i = 0; i < n_vals; ++i) {
+      const bool ok = decoder.Get(BitmapSpanMut(out.data(), /*bit_start=*/i));
+      EXPECT_TRUE(ok);
+      EXPECT_EQ(decoder.remaining(), n_vals - i - 1);
+    }
+    // Exhausted: nothing more can be read or advanced.
+    const bool ok = decoder.Get(BitmapSpanMut(out.data()));
+    EXPECT_FALSE(ok);
+    const auto advanced = decoder.Advance(1);
+    EXPECT_EQ(advanced, 0);
+    EXPECT_EQ(decoder.remaining(), 0);
+    CheckDecodedBits(out, expected, /*count=*/n_vals);
+  }
+
+  // Advancing more than available stops at the run boundary.
+  {
+    Decoder decoder(run);
+    const auto advanced = decoder.Advance(n_vals + 100);
+    EXPECT_EQ(advanced, n_vals);
+    EXPECT_EQ(decoder.remaining(), 0);
+  }
+
+  // Reset() rewinds the decoder so the run can be decoded again.
+  {
+    Decoder decoder(run);
+    std::vector<uint8_t> out_1(static_cast<size_t>(bit_util::BytesForBits(n_vals)), 0);
+    const auto scratch_got = decoder.GetBatch(BitmapSpanMut(out_1.data()), n_vals);
+    EXPECT_EQ(scratch_got, n_vals);
+    EXPECT_EQ(decoder.remaining(), 0);
+
+    decoder.Reset(run);
+    EXPECT_EQ(decoder.remaining(), n_vals);
+    std::vector<uint8_t> out_2(static_cast<size_t>(bit_util::BytesForBits(n_vals)), 0);
+    const auto got = decoder.GetBatch(BitmapSpanMut(out_2.data()), n_vals);
+    EXPECT_EQ(got, n_vals);
+    CheckDecodedBits(out_2, expected, /*count=*/n_vals);
+  }
+}
+
+}  // namespace
+
+/***************************
+ *  RleRunToBitmapDecoder  *
+ ***************************/
+
+struct RleBitmapCase {
+  // The repeated boolean value of the run.
+  bool value;
+  // The number of values in the run.
+  rle_size_t count;

Review Comment:
   Done.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to