pitrou commented on code in PR #47294: URL: https://github.com/apache/arrow/pull/47294#discussion_r2322130602
########## cpp/src/arrow/util/rle_encoding_test.cc: ########## @@ -207,12 +209,310 @@ TEST(BitUtil, RoundTripIntValues) { } } +/// A Rle run is a simple class owning some data and a repetition count. +/// It does not know how to read such data. +TEST(Rle, RleRun) { + const std::array<RleRun::byte, 4> value = {21, 2, 0, 0}; + + RleRun::values_count_type value_count = 12; + + // 12 times the value 21 fitting over 5 bits + auto const run_5 = RleRun(value.data(), value_count, /* value_bit_width= */ 5); + EXPECT_EQ(run_5.ValuesCount(), value_count); + EXPECT_EQ(run_5.ValuesBitWidth(), 5); + EXPECT_EQ(run_5.RawDataSize(), 1); // 5 bits fit in one byte + EXPECT_EQ(*run_5.RawDataPtr(), 21); + + // 12 times the value 21 fitting over 16 bits + auto const run_8 = RleRun(value.data(), value_count, /* value_bit_width= */ 8); + EXPECT_EQ(run_8.ValuesCount(), value_count); + EXPECT_EQ(run_8.ValuesBitWidth(), 8); + EXPECT_EQ(run_8.RawDataSize(), 1); // 8 bits fit in 1 byte + EXPECT_EQ(*run_8.RawDataPtr(), 21); + + // 12 times the value {21, 2} fitting over 10 bits + auto const run_10 = RleRun(value.data(), value_count, /* value_bit_width= */ 10); + + EXPECT_EQ(run_10.ValuesCount(), value_count); + EXPECT_EQ(run_10.ValuesBitWidth(), 10); + EXPECT_EQ(run_10.RawDataSize(), 2); // 10 bits fit in 2 bytes + EXPECT_EQ(*(run_10.RawDataPtr() + 0), 21); + EXPECT_EQ(*(run_10.RawDataPtr() + 1), 2); + + // 12 times the value {21, 2} fitting over 32 bits + auto const run_32 = RleRun(value.data(), value_count, /* value_bit_width= */ 32); + EXPECT_EQ(run_32.ValuesCount(), value_count); + EXPECT_EQ(run_32.ValuesBitWidth(), 32); + EXPECT_EQ(run_32.RawDataSize(), 4); // 32 bits fit in 4 bytes + EXPECT_EQ(*(run_32.RawDataPtr() + 0), 21); + EXPECT_EQ(*(run_32.RawDataPtr() + 1), 2); + EXPECT_EQ(*(run_32.RawDataPtr() + 2), 0); + EXPECT_EQ(*(run_32.RawDataPtr() + 3), 0); +} + +/// A BitPacked run is a simple class owning some data and its size. +/// It does not know how to read such data. +TEST(BitPacked, BitPackedRun) { + const std::array<BitPackedRun::byte, 4> value = {0b10101010, 0, 0, 0b1111111}; + + /// 16 values of 1 bit for a total of 16 bits + BitPackedRun::values_count_type value_count_1 = 16; + auto const run_1 = BitPackedRun(value.data(), value_count_1, /* value_bit_width= */ 1); + EXPECT_EQ(run_1.ValuesCount(), value_count_1); + EXPECT_EQ(run_1.ValuesBitWidth(), 1); + EXPECT_EQ(run_1.RawDataSize(), 2); // 16 bits fit in 2 bytes + for (BitPackedRun::raw_data_size_type i = 0; i < run_1.RawDataSize(); ++i) { + EXPECT_EQ(*(run_1.RawDataPtr() + i), value[i]); + } + + /// 8 values of 3 bits for a total of 24 bits + BitPackedRun::values_count_type value_count_3 = 8; + auto const run_3 = BitPackedRun(value.data(), value_count_3, /* value_bit_width= */ 3); + EXPECT_EQ(run_3.ValuesCount(), value_count_3); + EXPECT_EQ(run_3.ValuesBitWidth(), 3); + EXPECT_EQ(run_3.RawDataSize(), 3); // 24 bits fit in 3 bytes + for (BitPackedRun::raw_data_size_type i = 0; i < run_3.RawDataSize(); ++i) { + EXPECT_EQ(*(run_3.RawDataPtr() + i), value[i]); + } +} + +template <typename T> +void TestRleDecoder(std::vector<RleRun::byte> bytes, + RleRun::values_count_type value_count, + RleRun::bit_size_type bit_width) { + // Pre-requisite for this test + EXPECT_GT(value_count, 6); + + // Compute value associated with bytes encoded as little endian + T value = 0; + for (std::size_t i = 0; i < bytes.size(); ++i) { + value += static_cast<T>(bytes.at(i)) << (8 * i); + } + + auto const run = RleRun(bytes.data(), value_count, bit_width); + + auto decoder = RleDecoder<T>(run); + std::vector<T> vals = {0, 0}; + + EXPECT_EQ(decoder.Remaining(), value_count); + + typename decltype(decoder)::values_count_type read = 0; + EXPECT_EQ(decoder.Get(vals.data()), 1); + read += 1; + EXPECT_EQ(vals.at(0), value); + EXPECT_EQ(decoder.Remaining(), value_count - read); + + EXPECT_EQ(decoder.Advance(3), 3); + read += 3; + EXPECT_EQ(decoder.Remaining(), value_count - read); + + vals = {0, 0}; + EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); + EXPECT_EQ(vals.at(0), value); + EXPECT_EQ(vals.at(1), value); + read += static_cast<decltype(read)>(vals.size()); + EXPECT_EQ(decoder.Remaining(), value_count - read); + + // Exhaust iteration + EXPECT_EQ(decoder.Advance(value_count - read), value_count - read); + EXPECT_EQ(decoder.Remaining(), 0); + EXPECT_EQ(decoder.Advance(1), 0); + vals = {0, 0}; + EXPECT_EQ(decoder.Get(vals.data()), 0); + EXPECT_EQ(vals.at(0), 0); + + // Reset the decoder + decoder.Reset(run); + EXPECT_EQ(decoder.Remaining(), value_count); + vals = {0, 0}; + EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); + EXPECT_EQ(vals.at(0), value); + EXPECT_EQ(vals.at(1), value); +} + +TEST(Rle, RleDecoder) { + TestRleDecoder<uint32_t>({21, 0, 0}, /* value_count= */ 21, /* bit_width= */ 5); + TestRleDecoder<uint16_t>({1, 0}, /* value_count= */ 13, /* bit_width= */ 1); + TestRleDecoder<uint64_t>({21, 2, 0, 1}, /* value_count= */ 20, /* bit_width= */ 30); +} + +template <typename T> +void TestBitPackedDecoder(std::vector<RleRun::byte> bytes, + BitPackedRun::values_count_type value_count, + BitPackedRun::bit_size_type bit_width, + std::vector<T> expected) { + // Pre-requisite for this test + EXPECT_GT(value_count, 6); + + auto const run = BitPackedRun(bytes.data(), value_count, bit_width); + + auto decoder = BitPackedDecoder<T>(run); + std::vector<T> vals = {0, 0}; + + EXPECT_EQ(decoder.Remaining(), value_count); + + typename decltype(decoder)::values_count_type read = 0; + EXPECT_EQ(decoder.Get(vals.data()), 1); + EXPECT_EQ(vals.at(0), expected.at(0 + read)); + read += 1; + EXPECT_EQ(decoder.Remaining(), value_count - read); + + EXPECT_EQ(decoder.Advance(3), 3); + read += 3; + EXPECT_EQ(decoder.Remaining(), value_count - read); + + vals = {0, 0}; + EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); + EXPECT_EQ(vals.at(0), expected.at(0 + read)); + EXPECT_EQ(vals.at(1), expected.at(1 + read)); + read += static_cast<decltype(read)>(vals.size()); + EXPECT_EQ(decoder.Remaining(), value_count - read); + + // Exhaust iteration + EXPECT_EQ(decoder.Advance(value_count - read), value_count - read); + EXPECT_EQ(decoder.Remaining(), 0); + EXPECT_EQ(decoder.Advance(1), 0); + vals = {0, 0}; + EXPECT_EQ(decoder.Get(vals.data()), 0); + EXPECT_EQ(vals.at(0), 0); + + // Reset the decoder + decoder.Reset(run); + read = 0; + EXPECT_EQ(decoder.Remaining(), value_count); + vals = {0, 0}; + EXPECT_EQ(decoder.GetBatch(vals.data(), 2), vals.size()); + EXPECT_EQ(vals.at(0), expected.at(0 + read)); + EXPECT_EQ(vals.at(1), expected.at(1 + read)); +} + +TEST(BitPacked, BitPackedDecoder) { + /// See parquet encoding for bytes layout + TestBitPackedDecoder<uint16_t>( + /* bytes= */ {0x88, 0xc6, 0xfa}, + /* values_count= */ 8, + /* bit_width= */ 3, + /* expected= */ {0, 1, 2, 3, 4, 5, 6, 7}); + TestBitPackedDecoder<uint8_t>( + /* bytes= */ {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7}, + /* values_count= */ 8, + /* bit_width= */ 8, + /* expected= */ {0, 1, 2, 3, 4, 5, 6, 7}); + TestBitPackedDecoder<uint32_t>( + /* bytes= */ {0x47, 0xc, 0x10, 0x35}, + /* values_count= */ 8, + /* bit_width= */ 4, + /* expected= */ {7, 4, 12, 0, 0, 1, 5, 3}); + TestBitPackedDecoder<uint64_t>( + /* bytes= */ {0xe8, 0x7, 0x20, 0xc0, 0x0, 0x4, 0x14, 0x60, 0xc0, 0x1}, + /* values_count= */ 8, + /* bit_width= */ 10, + /* expected= */ {1000, 1, 2, 3, 4, 5, 6, 7}); +} + +template <typename T> +void TestRleBitPackedParser(std::vector<RleRun::byte> bytes, + RleBitPackedParser::bit_size_type bit_width, + std::vector<T> expected) { + auto parser = RleBitPackedParser( + bytes.data(), static_cast<BitPackedRun::raw_data_size_type>(bytes.size()), + bit_width); + EXPECT_FALSE(parser.Exhausted()); + + // Peek return the same data + auto run1 = parser.Peek(); + EXPECT_TRUE(run1.has_value()); + auto run2 = parser.Peek(); + EXPECT_TRUE(run2.has_value()); + auto ptr1 = std::visit([](auto const& r) { return r.RawDataPtr(); }, run1.value()); + auto size1 = std::visit([](auto const& r) { return r.RawDataSize(); }, run1.value()); + auto ptr2 = std::visit([](auto const& r) { return r.RawDataPtr(); }, run2.value()); + auto size2 = std::visit([](auto const& r) { return r.RawDataSize(); }, run2.value()); + EXPECT_TRUE(std::equal(ptr1, ptr1 + size1, ptr2, ptr2 + size2)); + EXPECT_FALSE(parser.Exhausted()); + + // Try to decode all data of all runs in the decoded vector + decltype(expected) decoded = {}; + auto rle_decoder = RleDecoder<T>(); + auto bit_packed_decoder = BitPackedDecoder<T>(); + // Iterate over all runs + while (auto run = parser.Next()) { + EXPECT_TRUE(run.has_value()); + + if (std::holds_alternative<RleRun>(run.value())) { + rle_decoder.Reset(std::get<RleRun>(run.value())); Review Comment: You could directly declare `rle_decoder` here, there is not point in having being in the outer scope. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org