AntoinePrv commented on code in PR #47294:
URL: https://github.com/apache/arrow/pull/47294#discussion_r2368427502
##########
cpp/src/arrow/util/rle_encoding_internal.h:
##########
@@ -82,86 +81,400 @@ namespace util {
/// 200 ints = 25 groups of 8
/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
/// (total 26 bytes, 1 byte overhead)
-//
-/// Decoder class for RLE encoded data.
-class RleDecoder {
+/// The type for an encoded Rle of BitPacked run size, between 1 and 2^31-1 as
per Parquet
+/// spec.
+/// This is also pragmatically used for other integer used in the Rle and
BitPacked runs
+/// and decoder to avoid conversions.
+/// It can therefore be referred to as a "typical" size for Rle and BitPacked
logic.
+using rle_size_t = int32_t;
+
+template <typename T>
+class RleRunDecoder;
+
+class RleRun {
public:
- /// Create a decoder object. buffer/buffer_len is the decoded data.
- /// bit_width is the width of each value (before encoding).
- RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
- : bit_reader_(buffer, buffer_len),
- bit_width_(bit_width),
- current_value_(0),
- repeat_count_(0),
- literal_count_(0) {
- ARROW_DCHECK_GE(bit_width_, 0);
- ARROW_DCHECK_LE(bit_width_, 64);
+ /// The decoder class used to decode a single run in the given type.
+ template <typename T>
+ using DecoderType = RleRunDecoder<T>;
+
+ constexpr RleRun() noexcept = default;
+
+ explicit RleRun(const uint8_t* data, rle_size_t values_count,
+ rle_size_t value_bit_width) noexcept
+ : values_count_(values_count), value_bit_width_(value_bit_width) {
+ ARROW_DCHECK_GE(value_bit_width, 0);
+ ARROW_DCHECK_GE(values_count, 0);
+ std::copy(data, data + raw_data_size(), data_.begin());
}
- RleDecoder() : bit_width_(-1) {}
+ /// The number of repeated values in this run.
+ constexpr rle_size_t values_count() const noexcept { return values_count_; }
- void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
- ARROW_DCHECK_GE(bit_width, 0);
- ARROW_DCHECK_LE(bit_width, 64);
- bit_reader_.Reset(buffer, buffer_len);
- bit_width_ = bit_width;
- current_value_ = 0;
- repeat_count_ = 0;
- literal_count_ = 0;
+ /// The size in bits of each encoded value.
+ constexpr rle_size_t values_bit_width() const noexcept { return
value_bit_width_; }
+
+ /// A pointer to the repeated value raw bytes.
+ constexpr const uint8_t* raw_data_ptr() const noexcept { return
data_.data(); }
+
+ /// The number of bytes used for the raw repeated value.
+ constexpr rle_size_t raw_data_size() const noexcept {
+ auto out = bit_util::BytesForBits(value_bit_width_);
+ ARROW_DCHECK_LE(out, std::numeric_limits<rle_size_t>::max());
+ return static_cast<rle_size_t>(out);
}
- /// Gets the next value. Returns false if there are no more.
+ private:
+ /// The repeated value raw bytes stored inside the class with enough space
to store
+ /// up to a 64 bit value.
+ std::array<uint8_t, 8> data_ = {};
+ /// The number of time the value is repeated.
+ rle_size_t values_count_ = 0;
+ /// The size in bit of a packed value in the run.
+ rle_size_t value_bit_width_ = 0;
+};
+
+template <typename T>
+class BitPackedRunDecoder;
+
+class BitPackedRun {
+ public:
+ /// The decoder class used to decode a single run in the given type.
+ template <typename T>
+ using DecoderType = BitPackedRunDecoder<T>;
+
+ constexpr BitPackedRun() noexcept = default;
+
+ constexpr BitPackedRun(const uint8_t* data, rle_size_t values_count,
+ rle_size_t value_bit_width) noexcept
+ : data_(data), values_count_(values_count),
value_bit_width_(value_bit_width) {
+ ARROW_CHECK_GE(value_bit_width_, 0);
+ ARROW_CHECK_GE(values_count_, 0);
+ }
+
+ constexpr rle_size_t values_count() const noexcept { return values_count_; }
+
+ /// The size in bits of each encoded value.
+ constexpr rle_size_t values_bit_width() const noexcept { return
value_bit_width_; }
+
+ constexpr const uint8_t* raw_data_ptr() const noexcept { return data_; }
+
+ constexpr rle_size_t raw_data_size() const noexcept {
+ auto out = bit_util::BytesForBits(static_cast<int64_t>(value_bit_width_) *
+ static_cast<int64_t>(values_count_));
+ ARROW_CHECK_LE(out, std::numeric_limits<rle_size_t>::max());
+ return static_cast<rle_size_t>(out);
+ }
+
+ private:
+ /// The pointer to the beginning of the run
+ const uint8_t* data_ = nullptr;
+ /// Number of values in this run.
+ rle_size_t values_count_ = 0;
+ /// The size in bit of a packed value in the run
+ rle_size_t value_bit_width_ = 0;
+};
+
+/// A parser that emits either a ``BitPackedRun`` or a ``RleRun``.
+class RleBitPackedParser {
+ public:
+ /// The different types of runs emitted by the parser
+ using dynamic_run_type = std::variant<RleRun, BitPackedRun>;
+
+ constexpr RleBitPackedParser() noexcept = default;
+
+ constexpr RleBitPackedParser(const uint8_t* data, rle_size_t data_size,
+ rle_size_t value_bit_width) noexcept
+ : data_(data), data_size_(data_size), value_bit_width_(value_bit_width)
{}
+
+ constexpr void Reset(const uint8_t* data, rle_size_t data_size,
+ rle_size_t value_bit_width) noexcept {
+ *this = {data, data_size, value_bit_width};
+ }
+
+ /// Whether there is still runs to iterate over.
+ ///
+ /// WARN: Due to simplistic error handling, iteration with Next and Peek
could
+ /// fail to return data while the parser is not exhausted.
+ /// This is how one can check for errors.
+ bool exhausted() const { return data_size_ == 0; }
+
+ /// Enum to return from an ``Parse`` handler.
+ ///
+ /// Since a callback has no way to know when to stop, the handler must return
+ /// a value indicating to the ``Parse`` function whether to stop or continue.
+ enum class ControlFlow {
+ Continue,
+ Break,
+ };
+
+ /// A callback approach to parsing.
+ ///
+ /// This approach is used to reduce the number of dynamic lookups involved
with using a
+ /// variant.
+ ///
+ /// The handler must be of the form
+ /// ```cpp
+ /// struct Handler {
+ /// ControlFlow OnBitPackedRun(BitPackedRun run);
+ ///
+ /// ControlFlow OnRleRun(RleRun run);
+ /// };
+ /// ```
+ template <typename Handler>
+ void Parse(Handler&& handler);
+
+ private:
+ /// The pointer to the beginning of the run
+ const uint8_t* data_ = nullptr;
+ /// Size in bytes of the run.
+ rle_size_t data_size_ = 0;
+ /// The size in bit of a packed value in the run
+ rle_size_t value_bit_width_ = 0;
Review Comment:
Actually we cannot "reconstruct a parser" through assignment if one of the
member is const, so this is not easily doable.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]