AntoinePrv commented on code in PR #47294:
URL: https://github.com/apache/arrow/pull/47294#discussion_r2349567807


##########
cpp/src/arrow/util/rle_encoding_internal.h:
##########
@@ -84,32 +85,278 @@ namespace util {
 /// (total 26 bytes, 1 byte overhead)
 //
 
+class RleRun {
+ public:
+  using byte = uint8_t;
+  /// Enough space to store a 64bit value
+  using raw_data_storage = std::array<byte, 8>;
+  using raw_data_const_pointer = const byte*;
+  using raw_data_size_type = int32_t;
+  /// The type of the size of either run, between 1 and 2^31-1 as per Parquet 
spec
+  using values_count_type = int32_t;
+  /// The type to represent a size in bits
+  using bit_size_type = int32_t;
+
+  constexpr RleRun() noexcept = default;
+  constexpr RleRun(RleRun const&) noexcept = default;
+  constexpr RleRun(RleRun&&) noexcept = default;
+
+  explicit RleRun(raw_data_const_pointer data, values_count_type values_count,
+                  bit_size_type value_bit_width) noexcept;
+
+  constexpr RleRun& operator=(RleRun const&) noexcept = default;
+  constexpr RleRun& operator=(RleRun&&) noexcept = default;
+
+  /// The number of repeated values in this run.
+  [[nodiscard]] constexpr values_count_type ValuesCount() const noexcept;
+
+  /// The size in bits of each encoded value.
+  [[nodiscard]] constexpr bit_size_type ValuesBitWidth() const noexcept;
+
+  /// A pointer to the repeated value raw bytes.
+  [[nodiscard]] constexpr raw_data_const_pointer RawDataPtr() const noexcept;
+
+  /// The number of bytes used for the raw repeated value.
+  [[nodiscard]] constexpr raw_data_size_type RawDataSize() const noexcept;
+
+ private:
+  /// The repeated value raw bytes stored inside the class
+  raw_data_storage data_ = {};
+  /// The number of time the value is repeated
+  values_count_type values_count_ = 0;
+  /// The size in bit of a packed value in the run
+  bit_size_type value_bit_width_ = 0;
+};
+
+class BitPackedRun {
+ public:
+  using byte = uint8_t;
+  using raw_data_const_pointer = const byte*;
+  /// According to the Parquet thrift definition the page size can be written 
into an
+  /// int32_t.
+  using raw_data_size_type = int32_t;
+  /// The type of the size of either run, between 1 and 2^31-1 as per Parquet 
spec
+  using values_count_type = int32_t;
+  /// The type to represent a size in bits
+  using bit_size_type = int32_t;
+
+  constexpr BitPackedRun() noexcept = default;
+  constexpr BitPackedRun(BitPackedRun const&) noexcept = default;
+  constexpr BitPackedRun(BitPackedRun&&) noexcept = default;
+
+  constexpr BitPackedRun(raw_data_const_pointer data, values_count_type 
values_count,
+                         bit_size_type value_bit_width) noexcept;
+
+  constexpr BitPackedRun& operator=(BitPackedRun const&) noexcept = default;
+  constexpr BitPackedRun& operator=(BitPackedRun&&) noexcept = default;
+
+  [[nodiscard]] constexpr values_count_type ValuesCount() const noexcept;
+
+  /// The size in bits of each encoded value.
+  [[nodiscard]] constexpr bit_size_type ValuesBitWidth() const noexcept;
+
+  [[nodiscard]] constexpr raw_data_const_pointer RawDataPtr() const noexcept;
+
+  [[nodiscard]] constexpr raw_data_size_type RawDataSize() const noexcept;
+
+ private:
+  /// The pointer to the beginning of the run
+  raw_data_const_pointer data_ = nullptr;
+  /// Number of values in this run.
+  raw_data_size_type values_count_ = 0;
+  /// The size in bit of a packed value in the run
+  bit_size_type value_bit_width_ = 0;
+};
+
+/// A parser that emits either a ``BitPackedRun`` or a ``RleRun``.
+class RleBitPackedParser {
+ public:
+  using byte = uint8_t;
+  using raw_data_const_pointer = const byte*;
+  /// By Parquet thrift definition the page size can be written into an 
int32_t.
+  using raw_data_size_type = int32_t;
+  /// The type to represent a size in bits
+  using bit_size_type = int32_t;
+  /// The different types of runs emitted by the parser
+  using dynamic_run_type = std::variant<RleRun, BitPackedRun>;
+
+  constexpr RleBitPackedParser() noexcept = default;
+
+  constexpr RleBitPackedParser(raw_data_const_pointer data, raw_data_size_type 
data_size,
+                               bit_size_type value_bit_width) noexcept;
+
+  constexpr void Reset(raw_data_const_pointer data, raw_data_size_type 
data_size,
+                       bit_size_type value_bit_width_) noexcept;
+
+  /// Get the current run with a small parsing cost without advancing the 
iteration.
+  [[nodiscard]] std::optional<dynamic_run_type> Peek() const;
+
+  /// Move to the next run.
+  [[nodiscard]] bool Advance();
+
+  /// Advance and return the current run.
+  [[nodiscard]] std::optional<dynamic_run_type> Next();
+
+  /// Whether there is still runs to iterate over.
+  ///
+  /// WARN: Due to lack of proper error handling, iteration with Next and Peek 
could
+  /// return not data while the parser is not exhausted.
+  /// This is how one can check for errors.
+  [[nodiscard]] bool Exhausted() const;
+
+  /// Enum to return from an ``Parse`` handler.
+  ///
+  /// Since a callback has no way to know when to stop, the handler must return
+  /// a value indicating to the ``Parse`` function whether to stop or continue.
+  enum class ControlFlow {
+    Continue,
+    Break,
+  };
+
+  /// A callback approach to parsing.
+  ///
+  /// This approach is used to reduce the number of dynamic lookups involved 
with using a
+  /// variant.
+  ///
+  /// The handler must be of the form
+  /// ```cpp`
+  /// struct Handler {
+  ///   ControlFlow OnBitPackedRun(BitPackedRun run);
+  ///
+  ///   ControlFlow OnRleRun(RleRun run);
+  /// };
+  /// ```
+  template <typename Handler>
+  void Parse(Handler&& handler);
+
+ private:
+  /// The pointer to the beginning of the run
+  raw_data_const_pointer data_ = nullptr;
+  /// Size in bytes of the run.
+  raw_data_size_type data_size_ = 0;
+  /// The size in bit of a packed value in the run
+  bit_size_type value_bit_width_ = 0;
+
+  /// Run the handler on the run read and return the number of values read.
+  /// Does not advance the parser.
+  template <typename Handler>
+  std::pair<raw_data_size_type, ControlFlow> PeekImpl(Handler&&) const;
+};
+
 /// Decoder class for RLE encoded data.
+template <typename T>
 class RleDecoder {
  public:
-  /// Create a decoder object. buffer/buffer_len is the decoded data.
-  /// bit_width is the width of each value (before encoding).
-  RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
-      : bit_reader_(buffer, buffer_len),
-        bit_width_(bit_width),
-        current_value_(0),
-        repeat_count_(0),
-        literal_count_(0) {
-    ARROW_DCHECK_GE(bit_width_, 0);
-    ARROW_DCHECK_LE(bit_width_, 64);
-  }
+  /// The type in which the data should be decoded.
+  using value_type = T;
+  /// The type of run that can be decoded.
+  using run_type = RleRun;
+  using values_count_type = run_type::values_count_type;
 
-  RleDecoder() : bit_width_(-1) {}
+  constexpr RleDecoder() noexcept = default;
 
-  void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
-    ARROW_DCHECK_GE(bit_width, 0);
-    ARROW_DCHECK_LE(bit_width, 64);
-    bit_reader_.Reset(buffer, buffer_len);
-    bit_width_ = bit_width;
-    current_value_ = 0;
-    repeat_count_ = 0;
-    literal_count_ = 0;
-  }
+  explicit RleDecoder(run_type const& run) noexcept;
+
+  void Reset(run_type const& run) noexcept;
+
+  /// Return the number of values that can be advanced.
+  [[nodiscard]] values_count_type Remaining() const;

Review Comment:
   Yes, I have applied them liberally in all cases where "*it does nothing if 
you call this function and not use the output*".
   
   > For now, we only use `[[nodiscard]]` in cases where it is dangerous to 
ignore a return value (such as an error).
   
   For all functions reading data, I'd consider it dangerous not to read the 
output (e.g. number of bytes read) since it carries error information (e.g. 
when it is zero). What do you think?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to