(arrow) branch main updated: GH-48245: [C++][Parquet] Simplify GetVlqInt (#48237)

apitrou Tue, 25 Nov 2025 02:25:14 -0800

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 11d64b588e GH-48245: [C++][Parquet] Simplify GetVlqInt (#48237)
11d64b588e is described below

commit 11d64b588edc481a705fd13453a254252e343409
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Nov 25 11:25:02 2025 +0100

    GH-48245: [C++][Parquet] Simplify GetVlqInt (#48237)
    
    ### Rationale for this change
    
    The `BitReader::GetVlqInt` implementation currently tries to read first from the cached value before falling back to reading from the buffer.
    
    But this doesn't bring any benefit, since both code paths lead to the same processing step afterwards. So we can remove the code path that tries to read from the cached value. This will also make it easier to support big-endian platforms.
    
    ### Are these changes tested?
    
    Yes, by existing tests.
    
    ### Are there any user-facing changes?
    
    No.
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/util/bit_stream_utils_internal.h | 42 +++++++++++++-------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h
index d8c7317fe8..376de56a9a 100644
--- a/cpp/src/arrow/util/bit_stream_utils_internal.h
+++ b/cpp/src/arrow/util/bit_stream_utils_internal.h
@@ -160,6 +160,10 @@ class BitReader {
   /// are not enough bits left.
   bool Advance(int64_t num_bits);
 
+  /// Advance the stream by a number of bytes, ignoring remaining bits.
+  /// Returns true on success or false if there are not enough bytes left.
+  bool AdvanceBytes(int num_bytes);
+
   /// Reads a vlq encoded int from the stream.  The encoded int must start at
   /// the beginning of a byte. Return false if there were not enough bytes in
   /// the buffer.
@@ -328,6 +332,17 @@ inline bool BitReader::Advance(int64_t num_bits) {
   return true;
 }
 
+inline bool BitReader::AdvanceBytes(int num_bytes) {
+  if (ARROW_PREDICT_FALSE(num_bytes > max_bytes_ - byte_offset_)) {
+    return false;
+  }
+  byte_offset_ += num_bytes;
+  bit_offset_ = 0;
+  buffered_values_ =
+      detail::ReadLittleEndianWord(buffer_ + byte_offset_, max_bytes_ - byte_offset_);
+  return true;
+}
+
 template <typename Int>
 inline bool BitWriter::PutVlqInt(Int v) {
   static_assert(std::is_integral_v<Int>);
@@ -362,25 +377,10 @@ inline bool BitReader::GetVlqInt(Int* v) {
   static_assert(std::is_integral_v<Int>);
 
   // The data that we will pass to the LEB128 parser
-  // In all case, we read a byte-aligned value, skipping remaining bits
-  const uint8_t* data = NULLPTR;
-  int max_size = 0;
-
-  // Number of bytes left in the buffered values, not including the current
-  // byte (i.e., there may be an additional fraction of a byte).
-  const int bytes_left_in_cache =
-      sizeof(buffered_values_) - static_cast<int>(bit_util::BytesForBits(bit_offset_));
-
-  // If there are clearly enough bytes left we can try to parse from the cache
-  if (bytes_left_in_cache >= kMaxLEB128ByteLenFor<Int>) {
-    max_size = bytes_left_in_cache;
-    data = reinterpret_cast<const uint8_t*>(&buffered_values_) +
-           bit_util::BytesForBits(bit_offset_);
-    // Otherwise, we try straight from buffer (ignoring few bytes that may be cached)
-  } else {
-    max_size = bytes_left();
-    data = buffer_ + (max_bytes_ - max_size);
-  }
+  // We read a byte-aligned value, skipping remaining bits.
+  // Also, we don't bother with the cache since the decoding would be the same.
+  int max_size = bytes_left();
+  const uint8_t* data = buffer_ + (max_bytes_ - max_size);
 
   const auto bytes_read = bit_util::ParseLeadingLEB128(data, max_size, v);
   if (ARROW_PREDICT_FALSE(bytes_read == 0)) {
@@ -388,8 +388,8 @@ inline bool BitReader::GetVlqInt(Int* v) {
     return false;
   }
 
-  // Advance for the bytes we have read + the bits we skipped
-  return Advance((8 * bytes_read) + (bit_offset_ % 8));
+  // Advance for the bytes we have read
+  return AdvanceBytes(bytes_read);
 }
 
 template <typename Int>

(arrow) branch main updated: GH-48245: [C++][Parquet] Simplify GetVlqInt (#48237)

Reply via email to