kou commented on code in PR #48195:
URL: https://github.com/apache/arrow/pull/48195#discussion_r2551139272


##########
cpp/src/arrow/util/bit_stream_utils_internal.h:
##########
@@ -381,6 +384,17 @@ inline bool BitReader::GetVlqInt(Int* v) {
     max_size = bytes_left();
     data = buffer_ + (max_bytes_ - max_size);
   }
+#else
+  // For VLQ reading, always read directly from buffer to avoid endianness 
issues
+  // with buffered_values_ on big-endian systems like s390x

Review Comment:
   ```suggestion
     // with buffered_values_ on big-endian systems like s390x.
   ```



##########
cpp/src/arrow/util/bit_stream_utils_internal.h:
##########
@@ -365,6 +365,9 @@ inline bool BitReader::GetVlqInt(Int* v) {
   // In all case, we read a byte-aligned value, skipping remaining bits
   const uint8_t* data = NULLPTR;
   int max_size = 0;
+#if ARROW_LITTLE_ENDIAN
+  // The data that we will pass to the LEB128 parser

Review Comment:
   ```suggestion
     // The data that we will pass to the LEB128 parser.
   ```



##########
testing:
##########


Review Comment:
   Could you revert this change?
   (This is needless, right?)



##########
cpp/src/arrow/util/bit_stream_utils_internal.h:
##########
@@ -381,6 +384,17 @@ inline bool BitReader::GetVlqInt(Int* v) {
     max_size = bytes_left();
     data = buffer_ + (max_bytes_ - max_size);
   }
+#else
+  // For VLQ reading, always read directly from buffer to avoid endianness 
issues
+  // with buffered_values_ on big-endian systems like s390x
+  // Calculate current position in buffer accounting for bit offset

Review Comment:
   ```suggestion
     // Calculate current position in buffer accounting for bit offset.
   ```



##########
cpp/src/arrow/util/bit_stream_utils_internal.h:
##########
@@ -381,6 +384,17 @@ inline bool BitReader::GetVlqInt(Int* v) {
     max_size = bytes_left();
     data = buffer_ + (max_bytes_ - max_size);
   }
+#else
+  // For VLQ reading, always read directly from buffer to avoid endianness 
issues
+  // with buffered_values_ on big-endian systems like s390x
+  // Calculate current position in buffer accounting for bit offset
+  const int current_byte_offset = byte_offset_ + 
bit_util::BytesForBits(bit_offset_);
+  const int bytes_left_in_buffer = max_bytes_ - current_byte_offset;
+
+  // Always read from buffer directly to avoid endianness issues
+  data = buffer_ + current_byte_offset;
+  max_size = bytes_left_in_buffer;

Review Comment:
   Does this the same logic as 
https://github.com/apache/arrow/blob/2fb2f79a18962875d7f6adc5115666fc0bfea342/cpp/src/arrow/util/bit_stream_utils_internal.h#L380-L383
 ?
   
   If so, should we reuse it something like the following?
   
   ```diff
   diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h 
b/cpp/src/arrow/util/bit_stream_utils_internal.h
   index d8c7317fe8..7352312782 100644
   --- a/cpp/src/arrow/util/bit_stream_utils_internal.h
   +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h
   @@ -366,6 +366,9 @@ inline bool BitReader::GetVlqInt(Int* v) {
      const uint8_t* data = NULLPTR;
      int max_size = 0;
    
   +#if ARROW_LITTLE_ENDIAN
   +  // TODO: Describe why we need this only for little-endian.
   +
      // Number of bytes left in the buffered values, not including the current
      // byte (i.e., there may be an additional fraction of a byte).
      const int bytes_left_in_cache =
   @@ -377,7 +380,9 @@ inline bool BitReader::GetVlqInt(Int* v) {
        data = reinterpret_cast<const uint8_t*>(&buffered_values_) +
               bit_util::BytesForBits(bit_offset_);
        // Otherwise, we try straight from buffer (ignoring few bytes that may 
be cached)
   -  } else {
   +  } else
   +#endif
   +  {
        max_size = bytes_left();
        data = buffer_ + (max_bytes_ - max_size);
      }
   ```



##########
cpp/src/arrow/util/bit_stream_utils_internal.h:
##########
@@ -365,6 +365,9 @@ inline bool BitReader::GetVlqInt(Int* v) {
   // In all case, we read a byte-aligned value, skipping remaining bits
   const uint8_t* data = NULLPTR;
   int max_size = 0;
+#if ARROW_LITTLE_ENDIAN
+  // The data that we will pass to the LEB128 parser
+  // In all case, we read a byte-aligned value, skipping remaining bits

Review Comment:
   ```suggestion
     // In all case, we read a byte-aligned value, skipping remaining bits.
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to