Re: [PR] GH-48202: [C++][Parquet] Fix encoder & decoder logic to enable Parque… [arrow]

via GitHub Tue, 25 Nov 2025 04:23:18 -0800


Vishwanatha-HD commented on code in PR #48203:
URL: https://github.com/apache/arrow/pull/48203#discussion_r2558667791



##########
cpp/src/parquet/decoder.cc:
##########
@@ -1616,9 +1647,17 @@ class DeltaBitPackDecoder : public 
TypedDecoderImpl<DType> {
       for (int j = 0; j < values_decode; ++j) {
         // Addition between min_delta, packed int and last_value should be 
treated as
         // unsigned addition. Overflow is as expected.
+#if ARROW_LITTLE_ENDIAN
         buffer[i + j] = static_cast<UT>(min_delta_) + static_cast<UT>(buffer[i 
+ j]) +
                         static_cast<UT>(last_value_);
         last_value_ = buffer[i + j];
+#else

Review Comment:
   @pitrou.. If you see the piece of code added for s390x, its as below.. 
           UT temp = static_cast<UT>(min_delta_) + 
static_cast<UT>(static_cast<uint64_t>(buffer[i + j])) + 
static_cast<UT>(last_value_);
           buffer[i + j] = static_cast<T>(temp);
           last_value_ = static_cast<T>(temp);
   
   If you see the second value and the way we are trying to cast the 
buffer[i+j], that is the main difference.. And, its needed to be done on 
s390x.. 



##########
cpp/src/parquet/decoder.cc:
##########
@@ -458,7 +470,24 @@ inline int DecodePlain(const uint8_t* data, int64_t 
data_size, int num_values,
   }
   // If bytes_to_decode == 0, data could be null
   if (bytes_to_decode > 0) {
+#if ARROW_LITTLE_ENDIAN
     memcpy(out, data, static_cast<size_t>(bytes_to_decode));
+#else
+    // On big-endian systems, we need to byte-swap each value
+    // since Parquet data is stored in little-endian format.
+    // Only apply to integer and floating-point types that have 
FromLittleEndian support.
+    if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t> ||
+                  std::is_same_v<T, int64_t> || std::is_same_v<T, uint64_t> ||
+                  std::is_same_v<T, float> || std::is_same_v<T, double>) {
+      for (int i = 0; i < num_values; ++i) {
+        out[i] = ::arrow::bit_util::FromLittleEndian(SafeLoadAs<T>(data));
+        data += sizeof(T);
+      }
+    } else {
+      // For other types (bool, Int96, etc.), just do memcpy

Review Comment:
   @pitrou.. Yes.. I am sure about Int96 type here.. 



##########
cpp/src/parquet/decoder.cc:
##########
@@ -458,7 +470,24 @@ inline int DecodePlain(const uint8_t* data, int64_t 
data_size, int num_values,
   }
   // If bytes_to_decode == 0, data could be null
   if (bytes_to_decode > 0) {
+#if ARROW_LITTLE_ENDIAN
     memcpy(out, data, static_cast<size_t>(bytes_to_decode));
+#else
+    // On big-endian systems, we need to byte-swap each value
+    // since Parquet data is stored in little-endian format.
+    // Only apply to integer and floating-point types that have 
FromLittleEndian support.
+    if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t> ||
+                  std::is_same_v<T, int64_t> || std::is_same_v<T, uint64_t> ||
+                  std::is_same_v<T, float> || std::is_same_v<T, double>) {

Review Comment:
   @pitrou.. Yes.. I initially remember using is_arithmetic_v<T> itself.. WIth 
that the testcases for bool type didnt get fixed.. Hence, I had to handle each 
of the data types individually.. 



##########
cpp/src/parquet/decoder.cc:
##########
@@ -2306,6 +2345,17 @@ class ByteStreamSplitDecoder<FLBAType> : public 
ByteStreamSplitDecoderBase<FLBAT
     const int num_decoded = this->DecodeRaw(decode_out, max_values);
     DCHECK_EQ(num_decoded, max_values);
 
+#if !ARROW_LITTLE_ENDIAN
+    // On big-endian, ByteStreamSplitDecode (DoMergeStreams) reverses stream 
positions
+    // to produce numeric values in native byte order. For FLBA (opaque byte 
arrays),
+    // we need to undo this reversal to preserve the original byte sequence.

Review Comment:
   @pitrou.. I will work on this in the next pass as I tried making the 
changes, but its somehow resulting in testcase failures. May be I missed 
something while doing the required code changes.



##########
cpp/src/parquet/encoder.cc:
##########
@@ -237,6 +259,32 @@ void DirectPutImpl(const ::arrow::Array& values, 
::arrow::BufferBuilder* sink) {
       }
     }
   }
+#else

Review Comment:
   @pitrou.. I have made the code changes to remove the duplicate code paths.. 
I have pushed the changes.. thanks..



##########
cpp/src/parquet/decoder.cc:
##########
@@ -406,9 +407,20 @@ int PlainDecoder<DType>::DecodeArrow(
       VisitBitRuns(valid_bits, valid_bits_offset, num_values,
                    [&](int64_t position, int64_t run_length, bool is_valid) {
                      if (is_valid) {
+#if ARROW_LITTLE_ENDIAN
                        RETURN_NOT_OK(builder->AppendValues(
                            reinterpret_cast<const value_type*>(data), 
run_length));
                        data += run_length * sizeof(value_type);
+#else
+                       // On big-endian systems, we need to byte-swap each 
value
+                       // since Parquet data is stored in little-endian format

Review Comment:
   @pitrou.. I will definitely try if I can replace this part of code with 
AppendValues.. Thanks.. 



##########
cpp/src/parquet/decoder.cc:
##########
@@ -458,7 +470,24 @@ inline int DecodePlain(const uint8_t* data, int64_t 
data_size, int num_values,
   }
   // If bytes_to_decode == 0, data could be null
   if (bytes_to_decode > 0) {
+#if ARROW_LITTLE_ENDIAN
     memcpy(out, data, static_cast<size_t>(bytes_to_decode));
+#else
+    // On big-endian systems, we need to byte-swap each value
+    // since Parquet data is stored in little-endian format.
+    // Only apply to integer and floating-point types that have 
FromLittleEndian support.
+    if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t> ||
+                  std::is_same_v<T, int64_t> || std::is_same_v<T, uint64_t> ||
+                  std::is_same_v<T, float> || std::is_same_v<T, double>) {

Review Comment:
   @pitrou..
   Just to make the code look better, I have modifed the if condition & the 
corresponding comment, like this below.. 
   I tested this change and it works fine.. I have pushed this change as well.. 
   
   // On big-endian systems, except for bool type, we need to byte-swap each 
value,
   if constexpr (std::is_arithmetic_v<T> && !(std::is_same_v<T, bool>)) {



##########
cpp/src/parquet/encoder.cc:
##########
@@ -201,7 +202,27 @@ class PlainEncoder : public EncoderImpl, virtual public 
TypedEncoder<DType> {
 template <typename DType>
 void PlainEncoder<DType>::Put(const T* buffer, int num_values) {
   if (num_values > 0) {
+#if ARROW_LITTLE_ENDIAN
     PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
+#else
+    // On big-endian systems, we need to byte-swap each value
+    // since Parquet data must be stored in little-endian format.
+    if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t> ||
+                  std::is_same_v<T, int64_t> || std::is_same_v<T, uint64_t> ||
+                  std::is_same_v<T, float> || std::is_same_v<T, double>) {
+      PARQUET_ASSIGN_OR_THROW(
+          auto temp_buffer,
+          ::arrow::AllocateBuffer(num_values * sizeof(T), 
this->memory_pool()));

Review Comment:
   @pitrou.. Thanks for your comments.. I have modified the code, tested it and 
pushed it to upstream.. Thanks.. 



##########
cpp/src/parquet/encoder.cc:
##########
@@ -964,10 +1033,22 @@ class ByteStreamSplitEncoder<FLBAType> : public 
ByteStreamSplitEncoderBase<FLBAT
     if (byte_width_ > 0) {
       const int64_t total_bytes = static_cast<int64_t>(num_values) * 
byte_width_;
       PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes));
+#if !ARROW_LITTLE_ENDIAN
+      // On big-endian, reverse bytes before encoding to compensate for
+      // DoSplitStreams reversal, ensuring FLBA bytes are preserved as-is

Review Comment:
   @pitrou.. I will take care of this change in the next pass. Thanks..



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-48202: [C++][Parquet] Fix encoder & decoder logic to enable Parque… [arrow]

Reply via email to