This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 1044022d59 GH-48218: [C++][Parquet] Fix Util & Level Conversion logic
on big-endian (#48219)
1044022d59 is described below
commit 1044022d596cc334eb5662959fbea0120590b623
Author: Vishwanatha-HD <[email protected]>
AuthorDate: Mon Nov 24 17:11:49 2025 +0530
GH-48218: [C++][Parquet] Fix Util & Level Conversion logic on big-endian
(#48219)
### Rationale for this change
This PR is intended to enable Parquet support on big-endian (s390x)
systems. It fixes the "util & level_conversion" logic.
### What changes are included in this PR?
The fix includes changes to the following files:
cpp/src/parquet/geospatial/util_internal.cc
cpp/src/parquet/level_conversion_inc.h
cpp/src/parquet/test_util.h
### Are these changes tested?
Yes. The changes were tested on s390x to make sure everything works
correctly. The fix was also tested on x86 to make sure that no new
regression is introduced.
### Are there any user-facing changes?
No.
GitHub main Issue link: https://github.com/apache/arrow/issues/48151
* GitHub Issue: #48218
Authored-by: Vishwanatha-HD <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/geospatial/util_internal.cc | 2 +-
cpp/src/parquet/level_conversion_inc.h | 9 +++++----
cpp/src/parquet/test_util.h | 6 ++++--
3 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/cpp/src/parquet/geospatial/util_internal.cc
b/cpp/src/parquet/geospatial/util_internal.cc
index 4991d58a13..d5c8d66288 100644
--- a/cpp/src/parquet/geospatial/util_internal.cc
+++ b/cpp/src/parquet/geospatial/util_internal.cc
@@ -162,7 +162,7 @@ void
WKBGeometryBounder::MergeGeometry(::arrow::util::span<const uint8_t> bytes_
void WKBGeometryBounder::MergeGeometryInternal(WKBBuffer* src, bool
record_wkb_type) {
uint8_t endian = src->ReadUInt8();
-#if defined(ARROW_LITTLE_ENDIAN)
+#if ARROW_LITTLE_ENDIAN
bool swap = endian != 0x01;
#else
bool swap = endian != 0x00;
diff --git a/cpp/src/parquet/level_conversion_inc.h
b/cpp/src/parquet/level_conversion_inc.h
index 5fce93e779..335f5b9215 100644
--- a/cpp/src/parquet/level_conversion_inc.h
+++ b/cpp/src/parquet/level_conversion_inc.h
@@ -299,14 +299,15 @@ int64_t DefLevelsBatchToBitmap(const int16_t* def_levels,
const int64_t batch_si
ARROW_DCHECK_LE(batch_size, kExtractBitsSize);
// Greater than level_info.def_level - 1 implies >= the def_level
- auto defined_bitmap = static_cast<extract_bitmap_t>(
- internal::GreaterThanBitmap(def_levels, batch_size, level_info.def_level
- 1));
+ auto defined_bitmap =
static_cast<extract_bitmap_t>(::arrow::bit_util::FromLittleEndian(
+ internal::GreaterThanBitmap(def_levels, batch_size, level_info.def_level
- 1)));
if (has_repeated_parent) {
// Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
// repeated_ancestor_def_level
- auto present_bitmap =
static_cast<extract_bitmap_t>(internal::GreaterThanBitmap(
- def_levels, batch_size, level_info.repeated_ancestor_def_level - 1));
+ auto present_bitmap = static_cast<extract_bitmap_t>(
+ ::arrow::bit_util::FromLittleEndian(internal::GreaterThanBitmap(
+ def_levels, batch_size, level_info.repeated_ancestor_def_level -
1)));
auto selected_bits = ExtractBits(defined_bitmap, present_bitmap);
int64_t selected_count = ::arrow::bit_util::PopCount(present_bitmap);
if (ARROW_PREDICT_FALSE(selected_count > upper_bound_remaining)) {
diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h
index 3ed9a1a007..9271dc290c 100644
--- a/cpp/src/parquet/test_util.h
+++ b/cpp/src/parquet/test_util.h
@@ -33,6 +33,7 @@
#include "arrow/extension_type.h"
#include "arrow/io/memory.h"
#include "arrow/testing/util.h"
+#include "arrow/util/endian.h"
#include "arrow/util/float16.h"
#include "parquet/column_page.h"
@@ -319,8 +320,9 @@ class DataPageBuilder {
encoder.Encode(static_cast<int>(levels.size()), levels.data());
int32_t rle_bytes = encoder.len();
+ int32_t rle_bytes_le = ::arrow::bit_util::ToLittleEndian(rle_bytes);
PARQUET_THROW_NOT_OK(
- sink_->Write(reinterpret_cast<const uint8_t*>(&rle_bytes),
sizeof(int32_t)));
+ sink_->Write(reinterpret_cast<const uint8_t*>(&rle_bytes_le),
sizeof(int32_t)));
PARQUET_THROW_NOT_OK(sink_->Write(encode_buffer.data(), rle_bytes));
}
};
@@ -835,7 +837,7 @@ inline void GenerateData<FLBA>(int num_values, FLBA* out,
std::vector<uint8_t>*
// ----------------------------------------------------------------------
// Test utility functions for geometry
-#if defined(ARROW_LITTLE_ENDIAN)
+#if ARROW_LITTLE_ENDIAN
static constexpr uint8_t kWkbNativeEndianness = 0x01;
#else
static constexpr uint8_t kWkbNativeEndianness = 0x00;