pitrou commented on code in PR #47896:
URL: https://github.com/apache/arrow/pull/47896#discussion_r2470074936


##########
cpp/src/arrow/util/bpacking_dispatch_internal.h:
##########
@@ -59,292 +168,317 @@ int unpack_full(const uint8_t* in, Uint* out, int 
batch_size) {
 /// @tparam UnpackedUInt The type in which we unpack the values.
 template <int kPackedBitWidth, template <typename, int> typename Unpacker,
           typename UnpackedUInt>
-int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
-  using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
+void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int 
bit_offset) {
+  if constexpr (kPackedBitWidth == 0) {
+    // Easy case to handle, simply setting memory to zero.
+    return unpack_null(in, out, batch_size);
+  } else {
+    // In case of misalignment, we need to run the prolog until aligned.
+    int extracted = unpack_exact<kPackedBitWidth, true>(in, out, batch_size, 
bit_offset);
+    // We either extracted everything or found an alignment
+    const int start_bit = extracted * kPackedBitWidth + bit_offset;
+    ARROW_DCHECK((extracted == batch_size) || ((start_bit) % 8 == 0));
+    batch_size -= extracted;
+    ARROW_DCHECK_GE(batch_size, 0);
+    in += start_bit / 8;
+    out += extracted;
 
-  constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
-  batch_size = batch_size / kValuesUnpacked * kValuesUnpacked;
-  int num_loops = batch_size / kValuesUnpacked;
+    if constexpr (kPackedBitWidth == 8 * sizeof(UnpackedUInt)) {
+      // Only memcpy / static_cast
+      return unpack_full(in, out, batch_size);
+    } else {
+      using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
+      constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
 
-  for (int i = 0; i < num_loops; ++i) {
-    in = UnpackerForWidth::unpack(in, out + i * kValuesUnpacked);
-  }
+      // Running the optimized kernel for batch extraction
+      const int unpacker_iter_count = batch_size / kValuesUnpacked;
+      for (int i = 0; i < unpacker_iter_count; ++i) {
+        in = UnpackerForWidth::unpack(in, out);
+        out += kValuesUnpacked;
+      }
+      batch_size -= unpacker_iter_count * kValuesUnpacked;
 
-  return batch_size;
+      // Running the epilog for the remaining values that don't fit in a kernel
+      ARROW_DCHECK_LT(batch_size, kValuesUnpacked);
+      ARROW_DCHECK_GE(batch_size, 0);
+      ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked);
+      ARROW_COMPILER_ASSUME(batch_size >= 0);
+      unpack_exact<kPackedBitWidth, false>(in, out, batch_size, /* bit_offset= 
*/ 0);

Review Comment:
   Similarly, if there's enough padding at the end of the input, we could 
SIMD-unpack a full `kValuesUnpacked` into a local buffer.



##########
cpp/src/arrow/util/bpacking_dispatch_internal.h:
##########
@@ -59,292 +168,317 @@ int unpack_full(const uint8_t* in, Uint* out, int 
batch_size) {
 /// @tparam UnpackedUInt The type in which we unpack the values.
 template <int kPackedBitWidth, template <typename, int> typename Unpacker,
           typename UnpackedUInt>
-int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
-  using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
+void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int 
bit_offset) {
+  if constexpr (kPackedBitWidth == 0) {
+    // Easy case to handle, simply setting memory to zero.
+    return unpack_null(in, out, batch_size);
+  } else {
+    // In case of misalignment, we need to run the prolog until aligned.
+    int extracted = unpack_exact<kPackedBitWidth, true>(in, out, batch_size, 
bit_offset);
+    // We either extracted everything or found an alignment
+    const int start_bit = extracted * kPackedBitWidth + bit_offset;
+    ARROW_DCHECK((extracted == batch_size) || ((start_bit) % 8 == 0));
+    batch_size -= extracted;
+    ARROW_DCHECK_GE(batch_size, 0);
+    in += start_bit / 8;
+    out += extracted;
 
-  constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
-  batch_size = batch_size / kValuesUnpacked * kValuesUnpacked;
-  int num_loops = batch_size / kValuesUnpacked;
+    if constexpr (kPackedBitWidth == 8 * sizeof(UnpackedUInt)) {
+      // Only memcpy / static_cast
+      return unpack_full(in, out, batch_size);
+    } else {
+      using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
+      constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
 
-  for (int i = 0; i < num_loops; ++i) {
-    in = UnpackerForWidth::unpack(in, out + i * kValuesUnpacked);
-  }
+      // Running the optimized kernel for batch extraction
+      const int unpacker_iter_count = batch_size / kValuesUnpacked;
+      for (int i = 0; i < unpacker_iter_count; ++i) {
+        in = UnpackerForWidth::unpack(in, out);
+        out += kValuesUnpacked;
+      }
+      batch_size -= unpacker_iter_count * kValuesUnpacked;
 
-  return batch_size;
+      // Running the epilog for the remaining values that don't fit in a kernel
+      ARROW_DCHECK_LT(batch_size, kValuesUnpacked);
+      ARROW_DCHECK_GE(batch_size, 0);
+      ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked);
+      ARROW_COMPILER_ASSUME(batch_size >= 0);
+      unpack_exact<kPackedBitWidth, false>(in, out, batch_size, /* bit_offset= 
*/ 0);
+    }
+  }
 }
 
 template <template <typename, int> typename Unpacker, typename UnpackedUint>
-static int unpack_jump(const uint8_t* in, UnpackedUint* out, int batch_size,
-                       int num_bits) {
+static void unpack_jump(const uint8_t* in, UnpackedUint* out, int batch_size,
+                        int num_bits, int bit_offset) {
   if constexpr (std::is_same_v<UnpackedUint, bool>) {
     switch (num_bits) {
       case 0:
-        return unpack_null(in, out, batch_size);
+        return unpack_width<0, Unpacker>(in, out, batch_size, bit_offset);

Review Comment:
   Ok, macros are not pretty, but we _could_ have a macro here to minimize 
diffs when changing these function signatures :-)
   
   Such as:
   ```c++
   #define CASE_UNPACK_WIDTH(_width) \
           return unpack_width<_width, Unpacker>(in, out, batch_size, 
bit_offset)
   
     if constexpr (std::is_same_v<UnpackedUint, bool>) {
       switch (num_bits) {
         case 0:
           CASE_UNPACK_WIDTH(0);
     // etc.
   
   #undef CASE_UNPACK_WIDTH
   ```
   
   As you prefer, though.



##########
cpp/src/arrow/util/bpacking_test.cc:
##########
@@ -35,124 +34,112 @@
 namespace arrow::internal {
 
 template <typename Int>
-using UnpackFunc = int (*)(const uint8_t*, Int*, int, int);
+using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
 
 /// Get the number of bytes associated with a packing.
-Result<int32_t> GetNumBytes(int32_t num_values, int32_t bit_width) {
-  const auto num_bits = num_values * bit_width;
-  if (num_bits % 8 != 0) {
-    return Status::NotImplemented(
-        "The unpack functions only work on a multiple of 8 bits.");
-  }
-  return num_bits / 8;
+int GetNumBytes(int num_values, int bit_width, int bit_offset) {
+  return static_cast<int>(bit_util::BytesForBits(num_values * bit_width + 
bit_offset));
 }
 
-/// Generate random bytes as packed integers.
-std::vector<uint8_t> GenerateRandomPackedValues(int32_t num_values, int32_t 
bit_width) {
+/// Generate random values that can be packed within the given bit width.
+template <typename Uint>
+std::vector<Uint> GenerateRandomValuesForPacking(int num_values, int 
bit_width) {
   constexpr uint32_t kSeed = 3214;
-  EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, 
bit_width));
 
-  std::vector<uint8_t> out(std::max(1, num_bytes));  // We need a valid 
pointer for size 0
-  random_bytes(num_bytes, kSeed, out.data());
+  num_values = std::max(1, num_values);  // We need a valid pointer for size 0
+  std::vector<Uint> out(num_values);
+
+  if (bit_width == 0) {
+    return out;
+  }
 
+  if constexpr (std::is_same_v<Uint, bool>) {
+    random_is_valid(num_values, 0.5, &out, kSeed);
+  } else {
+    const uint64_t max = (uint64_t{1} << (static_cast<uint64_t>(bit_width) - 
1)) - 1;

Review Comment:
   Shouldn't this be
   ```suggestion
       const uint64_t max = (uint64_t{1} << static_cast<uint64_t>(bit_width)) - 
1;
   ```
   
   (e.g. you want `2**10 - 1` for 10-bit packing, not `2**9 - 1`)
   



##########
cpp/src/arrow/util/bpacking_dispatch_internal.h:
##########
@@ -59,292 +168,317 @@ int unpack_full(const uint8_t* in, Uint* out, int 
batch_size) {
 /// @tparam UnpackedUInt The type in which we unpack the values.
 template <int kPackedBitWidth, template <typename, int> typename Unpacker,
           typename UnpackedUInt>
-int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
-  using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
+void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int 
bit_offset) {
+  if constexpr (kPackedBitWidth == 0) {
+    // Easy case to handle, simply setting memory to zero.
+    return unpack_null(in, out, batch_size);
+  } else {
+    // In case of misalignment, we need to run the prolog until aligned.

Review Comment:
   As a TODO, if `batch_size` is large enough, we can perhaps rewind to the 
last byte-aligned packed value and SIMD-unpack `kValuesUnpacked` into a local buffer, 
instead of going through `unpack_exact`.
   
   (this seems lower-priority than SIMD shuffling, though)



##########
cpp/src/arrow/util/bpacking_test.cc:
##########
@@ -35,124 +34,112 @@
 namespace arrow::internal {
 
 template <typename Int>
-using UnpackFunc = int (*)(const uint8_t*, Int*, int, int);
+using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
 
 /// Get the number of bytes associated with a packing.
-Result<int32_t> GetNumBytes(int32_t num_values, int32_t bit_width) {
-  const auto num_bits = num_values * bit_width;
-  if (num_bits % 8 != 0) {
-    return Status::NotImplemented(
-        "The unpack functions only work on a multiple of 8 bits.");
-  }
-  return num_bits / 8;
+int GetNumBytes(int num_values, int bit_width, int bit_offset) {
+  return static_cast<int>(bit_util::BytesForBits(num_values * bit_width + 
bit_offset));
 }
 
-/// Generate random bytes as packed integers.
-std::vector<uint8_t> GenerateRandomPackedValues(int32_t num_values, int32_t 
bit_width) {
+/// Generate random values that can be packed within the given bit width.
+template <typename Uint>
+std::vector<Uint> GenerateRandomValuesForPacking(int num_values, int 
bit_width) {
   constexpr uint32_t kSeed = 3214;
-  EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, 
bit_width));
 
-  std::vector<uint8_t> out(std::max(1, num_bytes));  // We need a valid 
pointer for size 0
-  random_bytes(num_bytes, kSeed, out.data());
+  num_values = std::max(1, num_values);  // We need a valid pointer for size 0
+  std::vector<Uint> out(num_values);
+
+  if (bit_width == 0) {
+    return out;
+  }
 
+  if constexpr (std::is_same_v<Uint, bool>) {
+    random_is_valid(num_values, 0.5, &out, kSeed);
+  } else {
+    const uint64_t max = (uint64_t{1} << (static_cast<uint64_t>(bit_width) - 
1)) - 1;
+    rand_uniform_int(out.size(), kSeed, /* min= */ decltype(max){0}, max, 
out.data());
+  }
   return out;
 }
 
 /// Convenience wrapper to unpack into a vector
 template <typename Int>
 std::vector<Int> UnpackValues(const uint8_t* packed, int32_t num_values,
-                              int32_t bit_width, UnpackFunc<Int> unpack) {
-  // Using dynamic array to avoid std::vector<bool>
-  auto buffer = std::make_unique<Int[]>(num_values);
-  int values_read = unpack(packed, buffer.get(), num_values, bit_width);
-  ARROW_DCHECK_GE(values_read, 0);
-  EXPECT_LE(values_read, num_values);
-
-  return std::vector<Int>(buffer.get(), buffer.get() + values_read);
+                              int32_t bit_width, int32_t bit_offset,
+                              UnpackFunc<Int> unpack) {
+  if constexpr (std::is_same_v<Int, bool>) {
+    // Using dynamic array to avoid std::vector<bool>
+    auto buffer = std::make_unique<Int[]>(num_values);
+    unpack(packed, buffer.get(), num_values, bit_width, bit_offset);
+    return std::vector<Int>(buffer.get(), buffer.get() + num_values);
+  } else {
+    std::vector<Int> out(num_values);
+    unpack(packed, out.data(), num_values, bit_width, bit_offset);
+    return out;
+  }
 }
 
 /// Use BitWriter to pack values into a vector.
 template <typename Int>
-std::vector<uint8_t> PackValues(const std::vector<Int>& values, int32_t 
num_values,
-                                int32_t bit_width) {
-  EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, 
bit_width));
+std::vector<uint8_t> PackValues(const std::vector<Int>& values, int num_values,
+                                int bit_width, int bit_offset) {
+  if (bit_width == 0) {
+    return {};
+  }
+
+  const auto num_bytes = GetNumBytes(num_values, bit_width, bit_offset);
 
   std::vector<uint8_t> out(static_cast<std::size_t>(num_bytes));
   bit_util::BitWriter writer(out.data(), num_bytes);
-  for (const auto& v : values) {
-    bool written = writer.PutValue(v, bit_width);
-    if (!written) {
-      throw std::runtime_error("Cannot write move values");
-    }
-  }
 
-  return out;
-}
-
-template <typename Int>
-void CheckUnpackPackRoundtrip(const uint8_t* packed, int32_t num_values,
-                              int32_t bit_width, UnpackFunc<Int> unpack) {
-  EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, 
bit_width));
-
-  const auto unpacked = UnpackValues(packed, num_values, bit_width, unpack);
-  EXPECT_EQ(unpacked.size(), num_values);
-  const auto roundtrip = PackValues(unpacked, num_values, bit_width);
-  EXPECT_EQ(num_bytes, roundtrip.size());
-  for (int i = 0; i < num_bytes; ++i) {
-    EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i;
+  // Write a first 0 value to make an offset
+  bool written = writer.PutValue(0, bit_offset);
+  for (const auto& v : values) {
+    written &= writer.PutValue(v, bit_width);
   }
-}
-
-const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) {
-  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
 
-  if (addr % alignment == 0) {
-    return ptr;
+  if (!written) {
+    throw std::runtime_error("Cannot write move values");
   }

Review Comment:
   Let's avoid exceptions; you could, for example, have this function return a 
`Result<std::vector<uint8_t>>`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to