pitrou commented on code in PR #47896:
URL: https://github.com/apache/arrow/pull/47896#discussion_r2472287915
##########
cpp/src/arrow/util/bpacking_dispatch_internal.h:
##########
@@ -59,292 +168,317 @@ int unpack_full(const uint8_t* in, Uint* out, int batch_size) {
/// @tparam UnpackedUInt The type in which we unpack the values.
template <int kPackedBitWidth, template <typename, int> typename Unpacker,
          typename UnpackedUInt>
-int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
-  using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
+void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_offset) {
+  if constexpr (kPackedBitWidth == 0) {
+    // Easy case to handle: simply set the memory to zero.
+    return unpack_null(in, out, batch_size);
+  } else {
+    // In case of misalignment, we need to run the prolog until aligned.
+    int extracted = unpack_exact<kPackedBitWidth, true>(in, out, batch_size, bit_offset);
+    // We either extracted everything or reached byte alignment.
+    const int start_bit = extracted * kPackedBitWidth + bit_offset;
+    ARROW_DCHECK((extracted == batch_size) || (start_bit % 8 == 0));
+    batch_size -= extracted;
+    ARROW_DCHECK_GE(batch_size, 0);
+    in += start_bit / 8;
+    out += extracted;
-  constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
-  batch_size = batch_size / kValuesUnpacked * kValuesUnpacked;
-  int num_loops = batch_size / kValuesUnpacked;
+    if constexpr (kPackedBitWidth == 8 * sizeof(UnpackedUInt)) {
+      // Full-width case: a plain memcpy / static_cast suffices.
+      return unpack_full(in, out, batch_size);
+    } else {
+      using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
+      constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
-  for (int i = 0; i < num_loops; ++i) {
-    in = UnpackerForWidth::unpack(in, out + i * kValuesUnpacked);
-  }
+      // Running the optimized kernel for batch extraction.
+      const int unpacker_iter_count = batch_size / kValuesUnpacked;
+      for (int i = 0; i < unpacker_iter_count; ++i) {
+        in = UnpackerForWidth::unpack(in, out);
+        out += kValuesUnpacked;
+      }
+      batch_size -= unpacker_iter_count * kValuesUnpacked;
-  return batch_size;
+      // Running the epilog for the remaining values that don't fit in a kernel.
+      ARROW_DCHECK_LT(batch_size, kValuesUnpacked);
+      ARROW_DCHECK_GE(batch_size, 0);
+      ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked);
+      ARROW_COMPILER_ASSUME(batch_size >= 0);
+      unpack_exact<kPackedBitWidth, false>(in, out, batch_size, /*bit_offset=*/0);
Review Comment:
Well, if we want to use SIMD shuffles maximally, we probably do want to pass the input buffer end. But that can be done later.
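
For illustration, a rough sketch of what passing the buffer end could enable in the epilog. This is hypothetical, not this PR's API: the `in_end` parameter, the `unpack_epilog` helper name, and the scratch-buffer tail copy are invented for the example, and it reuses the PR's `Unpacker` / `unpack_exact` helpers.

```cpp
#include <algorithm>  // std::copy
#include <cstdint>

// Hypothetical sketch: if enough readable bytes remain before `in_end`, the
// SIMD kernel can unpack a full batch and we copy out only the values that
// were actually requested; otherwise fall back to the scalar path.
template <int kPackedBitWidth, template <typename, int> typename Unpacker,
          typename UnpackedUInt>
void unpack_epilog(const uint8_t* in, const uint8_t* in_end, UnpackedUInt* out,
                   int batch_size) {
  using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
  constexpr int kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
  constexpr int kBytesPerIter = kValuesUnpacked * kPackedBitWidth / 8;
  if (in_end - in >= kBytesPerIter) {
    // A full kernel load stays in bounds: unpack into scratch storage and
    // copy only the requested values.
    UnpackedUInt scratch[kValuesUnpacked];
    UnpackerForWidth::unpack(in, scratch);
    std::copy(scratch, scratch + batch_size, out);
  } else {
    // Not enough readable input left for a full kernel load.
    unpack_exact<kPackedBitWidth, false>(in, out, batch_size, /*bit_offset=*/0);
  }
}
```

The trade-off is a scratch-buffer copy on the tail instead of a scalar loop, which only pays off when the kernel is fast enough, so deferring this to a follow-up seems reasonable.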