AntoinePrv commented on code in PR #47896:
URL: https://github.com/apache/arrow/pull/47896#discussion_r2472219047
##########
cpp/src/arrow/util/bpacking_dispatch_internal.h:
##########
@@ -59,292 +168,317 @@ int unpack_full(const uint8_t* in, Uint* out, int batch_size) {
/// @tparam UnpackedUInt The type in which we unpack the values.
template <int kPackedBitWidth, template <typename, int> typename Unpacker,
typename UnpackedUInt>
-int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
- using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
+void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_offset) {
+ if constexpr (kPackedBitWidth == 0) {
+    // Easy case: simply set the memory to zero.
+ return unpack_null(in, out, batch_size);
+ } else {
+ // In case of misalignment, we need to run the prolog until aligned.
+    int extracted = unpack_exact<kPackedBitWidth, true>(in, out, batch_size, bit_offset);
+    // We either extracted everything or reached byte alignment.
+ const int start_bit = extracted * kPackedBitWidth + bit_offset;
+    ARROW_DCHECK((extracted == batch_size) || (start_bit % 8 == 0));
+ batch_size -= extracted;
+ ARROW_DCHECK_GE(batch_size, 0);
+ in += start_bit / 8;
+ out += extracted;
- constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
- batch_size = batch_size / kValuesUnpacked * kValuesUnpacked;
- int num_loops = batch_size / kValuesUnpacked;
+ if constexpr (kPackedBitWidth == 8 * sizeof(UnpackedUInt)) {
+ // Only memcpy / static_cast
+ return unpack_full(in, out, batch_size);
+ } else {
+ using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
+ constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
- for (int i = 0; i < num_loops; ++i) {
- in = UnpackerForWidth::unpack(in, out + i * kValuesUnpacked);
- }
+ // Running the optimized kernel for batch extraction
+ const int unpacker_iter_count = batch_size / kValuesUnpacked;
+ for (int i = 0; i < unpacker_iter_count; ++i) {
+ in = UnpackerForWidth::unpack(in, out);
+ out += kValuesUnpacked;
+ }
+ batch_size -= unpacker_iter_count * kValuesUnpacked;
- return batch_size;
+ // Running the epilog for the remaining values that don't fit in a kernel
+ ARROW_DCHECK_LT(batch_size, kValuesUnpacked);
+ ARROW_DCHECK_GE(batch_size, 0);
+ ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked);
+ ARROW_COMPILER_ASSUME(batch_size >= 0);
+      unpack_exact<kPackedBitWidth, false>(in, out, batch_size, /* bit_offset= */ 0);
Review Comment:
This one I thought about, but it would require passing another parameter to
know where the input buffer ends (regardless of the `batch_size`). We could
also pass where the output buffer ends and unpack directly, skipping the
local buffer entirely. I think we should investigate this and only do it if
there is a measurable benefit. That said, it should come after the (hopefully
upcoming) changes to the SIMD shuffles, because they change the size of the
iterations.
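
For illustration, a minimal sketch of what such a bounds-aware epilog could
look like. The `in_end`/`out_end` parameters and the `unpack_tail` helper are
hypothetical; only `Unpacker`, `kValuesUnpacked`, and `unpack_exact` are taken
from the diff above, and the bit width is assumed to pack a whole number of
bytes per kernel call:

```cpp
// Hypothetical epilog: given the real ends of both buffers, run one more
// full kernel call when the over-read and over-write stay in bounds, and
// only fall back to the scalar path (or a local scratch buffer) otherwise.
template <int kPackedBitWidth, template <typename, int> typename Unpacker,
          typename UnpackedUInt>
void unpack_tail(const uint8_t* in, const uint8_t* in_end, UnpackedUInt* out,
                 UnpackedUInt* out_end, int batch_size) {
  using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
  constexpr int kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
  // Assumes one kernel call consumes a whole number of bytes.
  constexpr int kBytesPerKernel = kValuesUnpacked * kPackedBitWidth / 8;
  if (batch_size > 0 && in + kBytesPerKernel <= in_end &&
      out + kValuesUnpacked <= out_end) {
    // Both buffers have enough slack: one kernel call covers the tail and
    // the values written past batch_size are harmless.
    UnpackerForWidth::unpack(in, out);
  } else {
    // Not enough slack: unpack value by value, as in the current epilog.
    unpack_exact<kPackedBitWidth, false>(in, out, batch_size, /* bit_offset= */ 0);
  }
}
```

Whether the extra parameters pay off would still need benchmarking, as noted
above.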