This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 9e177dc ARROW-4541: [C++] Support slices for SumKernel
9e177dc is described below
commit 9e177dc4214685a78a00170bfb37a34d6dd37007
Author: François Saint-Jacques <[email protected]>
AuthorDate: Mon Feb 18 20:22:16 2019 -0600
ARROW-4541: [C++] Support slices for SumKernel
The original implementation assumed that the bitmap buffer was byte
aligned. This commit removes this requirement. The benchmark does not show any
regression, the main loop in ConsumeSparse is properly inlined.
Author: François Saint-Jacques <[email protected]>
Closes #3661 from fsaintjacques/ARROW-4531-fix-sum-slice and squashes the
following commits:
262df4f1c <François Saint-Jacques> Fix bad merge.
d484ccacf <François Saint-Jacques> ARROW-4541: Support all slices for
SumKernel
---
cpp/src/arrow/compute/kernels/aggregate-test.cc | 37 +++++-
cpp/src/arrow/compute/kernels/sum.cc | 144 +++++++++++++++++-------
cpp/src/arrow/util/bit-util.h | 1 +
3 files changed, 139 insertions(+), 43 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/aggregate-test.cc
b/cpp/src/arrow/compute/kernels/aggregate-test.cc
index d522e1b..ca44744 100644
--- a/cpp/src/arrow/compute/kernels/aggregate-test.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate-test.cc
@@ -96,14 +96,17 @@ static Datum DummySum(const Array& array) {
typename SumType::c_type sum = 0;
int64_t count = 0;
+ auto data = array.data();
+ internal::BitmapReader reader(array.null_bitmap_data(), array.offset(),
array.length());
const auto& array_numeric = reinterpret_cast<const ArrayType&>(array);
const auto values = array_numeric.raw_values();
- const auto bitmap = array.null_bitmap_data();
for (int64_t i = 0; i < array.length(); i++) {
- if (BitUtil::GetBit(bitmap, i)) {
+ if (reader.IsSet()) {
sum += values[i];
count++;
}
+
+ reader.Next();
}
if (count > 0) {
@@ -130,6 +133,9 @@ TYPED_TEST(TestSumKernelNumeric, SimpleSum) {
ValidateSum<TypeParam>(&this->ctx_, "[]",
Datum(std::make_shared<ScalarType>(0, false)));
+ ValidateSum<TypeParam>(&this->ctx_, "[null]",
+ Datum(std::make_shared<ScalarType>(0, false)));
+
ValidateSum<TypeParam>(&this->ctx_, "[0, 1, 2, 3, 4, 5]",
Datum(std::make_shared<ScalarType>(static_cast<T>(5 *
6 / 2))));
@@ -144,10 +150,10 @@ class TestRandomSumKernelNumeric : public ComputeFixture,
public TestBase {};
TYPED_TEST_CASE(TestRandomSumKernelNumeric, NumericArrowTypes);
TYPED_TEST(TestRandomSumKernelNumeric, RandomArraySum) {
auto rand = random::RandomArrayGenerator(0x5487655);
- for (size_t i = 5; i < 14; i++) {
+ for (size_t i = 3; i < 14; i++) {
for (auto null_probability : {0.0, 0.01, 0.1, 0.25, 0.5, 1.0}) {
- for (auto length_offset : {-2, -1, 0, 1, 2}) {
- int64_t length = (1UL << i) + length_offset;
+ for (auto length_adjust : {-2, -1, 0, 1, 2}) {
+ int64_t length = (1UL << i) + length_adjust;
auto array = rand.Numeric<TypeParam>(length, 0, 100, null_probability);
ValidateSum<TypeParam>(&this->ctx_, *array);
}
@@ -155,5 +161,26 @@ TYPED_TEST(TestRandomSumKernelNumeric, RandomArraySum) {
}
}
+TYPED_TEST(TestRandomSumKernelNumeric, RandomSliceArraySum) {
+ auto arithmetic = ArrayFromJSON(TypeTraits<TypeParam>::type_singleton(),
+ "[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]");
+ ValidateSum<TypeParam>(&this->ctx_, *arithmetic);
+ for (size_t i = 1; i < 15; i++) {
+ auto slice = arithmetic->Slice(i, 16);
+ ValidateSum<TypeParam>(&this->ctx_, *slice);
+ }
+
+ // Trigger ConsumeSparse with different slice offsets.
+ auto rand = random::RandomArrayGenerator(0xfa432643);
+ const int64_t length = 1U << 6;
+ auto array = rand.Numeric<TypeParam>(length, 0, 10, 0.5);
+ for (size_t i = 1; i < 16; i++) {
+ for (size_t j = 1; j < 16; j++) {
+ auto slice = array->Slice(i, length - j);
+ ValidateSum<TypeParam>(&this->ctx_, *slice);
+ }
+ }
+}
+
} // namespace compute
} // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/sum.cc
b/cpp/src/arrow/compute/kernels/sum.cc
index a1487c1..1799941 100644
--- a/cpp/src/arrow/compute/kernels/sum.cc
+++ b/cpp/src/arrow/compute/kernels/sum.cc
@@ -39,7 +39,7 @@ struct SumState {
ThisType& operator+=(const ThisType& rhs) {
this->count += rhs.count;
- this->sum += this->sum;
+ this->sum += rhs.sum;
return *this;
}
@@ -53,19 +53,38 @@ struct SumState {
typename SumType::c_type sum = 0;
};
+constexpr int64_t CoveringBytes(int64_t offset, int64_t length) {
+ return (BitUtil::RoundUp(length + offset, 8) - BitUtil::RoundDown(offset,
8)) / 8;
+}
+
+static_assert(CoveringBytes(0, 8) == 1, "");
+static_assert(CoveringBytes(0, 9) == 2, "");
+static_assert(CoveringBytes(1, 7) == 1, "");
+static_assert(CoveringBytes(1, 8) == 2, "");
+static_assert(CoveringBytes(2, 19) == 3, "");
+static_assert(CoveringBytes(7, 18) == 4, "");
+
template <typename ArrowType, typename StateType = SumState<ArrowType>>
class SumAggregateFunction final : public
AggregateFunctionStaticState<StateType> {
using CType = typename TypeTraits<ArrowType>::CType;
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+ static constexpr int64_t kTinyThreshold = 32;
+ static_assert(kTinyThreshold > 18,
+ "ConsumeSparse requires at least 18 elements to fit 3 bytes");
+
public:
Status Consume(const Array& input, StateType* state) const override {
const ArrayType& array = static_cast<const ArrayType&>(input);
- if (input.null_count() > 0) {
- *state = ConsumeSparse(array);
- } else {
+ if (input.null_count() == 0) {
*state = ConsumeDense(array);
+ } else if (input.length() <= kTinyThreshold) {
+ // In order to simplify ConsumeSparse implementation (requires at least 3
+ // bytes of bitmap data), small arrays are handled differently.
+ *state = ConsumeTiny(array);
+ } else {
+ *state = ConsumeSparse(array);
}
return Status::OK();
@@ -95,57 +114,106 @@ class SumAggregateFunction final : public
AggregateFunctionStaticState<StateType
StateType local;
const auto values = array.raw_values();
- for (int64_t i = 0; i < array.length(); i++) {
+ const int64_t length = array.length();
+ for (int64_t i = 0; i < length; i++) {
local.sum += values[i];
}
- local.count = array.length();
+ local.count = length;
return local;
}
- StateType ConsumeSparse(const ArrayType& array) const {
+ StateType ConsumeTiny(const ArrayType& array) const {
StateType local;
- // TODO(fsaintjacques): This fails on slice not byte-aligned.
- DCHECK_EQ(array.offset() % 8, 0);
-
+ internal::BitmapReader reader(array.null_bitmap_data(), array.offset(),
+ array.length());
const auto values = array.raw_values();
- const auto bitmap = array.null_bitmap_data() +
BitUtil::RoundDown(array.offset(), 8);
- const auto length = array.length();
- const auto length_rounded = BitUtil::RoundDown(length, 8);
-
- for (int64_t i = 0; i < length_rounded; i += 8) {
- const uint8_t valid_byte = bitmap[i / 8];
- if (valid_byte < 0xFF) {
-#define SUM_SHIFT(ITEM) \
- static_cast<CType>(values[i + ITEM] * static_cast<CType>(((valid_byte >>
ITEM) & 1U)))
- // Some nulls
- local.sum += SUM_SHIFT(0);
- local.sum += SUM_SHIFT(1);
- local.sum += SUM_SHIFT(2);
- local.sum += SUM_SHIFT(3);
- local.sum += SUM_SHIFT(4);
- local.sum += SUM_SHIFT(5);
- local.sum += SUM_SHIFT(6);
- local.sum += SUM_SHIFT(7);
- local.count += BitUtil::kBytePopcount[valid_byte];
-#undef SUM_SHIFT
- } else {
- // No nulls
- local.sum += values[i + 0] + values[i + 1] + values[i + 2] + values[i
+ 3] +
- values[i + 4] + values[i + 5] + values[i + 6] + values[i
+ 7];
- local.count += 8;
+ for (int64_t i = 0; i < array.length(); i++) {
+ if (reader.IsSet()) {
+ local.sum += values[i];
+ local.count++;
}
+ reader.Next();
}
- for (int64_t i = length_rounded; i < length; ++i) {
- if (BitUtil::GetBit(bitmap, i)) {
+ return local;
+ }
+
+ inline StateType UnrolledSum(uint8_t bits, const CType* values) const {
+ StateType local;
+
+ if (bits < 0xFF) {
+#define SUM_SHIFT(ITEM) values[ITEM] * static_cast<CType>(((bits >> ITEM) &
1U))
+ // Some nulls
+ local.sum += SUM_SHIFT(0);
+ local.sum += SUM_SHIFT(1);
+ local.sum += SUM_SHIFT(2);
+ local.sum += SUM_SHIFT(3);
+ local.sum += SUM_SHIFT(4);
+ local.sum += SUM_SHIFT(5);
+ local.sum += SUM_SHIFT(6);
+ local.sum += SUM_SHIFT(7);
+ local.count += BitUtil::kBytePopcount[bits];
+#undef SUM_SHIFT
+ } else {
+ // No nulls
+ for (size_t i = 0; i < 8; i++) {
local.sum += values[i];
- local.count++;
}
+ local.count += 8;
+ }
+
+ return local;
+ }
+
+ StateType ConsumeSparse(const ArrayType& array) const {
+ StateType local;
+
+ // Sliced bitmaps on non-byte positions induce problem with the branchless
+ // unrolled technique. Thus extra padding is added on both left and right
+ // side of the slice such that both ends are byte-aligned. The first and
+ // last bitmap bytes are properly masked to ignore extra values induced by
+ // padding.
+ //
+ // The execution is divided in 3 sections.
+ //
+ // 1. Compute the sum of the first masked byte.
+ // 2. Compute the sum of the middle bytes
+ // 3. Compute the sum of the last masked byte.
+
+ const int64_t length = array.length();
+ const int64_t offset = array.offset();
+
+ // The number of bytes covering the range, this includes partial bytes.
+ // This number is bounded by `<= (length / 8) + 2`, i.e. a possible extra byte
+ // on the left, and on the right.
+ const int64_t covering_bytes = CoveringBytes(offset, length);
+
+ // Align values to the first batch of 8 elements. Note that raw_values() is
+ // already adjusted with the offset, thus we rewind a little to align to
+ // the closest 8-batch offset.
+ const auto values = array.raw_values() - (offset % 8);
+
+ // Align bitmap at the first consumable byte.
+ const auto bitmap = array.null_bitmap_data() + BitUtil::RoundDown(offset,
8) / 8;
+
+ // Consume the first (potentially partial) byte.
+ const uint8_t first_mask = BitUtil::kTrailingBitmask[offset % 8];
+ local += UnrolledSum(bitmap[0] & first_mask, values);
+
+ // Consume the (full) middle bytes. The loop iterates in unit of
+ // batches of 8 values and 1 byte of bitmap.
+ for (int64_t i = 1; i < covering_bytes - 1; i++) {
+ local += UnrolledSum(bitmap[i], &values[i * 8]);
}
+ // Consume the last (potentially partial) byte.
+ const int64_t last_idx = covering_bytes - 1;
+ const uint8_t last_mask = BitUtil::kPrecedingWrappingBitmask[(offset +
length) % 8];
+ local += UnrolledSum(bitmap[last_idx] & last_mask, &values[last_idx * 8]);
+
return local;
}
};
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index 888b79d..53b5588 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -378,6 +378,7 @@ static constexpr uint8_t kFlippedBitmask[] = {254, 253,
251, 247, 239, 223, 191,
// Bitmask selecting the (k - 1) preceding bits in a byte
static constexpr uint8_t kPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127};
+static constexpr uint8_t kPrecedingWrappingBitmask[] = {255, 1, 3, 7, 15, 31,
63, 127};
// the bitwise complement version of kPrecedingBitmask
static constexpr uint8_t kTrailingBitmask[] = {255, 254, 252, 248, 240, 224,
192, 128};