pitrou commented on code in PR #37536:
URL: https://github.com/apache/arrow/pull/37536#discussion_r1315085920
##########
cpp/src/arrow/compute/kernels/aggregate_internal.h:
##########
@@ -221,27 +221,34 @@ enable_if_t<std::is_floating_point<SumType>::value,
SumType> SumArray(
}
// naive summation for integers and decimals
-template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
+template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
bool Checked,
typename ValueFunc>
enable_if_t<!std::is_floating_point<SumType>::value, SumType> SumArray(
- const ArraySpan& data, ValueFunc&& func) {
+ const ArraySpan& data, ValueFunc&& func, Status* status) {
using arrow::internal::VisitSetBitRunsVoid;
SumType sum = 0;
const ValueType* values = data.GetValues<ValueType>(1);
- VisitSetBitRunsVoid(data.buffers[0].data, data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- for (int64_t i = 0; i < len; ++i) {
- sum += func(values[pos + i]);
- }
- });
+ VisitSetBitRunsVoid(
+ data.buffers[0].data, data.offset, data.length, [&](int64_t pos, int64_t
len) {
+ for (int64_t i = 0; i < len; ++i) {
+ if constexpr (Checked) {
+ sum = AddChecked::Call<SumType>(nullptr, sum, func(values[pos +
i]), status);
+ if (ARROW_PREDICT_FALSE(!status->ok())) {
Review Comment:
Returning here seems a bit pointless as the next bit runs will still be read
anyway.
Instead, perhaps you should move the `Checked` condition at the function
toplevel and use `VisitSetBitRuns` (non-void) if checked summation is requested.
##########
cpp/src/arrow/compute/kernels/aggregate_basic_internal.h:
##########
@@ -176,11 +202,11 @@ struct MeanImpl;
template <typename ArrowType, SimdLevel::type SimdLevel>
struct MeanImpl<ArrowType, SimdLevel, enable_if_decimal<ArrowType>>
- : public SumImpl<ArrowType, SimdLevel> {
- using SumImpl<ArrowType, SimdLevel>::SumImpl;
- using SumImpl<ArrowType, SimdLevel>::options;
- using SumCType = typename SumImpl<ArrowType, SimdLevel>::SumCType;
- using OutputType = typename SumImpl<ArrowType, SimdLevel>::OutputType;
+ : public SumImpl<ArrowType, SimdLevel, false> {
Review Comment:
Suggesting, for here and other similar places: make it clear what the
boolean parameter selects
```suggestion
: public SumImpl<ArrowType, SimdLevel, /*Checked=*/ false> {
```
##########
cpp/src/arrow/compute/kernels/aggregate_var_std.cc:
##########
@@ -60,16 +60,19 @@ struct VarStdState {
if (count == 0 || (!this->all_valid && !options.skip_nulls)) {
return;
}
-
+ Status status;
using SumType = typename internal::GetSumType<T>::SumType;
- SumType sum = internal::SumArray<CType, SumType, SimdLevel::NONE>(array);
+ SumType sum =
+ internal::SumArray<CType, SumType, SimdLevel::NONE, false>(array,
&status);
const double mean = ToDouble(sum) / count;
- const double m2 = internal::SumArray<CType, double, SimdLevel::NONE>(
- array, [this, mean](CType value) {
+ const double m2 = internal::SumArray<CType, double, SimdLevel::NONE,
false>(
+ array,
+ [this, mean](CType value) {
const double v = ToDouble(value);
return (v - mean) * (v - mean);
- });
+ },
+ &status);
Review Comment:
Let's make sure we don't let an error slip here? (it shouldn't happen now
but perhaps it will at some point?)
```suggestion
&status);
ARROW_CHECK_OK(status);
```
##########
cpp/src/arrow/compute/kernels/aggregate_basic_internal.h:
##########
@@ -49,20 +50,22 @@ void AddMinMaxKernel(KernelInit init,
internal::detail::GetTypeId get_id,
// SIMD variants for kernels
void AddSumAvx2AggKernels(ScalarAggregateFunction* func);
+void AddSumCheckedAvx2AggKernels(ScalarAggregateFunction* func);
void AddMeanAvx2AggKernels(ScalarAggregateFunction* func);
void AddMinMaxAvx2AggKernels(ScalarAggregateFunction* func);
void AddSumAvx512AggKernels(ScalarAggregateFunction* func);
+void AddSumCheckedAvx512AggKernels(ScalarAggregateFunction* func);
void AddMeanAvx512AggKernels(ScalarAggregateFunction* func);
void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func);
// ----------------------------------------------------------------------
// Sum implementation
-template <typename ArrowType, SimdLevel::type SimdLevel,
+template <typename ArrowType, SimdLevel::type SimdLevel, bool Checked,
Review Comment:
TBH, I'm not sure `Checked` and `SimdLevel` need to be template parameters
at this level. `SumArray` is performance critical, but the orchestration in
`SumImpl` probably much less so.
Not making those template parameters may save a bit on code generation and
compilation times. Perhaps open an issue about that?
##########
cpp/src/arrow/compute/api_aggregate.h:
##########
@@ -284,6 +284,21 @@ Result<Datum> Sum(
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
ExecContext* ctx = NULLPTR);
+/// \brief Sum values of a numeric array. Return error if overflow occurs
+///
+/// \param[in] value datum to sum, expecting Array or ChunkedArray
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed sum as a Scalar
+///
+/// \since 1.0.0
Review Comment:
We should not blindly copy this from another docstring :-) The next version
to be released is 14.0.0.
##########
cpp/src/arrow/compute/kernels/aggregate_test.cc:
##########
@@ -592,25 +652,51 @@ TEST(TestNullSumKernel, Basics) {
Datum null_result = std::make_shared<Int64Scalar>();
Datum zero_result = std::make_shared<Int64Scalar>(0);
- EXPECT_THAT(Sum(ScalarFromJSON(ty, "null")), ResultWith(null_result));
- EXPECT_THAT(Sum(ArrayFromJSON(ty, "[]")), ResultWith(null_result));
- EXPECT_THAT(Sum(ArrayFromJSON(ty, "[null]")), ResultWith(null_result));
- EXPECT_THAT(Sum(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, null]"})),
- ResultWith(null_result));
+ for (auto func : {Sum, SumChecked}) {
+ SCOPED_TRACE(func);
+ auto default_options = ScalarAggregateOptions::Defaults();
+ EXPECT_THAT(func(ScalarFromJSON(ty, "null"), default_options, nullptr),
+ ResultWith(null_result));
+ EXPECT_THAT(func(ArrayFromJSON(ty, "[]"), default_options, nullptr),
+ ResultWith(null_result));
+ EXPECT_THAT(func(ArrayFromJSON(ty, "[null]"), default_options, nullptr),
+ ResultWith(null_result));
+ EXPECT_THAT(func(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null,
null]"}),
+ default_options, nullptr),
+ ResultWith(null_result));
- ScalarAggregateOptions options(/*skip_nulls=*/true, /*min_count=*/0);
- EXPECT_THAT(Sum(ScalarFromJSON(ty, "null"), options),
ResultWith(zero_result));
- EXPECT_THAT(Sum(ArrayFromJSON(ty, "[]"), options), ResultWith(zero_result));
- EXPECT_THAT(Sum(ArrayFromJSON(ty, "[null]"), options),
ResultWith(zero_result));
- EXPECT_THAT(Sum(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, null]"}),
options),
- ResultWith(zero_result));
+ ScalarAggregateOptions options(/*skip_nulls=*/true, /*min_count=*/0);
+ EXPECT_THAT(func(ScalarFromJSON(ty, "null"), options, nullptr),
+ ResultWith(zero_result));
+ EXPECT_THAT(func(ArrayFromJSON(ty, "[]"), options, nullptr),
ResultWith(zero_result));
+ EXPECT_THAT(func(ArrayFromJSON(ty, "[null]"), options, nullptr),
+ ResultWith(zero_result));
+ EXPECT_THAT(func(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null,
null]"}), options,
+ nullptr),
+ ResultWith(zero_result));
+
+ options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/0);
+ EXPECT_THAT(func(ScalarFromJSON(ty, "null"), options, nullptr),
+ ResultWith(null_result));
+ EXPECT_THAT(func(ArrayFromJSON(ty, "[]"), options, nullptr),
ResultWith(zero_result));
+ EXPECT_THAT(func(ArrayFromJSON(ty, "[null]"), options, nullptr),
+ ResultWith(null_result));
+ EXPECT_THAT(func(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null,
null]"}), options,
+ nullptr),
+ ResultWith(null_result));
+ }
+}
- options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/0);
- EXPECT_THAT(Sum(ScalarFromJSON(ty, "null"), options),
ResultWith(null_result));
- EXPECT_THAT(Sum(ArrayFromJSON(ty, "[]"), options), ResultWith(zero_result));
- EXPECT_THAT(Sum(ArrayFromJSON(ty, "[null]"), options),
ResultWith(null_result));
- EXPECT_THAT(Sum(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, null]"}),
options),
- ResultWith(null_result));
+TEST(TestSumKernel, Overflow) {
+ int64_t large_scalar = 1000000000000000;
+ int64_t length = 10000;
+ auto scalar = std::make_shared<Int64Scalar>(large_scalar);
+ auto array = MakeArrayFromScalar(*scalar, length);
+
+#ifndef ARROW_UBSAN
Review Comment:
I think you can remove this condition if you use a `uint64` instead, as
unsigned overflow doesn't trigger UB.
##########
cpp/src/arrow/compute/kernels/aggregate_test.cc:
##########
@@ -360,6 +380,10 @@ TYPED_TEST(TestNumericSumKernel, SimpleSum) {
ResultWith(Datum(std::make_shared<ScalarType>(static_cast<T>(5)))));
EXPECT_THAT(Sum(MakeNullScalar(TypeTraits<TypeParam>::type_singleton())),
ResultWith(Datum(MakeNullScalar(TypeTraits<SumType>::type_singleton()))));
+
EXPECT_THAT(SumChecked(Datum(std::make_shared<InputScalarType>(static_cast<T>(5)))),
+
ResultWith(Datum(std::make_shared<ScalarType>(static_cast<T>(5)))));
+
EXPECT_THAT(SumChecked(MakeNullScalar(TypeTraits<TypeParam>::type_singleton())),
+
ResultWith(Datum(MakeNullScalar(TypeTraits<SumType>::type_singleton()))));
Review Comment:
Not sure that would work, but instead of repeating these lines, perhaps you
can iterate over the sum functions:
```c++
for (auto&& sum_func : {Sum, SumChecked}) ...
```
##########
cpp/src/arrow/compute/kernels/aggregate_test.cc:
##########
@@ -592,25 +652,51 @@ TEST(TestNullSumKernel, Basics) {
Datum null_result = std::make_shared<Int64Scalar>();
Datum zero_result = std::make_shared<Int64Scalar>(0);
- EXPECT_THAT(Sum(ScalarFromJSON(ty, "null")), ResultWith(null_result));
- EXPECT_THAT(Sum(ArrayFromJSON(ty, "[]")), ResultWith(null_result));
- EXPECT_THAT(Sum(ArrayFromJSON(ty, "[null]")), ResultWith(null_result));
- EXPECT_THAT(Sum(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, null]"})),
- ResultWith(null_result));
+ for (auto func : {Sum, SumChecked}) {
+ SCOPED_TRACE(func);
+ auto default_options = ScalarAggregateOptions::Defaults();
+ EXPECT_THAT(func(ScalarFromJSON(ty, "null"), default_options, nullptr),
+ ResultWith(null_result));
+ EXPECT_THAT(func(ArrayFromJSON(ty, "[]"), default_options, nullptr),
+ ResultWith(null_result));
+ EXPECT_THAT(func(ArrayFromJSON(ty, "[null]"), default_options, nullptr),
+ ResultWith(null_result));
+ EXPECT_THAT(func(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null,
null]"}),
+ default_options, nullptr),
+ ResultWith(null_result));
- ScalarAggregateOptions options(/*skip_nulls=*/true, /*min_count=*/0);
- EXPECT_THAT(Sum(ScalarFromJSON(ty, "null"), options),
ResultWith(zero_result));
- EXPECT_THAT(Sum(ArrayFromJSON(ty, "[]"), options), ResultWith(zero_result));
- EXPECT_THAT(Sum(ArrayFromJSON(ty, "[null]"), options),
ResultWith(zero_result));
- EXPECT_THAT(Sum(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, null]"}),
options),
- ResultWith(zero_result));
+ ScalarAggregateOptions options(/*skip_nulls=*/true, /*min_count=*/0);
+ EXPECT_THAT(func(ScalarFromJSON(ty, "null"), options, nullptr),
+ ResultWith(zero_result));
+ EXPECT_THAT(func(ArrayFromJSON(ty, "[]"), options, nullptr),
ResultWith(zero_result));
+ EXPECT_THAT(func(ArrayFromJSON(ty, "[null]"), options, nullptr),
+ ResultWith(zero_result));
+ EXPECT_THAT(func(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null,
null]"}), options,
+ nullptr),
+ ResultWith(zero_result));
+
+ options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/0);
+ EXPECT_THAT(func(ScalarFromJSON(ty, "null"), options, nullptr),
+ ResultWith(null_result));
+ EXPECT_THAT(func(ArrayFromJSON(ty, "[]"), options, nullptr),
ResultWith(zero_result));
+ EXPECT_THAT(func(ArrayFromJSON(ty, "[null]"), options, nullptr),
+ ResultWith(null_result));
+ EXPECT_THAT(func(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null,
null]"}), options,
+ nullptr),
+ ResultWith(null_result));
+ }
+}
- options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/0);
- EXPECT_THAT(Sum(ScalarFromJSON(ty, "null"), options),
ResultWith(null_result));
- EXPECT_THAT(Sum(ArrayFromJSON(ty, "[]"), options), ResultWith(zero_result));
- EXPECT_THAT(Sum(ArrayFromJSON(ty, "[null]"), options),
ResultWith(null_result));
- EXPECT_THAT(Sum(ChunkedArrayFromJSON(ty, {"[null]", "[]", "[null, null]"}),
options),
- ResultWith(null_result));
+TEST(TestSumKernel, Overflow) {
+ int64_t large_scalar = 1000000000000000;
+ int64_t length = 10000;
+ auto scalar = std::make_shared<Int64Scalar>(large_scalar);
Review Comment:
Can we test overflow using more types (including decimals)? It should be
relatively simple even without templating if you use `ScalarFromJSON`.
Also, this is testing overflow when summing a broadcast scalar, but we
should also test overflow within an array.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]