pitrou commented on code in PR #44184:
URL: https://github.com/apache/arrow/pull/44184#discussion_r2137063623
##########
cpp/src/arrow/compute/kernels/codegen_internal.cc:
##########
@@ -528,6 +533,28 @@ Status CastDecimalArgs(TypeHolder* begin, size_t count) {
return Status::OK();
}
+Result<std::shared_ptr<DataType>> WidenDecimalToMaxPrecision(
+ std::shared_ptr<DataType> type) {
+ DCHECK(is_decimal(type->id()));
Review Comment:
Error checking here is weird. Why are we using asserts if we're also
returning a `Result` in case of error? We should do one or the other.
##########
cpp/src/arrow/compute/kernels/hash_aggregate_numeric.cc:
##########
@@ -259,8 +271,11 @@ struct GroupedReducingFactory {
// Sum implementation
template <typename Type>
-struct GroupedSumImpl : public GroupedReducingAggregator<Type,
GroupedSumImpl<Type>> {
- using Base = GroupedReducingAggregator<Type, GroupedSumImpl<Type>>;
+struct GroupedSumImpl
+ : public GroupedReducingAggregator<Type, GroupedSumImpl<Type>,
+ typename
FindAccumulatorType<Type>::Type> {
Review Comment:
I'm curious, why was this change necessary?
##########
docs/source/cpp/compute.rst:
##########
@@ -171,8 +171,8 @@ To avoid exhaustively listing supported types, the tables
below use a number
of general type categories:
* "Numeric": Integer types (Int8, etc.) and Floating-point types (Float32,
- Float64, sometimes Float16). Some functions also accept Decimal128 and
- Decimal256 input.
+ Float64, sometimes Float16). Some functions also accept
+ Decimal128/256 input.
Review Comment:
More and more functions now also accept decimal32/decimal64.
```suggestion
Float64, sometimes Float16). Some functions also accept Decimal input.
```
##########
python/pyarrow/tests/test_compute.py:
##########
@@ -355,10 +355,55 @@ def test_sum_array(arrow_type):
arr = pa.array([], type=arrow_type)
assert arr.sum().as_py() is None # noqa: E711
+ assert pc.sum(arr).as_py() is None # noqa: E711
Review Comment:
You don't have to test both `arr.sum()` and `pc.sum(arr)` every time, since
the former just calls the latter. This will make the tests more readable
and easier to maintain.
##########
cpp/src/arrow/compute/kernels/aggregate_test.cc:
##########
@@ -495,51 +495,67 @@ TEST_F(TestSumKernelRoundOff, Basics) {
}
TEST(TestDecimalSumKernel, SimpleSum) {
- for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) {
+ std::vector<std::shared_ptr<DataType>> init_types = {decimal128(3, 2),
+ decimal256(3, 2)};
+ std::vector<std::shared_ptr<DataType>> out_types = {decimal128(38, 2),
+ decimal256(76, 2)};
+
+ for (size_t i = 0; i < init_types.size(); ++i) {
+ auto& ty = init_types[i];
+ auto& out_ty = out_types[i];
Review Comment:
Nit: make those const
```suggestion
const auto& ty = init_types[i];
const auto& out_ty = out_types[i];
```
##########
cpp/src/arrow/compute/kernels/aggregate_test.cc:
##########
@@ -495,51 +495,67 @@ TEST_F(TestSumKernelRoundOff, Basics) {
}
TEST(TestDecimalSumKernel, SimpleSum) {
- for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) {
+ std::vector<std::shared_ptr<DataType>> init_types = {decimal128(3, 2),
+ decimal256(3, 2)};
+ std::vector<std::shared_ptr<DataType>> out_types = {decimal128(38, 2),
+ decimal256(76, 2)};
+
+ for (size_t i = 0; i < init_types.size(); ++i) {
+ auto& ty = init_types[i];
+ auto& out_ty = out_types[i];
+
EXPECT_THAT(Sum(ArrayFromJSON(ty, R"([])")),
- ResultWith(ScalarFromJSON(ty, R"(null)")));
+ ResultWith(ScalarFromJSON(out_ty, R"(null)")));
EXPECT_THAT(Sum(ArrayFromJSON(ty, R"([null])")),
- ResultWith(ScalarFromJSON(ty, R"(null)")));
+ ResultWith(ScalarFromJSON(out_ty, R"(null)")));
EXPECT_THAT(
Sum(ArrayFromJSON(ty, R"(["0.00", "1.01", "2.02", "3.03", "4.04",
"5.05"])")),
- ResultWith(ScalarFromJSON(ty, R"("15.15")")));
+ ResultWith(ScalarFromJSON(out_ty, R"("15.15")")));
Datum chunks =
ChunkedArrayFromJSON(ty, {R"(["0.00", "1.01", "2.02", "3.03", "4.04",
"5.05"])"});
- EXPECT_THAT(Sum(chunks), ResultWith(ScalarFromJSON(ty, R"("15.15")")));
+ EXPECT_THAT(Sum(chunks), ResultWith(ScalarFromJSON(out_ty, R"("15.15")")));
chunks = ChunkedArrayFromJSON(
ty, {R"(["0.00", "1.01", "2.02"])", R"(["3.03", "4.04", "5.05"])"});
- EXPECT_THAT(Sum(chunks), ResultWith(ScalarFromJSON(ty, R"("15.15")")));
+ EXPECT_THAT(Sum(chunks), ResultWith(ScalarFromJSON(out_ty, R"("15.15")")));
chunks = ChunkedArrayFromJSON(
ty, {R"(["0.00", "1.01", "2.02"])", "[]", R"(["3.03", "4.04",
"5.05"])"});
- EXPECT_THAT(Sum(chunks), ResultWith(ScalarFromJSON(ty, R"("15.15")")));
+ EXPECT_THAT(Sum(chunks), ResultWith(ScalarFromJSON(out_ty, R"("15.15")")));
ScalarAggregateOptions options(/*skip_nulls=*/true, /*min_count=*/0);
EXPECT_THAT(Sum(ArrayFromJSON(ty, R"([])"), options),
- ResultWith(ScalarFromJSON(ty, R"("0.00")")));
+ ResultWith(ScalarFromJSON(out_ty, R"("0.00")")));
EXPECT_THAT(Sum(ArrayFromJSON(ty, R"([null])"), options),
- ResultWith(ScalarFromJSON(ty, R"("0.00")")));
+ ResultWith(ScalarFromJSON(out_ty, R"("0.00")")));
chunks = ChunkedArrayFromJSON(ty, {});
- EXPECT_THAT(Sum(chunks, options), ResultWith(ScalarFromJSON(ty,
R"("0.00")")));
+ EXPECT_THAT(Sum(chunks, options), ResultWith(ScalarFromJSON(out_ty,
R"("0.00")")));
EXPECT_THAT(
Sum(ArrayFromJSON(ty, R"(["1.01", null, "3.03", null, "5.05", null,
"7.07"])"),
options),
- ResultWith(ScalarFromJSON(ty, R"("16.16")")));
+ ResultWith(ScalarFromJSON(out_ty, R"("16.16")")));
EXPECT_THAT(Sum(ScalarFromJSON(ty, R"("5.05")")),
- ResultWith(ScalarFromJSON(ty, R"("5.05")")));
+ ResultWith(ScalarFromJSON(out_ty, R"("5.05")")));
EXPECT_THAT(Sum(ScalarFromJSON(ty, R"(null)")),
- ResultWith(ScalarFromJSON(ty, R"(null)")));
+ ResultWith(ScalarFromJSON(out_ty, R"(null)")));
EXPECT_THAT(Sum(ScalarFromJSON(ty, R"(null)"), options),
- ResultWith(ScalarFromJSON(ty, R"("0.00")")));
+ ResultWith(ScalarFromJSON(out_ty, R"("0.00")")));
Review Comment:
Ok, but can we also test the effects of the added precision? i.e. test a
case that would fail with the previous behavior of not increasing precision.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]