cyb70289 commented on a change in pull request #9604:
URL: https://github.com/apache/arrow/pull/9604#discussion_r585317366



##########
File path: cpp/src/arrow/compute/kernels/aggregate_var_std.cc
##########
@@ -30,6 +30,96 @@ namespace internal {
 namespace {
 
 using arrow::internal::int128_t;
+using arrow::internal::VisitSetBitRunsVoid;
+
+// non-recursive pairwise summation for floating points
+// https://en.wikipedia.org/wiki/Pairwise_summation
+template <typename ValueType, typename SumType, typename ValueFunc>
+enable_if_t<std::is_floating_point<SumType>::value, SumType> SumArray(
+    const ArrayData& data, ValueFunc&& func) {
+  const int64_t data_size = data.length - data.GetNullCount();
+  if (data_size == 0) {
+    return 0;
+  }
+
+  // number of inputs to accumulate before merging with another block
+  constexpr int kBlockSize = 16;  // same as numpy
+  // levels (tree depth) = ceil(log2(len)) + 1, a bit larger than necessary
+  const int levels = BitUtil::Log2(static_cast<uint64_t>(data_size)) + 1;
+  // temporary summation per level
+  std::vector<SumType> sum(levels);
+  // whether two summations are ready and should be reduced to upper level
+  // one bit for each level, bit0 -> level0, ...
+  uint64_t mask = 0;
+  // level of root node holding the final summation
+  int root_level = 0;
+
+  // reduce summation of one block (may be smaller than kBlockSize) from leaf 
node
+  // continue reducing to upper level if two summations are ready for non-leaf 
node
+  auto reduce = [&](SumType block_sum) {
+    int cur_level = 0;
+    uint64_t cur_level_mask = 1ULL;
+    sum[cur_level] += block_sum;
+    mask ^= cur_level_mask;
+    while ((mask & cur_level_mask) == 0) {
+      block_sum = sum[cur_level];
+      sum[cur_level] = 0;
+      ++cur_level;
+      DCHECK_LT(cur_level, levels);
+      cur_level_mask <<= 1;
+      sum[cur_level] += block_sum;
+      mask ^= cur_level_mask;
+    }
+    root_level = std::max(root_level, cur_level);
+  };
+
+  const ValueType* values = data.GetValues<ValueType>(1);
+  VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+                      [&](int64_t pos, int64_t len) {
+                        const ValueType* v = &values[pos];
+                        const int64_t blocks = len / kBlockSize;
+                        const int64_t remains = len % kBlockSize;

Review comment:
       Changed to unsigned division. Not using shift/mask as kBlockSize may be 
not 2^n.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to