pitrou commented on a change in pull request #11111:
URL: https://github.com/apache/arrow/pull/11111#discussion_r707517448



##########
File path: cpp/src/arrow/array/validate.cc
##########
@@ -498,6 +498,11 @@ struct ValidateArrayFullImpl {
 
   Status Visit(const StructType& type) {
     // Validate children
+    if (data.child_data.size() != static_cast<size_t>(type.num_fields())) {

Review comment:
       I'm a bit surprised. This should already be checked by the 
`ValidateArray` top-level function, no?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -322,6 +411,7 @@ struct MinMaxImpl : public ScalarAggregator {
     StateType local;
     local.has_nulls = !scalar.is_valid;
     local.has_values = scalar.is_valid;
+    this->count += scalar.is_valid;

Review comment:
       `has_values` is a bit redundant now that the count of valid values is 
tracked, isn't it?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -289,27 +313,92 @@ struct MinMaxState<ArrowType, SimdLevel, 
enable_if_decimal<ArrowType>> {
     return *this;
   }
 
-  void MergeOne(const uint8_t* value) { MergeOne(T(value)); }
+  void MergeOne(util::string_view value) {
+    MergeOne(T(reinterpret_cast<const uint8_t*>(value.data())));
+  }
 
   void MergeOne(const T value) {
     this->min = std::min(this->min, value);
     this->max = std::max(this->max, value);
   }
 
+  Status GetValues(const std::shared_ptr<DataType>& type,
+                   std::vector<std::shared_ptr<Scalar>>* values) {
+    values->emplace_back(std::make_shared<ScalarType>(min, type));
+    values->emplace_back(std::make_shared<ScalarType>(max, type));
+    return Status::OK();
+  }
+
   T min;
   T max;
   bool has_nulls = false;
   bool has_values = false;
 };
 
+template <typename ArrowType, SimdLevel::type SimdLevel>
+struct MinMaxState<ArrowType, SimdLevel,
+                   enable_if_t<is_base_binary_type<ArrowType>::value ||
+                               std::is_same<ArrowType, 
FixedSizeBinaryType>::value>> {
+  using ThisType = MinMaxState<ArrowType, SimdLevel>;
+  using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
+
+  ThisType& operator+=(const ThisType& rhs) {
+    if (!this->seen && rhs.seen) {
+      this->min = rhs.min;
+      this->max = rhs.max;
+    } else if (this->seen && rhs.seen) {
+      if (this->min > rhs.min) {
+        this->min = rhs.min;
+      }
+      if (this->max < rhs.max) {
+        this->max = rhs.max;
+      }
+    }
+    this->has_nulls |= rhs.has_nulls;
+    this->has_values |= rhs.has_values;
+    this->seen |= rhs.seen;
+    return *this;
+  }
+
+  void MergeOne(util::string_view value) {
+    if (!seen) {
+      this->min = std::string(value);
+      this->max = std::string(value);
+    } else {
+      if (value < util::string_view(this->min)) {
+        this->min = std::string(value);
+      }
+      if (value > util::string_view(this->max)) {

Review comment:
       Should this be `else if`? Presumably `value` cannot be bother smaller 
than the min and greater than the max, and it would spare a string comparison.

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -1762,9 +1768,47 @@ struct AntiExtrema<Decimal256> {
 };
 
 template <typename Type>
-struct GroupedMinMaxImpl : public GroupedAggregator {
+struct GroupedMinMaxTraits {
   using CType = typename TypeTraits<Type>::CType;
 
+  static void Update(CType* raw_mins, CType* raw_maxes, uint32_t g, CType val) 
{
+    raw_maxes[g] = std::max(raw_maxes[g], val);
+    raw_mins[g] = std::min(raw_mins[g], val);
+  }
+
+  static void Merge(CType* raw_mins, CType* raw_maxes, uint32_t g,
+                    const CType* other_raw_mins, const CType* other_raw_maxes,
+                    uint32_t other_g) {
+    raw_mins[g] = std::min(raw_mins[g], other_raw_mins[other_g]);
+    raw_maxes[g] = std::max(raw_maxes[g], other_raw_maxes[other_g]);
+  }
+};

Review comment:
       Note that this not very specific to min_max and you could simply have 
(assuming this is later useful for other types and aggregations?):
   ```c++
   template <typename T>
   struct GroupedValueTraits {
     using CType = typename TypeTraits<Type>::CType;
   
     static CType Get(const CType* values, uint32_t g) {
       return values[g];
     }
     static void Set(CType* values, uint32_t g, CType v) {
       values[g] = v;
     }
   };
   
   template <>
   struct GroupedValueTraits<BooleanType> {
     static bool Get(const uint8_t* values, uint32_t g) {
       return BitUtil::GetBit(values, g);
     }
     static void Set(CType* values, uint32_t g, CType v) {
       BitUtil::SetBit(values, g, v);
     }
   };
   ```
   

##########
File path: docs/source/cpp/compute.rst
##########
@@ -326,7 +332,10 @@ equivalents above and reflects how they are implemented 
internally.
   are emitted. This never affects the grouping keys, only group values
   (i.e. you may get a group where the key is null).
 
-* \(3) Output is a ``{"min": input type, "max": input type}`` Struct scalar.
+* \(3) Output is a ``{"min": input type, "max": input type}`` Struct array.
+
+  The month-day-nano interval type is not supported as it cannot be

Review comment:
       Same question.

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -1737,6 +1737,12 @@ struct AntiExtrema {
   static constexpr CType anti_max() { return 
std::numeric_limits<CType>::min(); }
 };
 
+template <>
+struct AntiExtrema<bool> {
+  static constexpr float anti_min() { return true; }
+  static constexpr float anti_max() { return false; }

Review comment:
       I'm curious, why is float returned?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -220,16 +220,24 @@ struct MinMaxState<ArrowType, SimdLevel, 
enable_if_boolean<ArrowType>> {
     this->max = this->max || value;
   }
 
+  Status GetValues(const std::shared_ptr<DataType>&,
+                   std::vector<std::shared_ptr<Scalar>>* values) {
+    values->emplace_back(std::make_shared<BooleanScalar>(min));
+    values->emplace_back(std::make_shared<BooleanScalar>(max));
+    return Status::OK();
+  }
+
   T min = true;
   T max = false;
   bool has_nulls = false;
   bool has_values = false;
 };
 
 template <typename ArrowType, SimdLevel::type SimdLevel>
-struct MinMaxState<ArrowType, SimdLevel, enable_if_integer<ArrowType>> {
+struct MinMaxState<ArrowType, SimdLevel, 
enable_if_physical_integer<ArrowType>> {

Review comment:
       It seems this will generate different template specializations for types 
with the same underlying `c_type`. Is it desirable?
   (note that `GetValues` is called with the output type independently from the 
`ArrowType` parameter)

##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -511,27 +620,44 @@ struct MinMaxInitState {
                   const ScalarAggregateOptions& options)
       : ctx(ctx), in_type(in_type), out_type(out_type), options(options) {}
 
-  Status Visit(const DataType&) {
-    return Status::NotImplemented("No min/max implemented");
+  Status Visit(const DataType& ty) {
+    return Status::NotImplemented("No min/max implemented for ", ty);
   }
 
   Status Visit(const HalfFloatType&) {
     return Status::NotImplemented("No min/max implemented");

Review comment:
       Update message here as well?

##########
File path: docs/source/cpp/compute.rst
##########
@@ -226,6 +229,9 @@ Notes:
 
 * \(3) Output is a ``{"min": input type, "max": input type}`` Struct.
 
+  The month-day-nano interval type is not supported as it cannot be
+  sorted.

Review comment:
       Neither is day-time, right?

##########
File path: cpp/src/arrow/compute/kernels/codegen_internal.h
##########
@@ -407,12 +407,19 @@ ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec 
exec);
 // Helpers for iterating over common DataType instances for adding kernels to
 // functions
 
+ARROW_EXPORT
 const std::vector<std::shared_ptr<DataType>>& BaseBinaryTypes();

Review comment:
       Hmm... this seems to imply that `codegen_internal.h` is included in 
tests, but that's a rather costly inclusion. Instead, we already have some 
declarations in `gtest_util.h` (see `BinaryArrowTypes` and friends).

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -1866,17 +1940,40 @@ Result<std::unique_ptr<KernelState>> 
MinMaxInit(KernelContext* ctx,
 
 struct GroupedMinMaxFactory {
   template <typename T>
-  enable_if_number<T, Status> Visit(const T&) {
+  enable_if_physical_integer<T, Status> Visit(const T&) {

Review comment:
       Same remark as for the scalar aggregations: this will generate a 
separate kernel per logical type?

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -1866,17 +1940,40 @@ Result<std::unique_ptr<KernelState>> 
MinMaxInit(KernelContext* ctx,
 
 struct GroupedMinMaxFactory {
   template <typename T>
-  enable_if_number<T, Status> Visit(const T&) {
+  enable_if_physical_integer<T, Status> Visit(const T&) {
     kernel = MakeKernel(std::move(argument_type), MinMaxInit<T>);
     return Status::OK();
   }
 
+  // MSVC2015 apparently doesn't compile this properly if we use
+  // enable_if_floating_point

Review comment:
       Ha :-)

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
##########
@@ -1305,35 +1305,49 @@ TEST(GroupBy, VarianceOptions) {
 }
 
 TEST(GroupBy, MinMaxOnly) {
+  auto in_schema = schema({
+      field("argument", float64()),
+      field("argument1", null()),
+      field("argument2", boolean()),
+      field("key", int64()),
+  });
   for (bool use_exec_plan : {false, true}) {
     for (bool use_threads : {true, false}) {
       SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");
 
-      auto table = TableFromJSON(
-          schema({field("argument", float64()), field("key", int64())}), {R"([
-    [1.0,   1],
-    [null,  1]
-                        ])",
-                                                                          R"([
-    [0.0,   2],
-    [null,  3],
-    [4.0,   null],
-    [3.25,  1],
-    [0.125, 2]
-                        ])",
-                                                                          R"([
-    [-0.25, 2],
-    [0.75,  null],
-    [null,  3]
-                        ])"});
+      // bool, day time interval

Review comment:
       Hmm, I don't see any interval in the type definition above. Is this 
comment outdated?

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -1786,14 +1830,13 @@ struct GroupedMinMaxImpl : public GroupedAggregator {
   }
 
   Status Consume(const ExecBatch& batch) override {
-    auto raw_mins = reinterpret_cast<CType*>(mins_.mutable_data());
-    auto raw_maxes = reinterpret_cast<CType*>(maxes_.mutable_data());
+    auto raw_mins = reinterpret_cast<ArrType*>(mins_.mutable_data());
+    auto raw_maxes = reinterpret_cast<ArrType*>(maxes_.mutable_data());

Review comment:
       Since `mins_` and `maxes_` are TypedBufferBuilders, is the 
`reinterpret_cast` at all required?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to