This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new b7044a1 ARROW-7336: [C++][Compute] fix minmax kernel options
b7044a1 is described below
commit b7044a15b0008f85c0ba619ea60a1515dd657e1c
Author: Yuan Zhou <[email protected]>
AuthorDate: Thu Apr 9 12:11:03 2020 -0400
ARROW-7336: [C++][Compute] fix minmax kernel options
minmax kernel has MinMaxOptions but not used.
This patch allows user to make use of this option
Signed-off-by: Yuan Zhou <[email protected]>
Closes #5981 from zhouyuan/wip_minmax_option
Lead-authored-by: Yuan Zhou <[email protected]>
Co-authored-by: Benjamin Kietzman <[email protected]>
Signed-off-by: Benjamin Kietzman <[email protected]>
---
cpp/src/arrow/compute/kernels/aggregate_test.cc | 90 +++++++++++++++++++++----
cpp/src/arrow/compute/kernels/minmax.cc | 22 +++++-
cpp/src/arrow/compute/kernels/minmax.h | 13 +++-
3 files changed, 108 insertions(+), 17 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc
b/cpp/src/arrow/compute/kernels/aggregate_test.cc
index e139675..bb48861 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc
@@ -40,6 +40,7 @@
namespace arrow {
+using internal::checked_cast;
using internal::checked_pointer_cast;
namespace compute {
@@ -364,25 +365,54 @@ class TestNumericMinMaxKernel : public ComputeFixture,
public TestBase {
using ScalarType = typename Traits::ScalarType;
public:
- template <typename Min, typename Max>
- void AssertMinMaxIs(std::string array_json, Min expected_min, Max
expected_max,
+ void AssertMinMaxIs(const Datum& array, c_type expected_min, c_type
expected_max,
const MinMaxOptions& options) {
- auto array = ArrayFromJSON(Traits::type_singleton(), array_json);
- Datum out, out_min, out_max;
- ASSERT_OK(MinMax(&this->ctx_, options, *array, &out));
+ Datum out;
+ ASSERT_OK(MinMax(&this->ctx_, options, array, &out));
ASSERT_TRUE(out.is_collection());
- auto col = out.collection();
- out_min = col[0];
+ Datum out_min = out.collection()[0];
ASSERT_TRUE(out_min.is_scalar());
- auto min = checked_pointer_cast<ScalarType>(out_min.scalar());
- ASSERT_EQ(min->value, static_cast<c_type>(expected_min));
+ ASSERT_EQ(checked_cast<const ScalarType&>(*out_min.scalar()).value,
expected_min);
- out_max = col[1];
+ Datum out_max = out.collection()[1];
ASSERT_TRUE(out_max.is_scalar());
- auto max = checked_pointer_cast<ScalarType>(out_max.scalar());
- ASSERT_EQ(max->value, static_cast<c_type>(expected_max));
+ ASSERT_EQ(checked_cast<const ScalarType&>(*out_max.scalar()).value,
expected_max);
+ }
+
+ void AssertMinMaxIs(const std::string& json, c_type expected_min, c_type
expected_max,
+ const MinMaxOptions& options) {
+ auto array = ArrayFromJSON(Traits::type_singleton(), json);
+ AssertMinMaxIs(array, expected_min, expected_max, options);
+ }
+
+ void AssertMinMaxIs(const std::vector<std::string>& json, c_type
expected_min,
+ c_type expected_max, const MinMaxOptions& options) {
+ auto array = ChunkedArrayFromJSON(Traits::type_singleton(), json);
+ AssertMinMaxIs(array, expected_min, expected_max, options);
+ }
+
+ void AssertMinMaxIsNull(const Datum& array, const MinMaxOptions& options) {
+ Datum out;
+ ASSERT_OK(MinMax(&this->ctx_, options, array, &out));
+
+ ASSERT_TRUE(out.is_collection());
+ for (const auto& out : out.collection()) {
+ ASSERT_TRUE(out.is_scalar());
+ ASSERT_FALSE(out.scalar()->is_valid);
+ }
+ }
+
+ void AssertMinMaxIsNull(const std::string& json, const MinMaxOptions&
options) {
+ auto array = ArrayFromJSON(Traits::type_singleton(), json);
+ AssertMinMaxIsNull(array, options);
+ }
+
+ void AssertMinMaxIsNull(const std::vector<std::string>& json,
+ const MinMaxOptions& options) {
+ auto array = ChunkedArrayFromJSON(Traits::type_singleton(), json);
+ AssertMinMaxIsNull(array, options);
}
};
@@ -392,18 +422,54 @@ class TestFloatingMinMaxKernel : public
TestNumericMinMaxKernel<ArrowType> {};
TYPED_TEST_SUITE(TestNumericMinMaxKernel, IntegralArrowTypes);
TYPED_TEST(TestNumericMinMaxKernel, Basics) {
MinMaxOptions options;
+ std::vector<std::string> chunked_input1 = {"[5, 1, 2, 3, 4]", "[9, 1, null,
3, 4]"};
+ std::vector<std::string> chunked_input2 = {"[5, null, 2, 3, 4]", "[9, 1, 2,
3, 4]"};
+ std::vector<std::string> chunked_input3 = {"[5, 1, 2, 3, null]", "[9, 1,
null, 3, 4]"};
+
this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options);
this->AssertMinMaxIs("[5, null, 2, 3, 4]", 2, 5, options);
+ this->AssertMinMaxIs(chunked_input1, 1, 9, options);
+ this->AssertMinMaxIs(chunked_input2, 1, 9, options);
+ this->AssertMinMaxIs(chunked_input3, 1, 9, options);
+
+ options = MinMaxOptions(MinMaxOptions::OUTPUT_NULL);
+ this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options);
+ // output null
+ this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options);
+ // output null
+ this->AssertMinMaxIsNull(chunked_input1, options);
+ this->AssertMinMaxIsNull(chunked_input2, options);
+ this->AssertMinMaxIsNull(chunked_input3, options);
}
TYPED_TEST_SUITE(TestFloatingMinMaxKernel, RealArrowTypes);
TYPED_TEST(TestFloatingMinMaxKernel, Floats) {
MinMaxOptions options;
+ std::vector<std::string> chunked_input1 = {"[5, 1, 2, 3, 4]", "[9, 1, null,
3, 4]"};
+ std::vector<std::string> chunked_input2 = {"[5, null, 2, 3, 4]", "[9, 1, 2,
3, 4]"};
+ std::vector<std::string> chunked_input3 = {"[5, 1, 2, 3, null]", "[9, 1,
null, 3, 4]"};
+
+ this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options);
this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options);
this->AssertMinMaxIs("[5, null, 2, 3, 4]", 2, 5, options);
this->AssertMinMaxIs("[5, Inf, 2, 3, 4]", 2.0, INFINITY, options);
this->AssertMinMaxIs("[5, NaN, 2, 3, 4]", 2, 5, options);
this->AssertMinMaxIs("[5, -Inf, 2, 3, 4]", -INFINITY, 5, options);
+ this->AssertMinMaxIs(chunked_input1, 1, 9, options);
+ this->AssertMinMaxIs(chunked_input2, 1, 9, options);
+ this->AssertMinMaxIs(chunked_input3, 1, 9, options);
+
+ options = MinMaxOptions(MinMaxOptions::OUTPUT_NULL);
+ this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options);
+ this->AssertMinMaxIs("[5, -Inf, 2, 3, 4]", -INFINITY, 5, options);
+ // output null
+ this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options);
+ // output null
+ this->AssertMinMaxIsNull("[5, -Inf, null, 3, 4]", options);
+ // output null
+ this->AssertMinMaxIsNull(chunked_input1, options);
+ this->AssertMinMaxIsNull(chunked_input2, options);
+ this->AssertMinMaxIsNull(chunked_input3, options);
}
} // namespace compute
diff --git a/cpp/src/arrow/compute/kernels/minmax.cc
b/cpp/src/arrow/compute/kernels/minmax.cc
index 0aff757..fb6f810 100644
--- a/cpp/src/arrow/compute/kernels/minmax.cc
+++ b/cpp/src/arrow/compute/kernels/minmax.cc
@@ -39,6 +39,7 @@ struct MinMaxState<ArrowType, enable_if_integer<ArrowType>> {
using c_type = typename ArrowType::c_type;
ThisType& operator+=(const ThisType& rhs) {
+ this->has_nulls |= rhs.has_nulls;
this->min = std::min(this->min, rhs.min);
this->max = std::max(this->max, rhs.max);
return *this;
@@ -51,6 +52,7 @@ struct MinMaxState<ArrowType, enable_if_integer<ArrowType>> {
c_type min = std::numeric_limits<c_type>::max();
c_type max = std::numeric_limits<c_type>::min();
+ bool has_nulls = false;
};
template <typename ArrowType>
@@ -59,6 +61,7 @@ struct MinMaxState<ArrowType,
enable_if_floating_point<ArrowType>> {
using c_type = typename ArrowType::c_type;
ThisType& operator+=(const ThisType& rhs) {
+ this->has_nulls |= rhs.has_nulls;
this->min = std::fmin(this->min, rhs.min);
this->max = std::fmax(this->max, rhs.max);
return *this;
@@ -71,6 +74,7 @@ struct MinMaxState<ArrowType,
enable_if_floating_point<ArrowType>> {
c_type min = std::numeric_limits<c_type>::infinity();
c_type max = -std::numeric_limits<c_type>::infinity();
+ bool has_nulls = false;
};
template <typename ArrowType>
@@ -84,10 +88,16 @@ class MinMaxAggregateFunction final
Status Consume(const Array& array, StateType* state) const override {
StateType local;
+ local.has_nulls = array.null_count() > 0;
+ if (local.has_nulls && options_.null_handling ==
MinMaxOptions::OUTPUT_NULL) {
+ *state = local;
+ return Status::OK();
+ }
+
const auto values =
checked_cast<const typename TypeTraits<ArrowType>::ArrayType&>(array)
.raw_values();
- if (array.null_count() != 0) {
+ if (array.null_count() > 0) {
internal::BitmapReader reader(array.null_bitmap_data(), array.offset(),
array.length());
for (int64_t i = 0; i < array.length(); i++) {
@@ -101,6 +111,7 @@ class MinMaxAggregateFunction final
local.MergeOne(values[i]);
}
}
+
*state = local;
return Status::OK();
}
@@ -111,7 +122,14 @@ class MinMaxAggregateFunction final
}
Status Finalize(const StateType& src, Datum* output) const override {
- *output = Datum({Datum(src.min), Datum(src.max)});
+ using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
+ if (src.has_nulls && options_.null_handling == MinMaxOptions::OUTPUT_NULL)
{
+ *output = Datum(
+ {Datum(std::make_shared<ScalarType>()),
Datum(std::make_shared<ScalarType>())});
+ } else {
+ *output = Datum({Datum(src.min), Datum(src.max)});
+ }
+
return Status::OK();
}
diff --git a/cpp/src/arrow/compute/kernels/minmax.h
b/cpp/src/arrow/compute/kernels/minmax.h
index 4ac4e31..237796c 100644
--- a/cpp/src/arrow/compute/kernels/minmax.h
+++ b/cpp/src/arrow/compute/kernels/minmax.h
@@ -36,11 +36,18 @@ class AggregateFunction;
/// \class MinMaxOptions
///
/// The user can control the MinMax kernel behavior with this class. By
default,
-/// it will return null if there is a null value present.
+/// it will skip null if there is a null value present.
struct ARROW_EXPORT MinMaxOptions {
- // MinMaxOptions() : skip_nulls(false) {}
+ enum mode {
+ /// skip null values
+ SKIP = 0,
+ /// any nulls will result in null output
+ OUTPUT_NULL
+ };
- // bool skip_nulls;
+ explicit MinMaxOptions(enum mode null_handling = SKIP) :
null_handling(null_handling) {}
+
+ enum mode null_handling = SKIP;
};
/// \brief Return a Min/Max Kernel