pitrou commented on code in PR #49679: URL: https://github.com/apache/arrow/pull/49679#discussion_r3086889266
########## cpp/src/arrow/compute/kernels/vector_search_sorted_test.cc: ########## @@ -0,0 +1,409 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <memory> +#include <string> +#include <vector> + +#include <gtest/gtest.h> + +#include "arrow/compute/api.h" +#include "arrow/compute/kernels/test_util_internal.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +using internal::checked_cast; + +namespace compute { +namespace { + +Result<std::shared_ptr<Array>> REEFromJSON(const std::shared_ptr<DataType>& ree_type, + const std::string& json) { + auto ree_type_ptr = checked_cast<const RunEndEncodedType*>(ree_type.get()); + auto array = ArrayFromJSON(ree_type_ptr->value_type(), json); + ARROW_ASSIGN_OR_RAISE( + auto datum, RunEndEncode(array, RunEndEncodeOptions{ree_type_ptr->run_end_type()})); + return datum.make_array(); +} + +void CheckSimpleSearchSorted(const std::shared_ptr<DataType>& type, + const std::string& values_json, + const std::string& needles_json, + const std::string& expected_left_json, + const std::string& expected_right_json) { + auto values = ArrayFromJSON(type, values_json); + auto needles = ArrayFromJSON(type, needles_json); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Right))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), expected_left_json), *left.make_array()); + AssertArraysEqual(*ArrayFromJSON(uint64(), expected_right_json), *right.make_array()); +} + +void CheckSimpleScalarSearchSorted(const std::shared_ptr<DataType>& type, + const std::string& values_json, + const std::string& needle_json, uint64_t expected_left, + uint64_t expected_right) { + auto values = ArrayFromJSON(type, values_json); + auto needle = ScalarFromJSON(type, needle_json); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needle), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needle), + SearchSortedOptions(SearchSortedOptions::Right))); + + ASSERT_TRUE(left.is_scalar()); + ASSERT_TRUE(right.is_scalar()); + ASSERT_EQ(checked_cast<const UInt64Scalar&>(*left.scalar()).value, expected_left); + ASSERT_EQ(checked_cast<const UInt64Scalar&>(*right.scalar()).value, expected_right); +} + +struct SearchSortedSmokeCase { + std::string name; + std::shared_ptr<DataType> type; + std::string values_json; + std::string needles_json; + std::string expected_left_json; + std::string expected_right_json; + std::string scalar_needle_json; + uint64_t expected_scalar_left; + uint64_t expected_scalar_right; +}; + +std::vector<SearchSortedSmokeCase> SupportedTypeSmokeCases() { + return { + {"Boolean", boolean(), "[false, false, true, true]", "[false, true]", "[0, 2]", + "[2, 4]", "true", 2, 4}, + {"Int8", int8(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"Int16", int16(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"Int32", int32(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"Int64", int64(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"UInt8", uint8(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"UInt16", uint16(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"UInt32", uint32(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"UInt64", uint64(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"Float32", float32(), "[1.0, 3.0, 3.0, 5.0]", "[0.0, 3.0, 4.0, 6.0]", + "[0, 1, 3, 4]", "[0, 3, 3, 4]", "3.0", 1, 3}, + {"Float64", float64(), "[1.0, 3.0, 3.0, 5.0]", "[0.0, 3.0, 4.0, 6.0]", + "[0, 1, 3, 4]", "[0, 3, 3, 4]", "3.0", 1, 3}, + {"Date32", date32(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"Date64", date64(), "[86400000, 259200000, 259200000, 432000000]", + "[0, 259200000, 345600000, 518400000]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "259200000", 1, 3}, + {"Time32", time32(TimeUnit::SECOND), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", + "[0, 3, 3, 4]", "3", 1, 3}, + {"Time64", time64(TimeUnit::NANO), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", + "[0, 3, 3, 4]", "3", 1, 3}, + {"Timestamp", timestamp(TimeUnit::SECOND), + R"(["1970-01-02", "1970-01-04", "1970-01-04", "1970-01-06"])", + R"(["1970-01-01", "1970-01-04", "1970-01-05", "1970-01-07"])", "[0, 1, 3, 4]", + "[0, 3, 3, 4]", R"("1970-01-04")", 1, 3}, + {"Duration", duration(TimeUnit::NANO), "[1, 3, 3, 5]", "[0, 3, 4, 6]", + "[0, 1, 3, 4]", "[0, 3, 3, 4]", "3", 1, 3}, + {"Binary", binary(), R"(["aa", "bb", "bb", "dd"])", R"(["a", "bb", "bc", "z"])", + "[0, 1, 3, 4]", "[0, 3, 3, 4]", R"("bb")", 1, 3}, + {"String", utf8(), R"(["aa", "bb", "bb", "dd"])", R"(["a", "bb", "bc", "z"])", + "[0, 1, 3, 4]", "[0, 3, 3, 4]", R"("bb")", 1, 3}, + {"LargeBinary", large_binary(), R"(["aa", "bb", "bb", "dd"])", + R"(["a", "bb", "bc", "z"])", "[0, 1, 3, 4]", "[0, 3, 3, 4]", R"("bb")", 1, 3}, + {"LargeString", large_utf8(), R"(["aa", "bb", "bb", "dd"])", + R"(["a", "bb", "bc", "z"])", "[0, 1, 3, 4]", "[0, 3, 3, 4]", R"("bb")", 1, 3}, + {"BinaryView", binary_view(), R"(["aa", "bb", "bb", "dd"])", + R"(["a", "bb", "bc", "z"])", "[0, 1, 3, 4]", "[0, 3, 3, 4]", R"("bb")", 1, 3}, + {"StringView", utf8_view(), R"(["aa", "bb", "bb", "dd"])", + R"(["a", "bb", "bc", "z"])", "[0, 1, 3, 4]", "[0, 3, 3, 4]", R"("bb")", 1, 3}, + }; +} + +class SearchSortedSupportedTypesTest + : public ::testing::TestWithParam<SearchSortedSmokeCase> {}; + +TEST(SearchSorted, BasicLeftRight) { + auto values = ArrayFromJSON(int64(), "[100, 200, 200, 300, 300]"); + auto needles = ArrayFromJSON(int64(), "[50, 200, 250, 400]"); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Right))); Review Comment: Let's call `ValidateFull` on both results here too? (also I'm curious, why not reuse `CheckSimpleSearchSorted`?) ########## cpp/src/arrow/compute/kernels/vector_search_sorted_test.cc: ########## @@ -0,0 +1,409 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <memory> +#include <string> +#include <vector> + +#include <gtest/gtest.h> + +#include "arrow/compute/api.h" +#include "arrow/compute/kernels/test_util_internal.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +using internal::checked_cast; + +namespace compute { +namespace { + +Result<std::shared_ptr<Array>> REEFromJSON(const std::shared_ptr<DataType>& ree_type, + const std::string& json) { + auto ree_type_ptr = checked_cast<const RunEndEncodedType*>(ree_type.get()); + auto array = ArrayFromJSON(ree_type_ptr->value_type(), json); + ARROW_ASSIGN_OR_RAISE( + auto datum, RunEndEncode(array, RunEndEncodeOptions{ree_type_ptr->run_end_type()})); + return datum.make_array(); +} + +void CheckSimpleSearchSorted(const std::shared_ptr<DataType>& type, + const std::string& values_json, + const std::string& needles_json, + const std::string& expected_left_json, + const std::string& expected_right_json) { + auto values = ArrayFromJSON(type, values_json); + auto needles = ArrayFromJSON(type, needles_json); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Right))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), expected_left_json), *left.make_array()); + AssertArraysEqual(*ArrayFromJSON(uint64(), expected_right_json), *right.make_array()); +} + +void CheckSimpleScalarSearchSorted(const std::shared_ptr<DataType>& type, + const std::string& values_json, + const std::string& needle_json, uint64_t expected_left, + uint64_t expected_right) { + auto values = ArrayFromJSON(type, values_json); + auto needle = ScalarFromJSON(type, needle_json); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needle), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needle), + SearchSortedOptions(SearchSortedOptions::Right))); + + ASSERT_TRUE(left.is_scalar()); + ASSERT_TRUE(right.is_scalar()); + ASSERT_EQ(checked_cast<const UInt64Scalar&>(*left.scalar()).value, expected_left); + ASSERT_EQ(checked_cast<const UInt64Scalar&>(*right.scalar()).value, expected_right); +} + +struct SearchSortedSmokeCase { + std::string name; + std::shared_ptr<DataType> type; + std::string values_json; + std::string needles_json; + std::string expected_left_json; + std::string expected_right_json; + std::string scalar_needle_json; + uint64_t expected_scalar_left; + uint64_t expected_scalar_right; Review Comment: Note that you could also generate the scalar needle tests automatically by calling `GetScalar` on the array needles and the expected results. This would make this easier to maintain later. ########## cpp/src/arrow/compute/kernels/vector_search_sorted_benchmark.cc: ########## @@ -0,0 +1,382 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include <algorithm> +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/compute/api_vector.h" +#include "arrow/datum.h" +#include "arrow/scalar.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/util/benchmark_util.h" + +namespace arrow { +namespace compute { + +constexpr auto kSeed = 0x5EA4C42; +constexpr int64_t kNeedleToValueRatio = 4; +constexpr int64_t kValuesRunLength = 16; +constexpr int64_t kNeedlesRunLength = 8; +constexpr int32_t kStringMinLength = 8; +constexpr int32_t kStringMaxLength = 24; + +void SetSearchSortedQuickArgs(benchmark::internal::Benchmark* bench) { + bench->Unit(benchmark::kMicrosecond); + for (const auto size : std::vector<int64_t>{kL1Size, kL2Size}) { + bench->Arg(size); + } +} + +void SetSearchSortedArgs(benchmark::internal::Benchmark* bench) { + bench->Unit(benchmark::kMicrosecond); + for (const auto size : kMemorySizes) { + bench->Arg(size); + } +} + +int64_t Int64LengthFromBytes(int64_t size_bytes) { + return std::max<int64_t>(1, size_bytes / static_cast<int64_t>(sizeof(int64_t))); +} + +int64_t NeedleLengthFromBytes(int64_t size_bytes) { + return std::max<int64_t>(1, Int64LengthFromBytes(size_bytes) / kNeedleToValueRatio); +} + +int64_t StringLengthFromBytes(int64_t size_bytes) { + const int64_t average_length = (kStringMinLength + kStringMaxLength) / 2; + return std::max<int64_t>(1, size_bytes / average_length); +} + +std::shared_ptr<Int64Array> BuildSortedInt64Values(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed); + const auto length = Int64LengthFromBytes(size_bytes); + const auto max_value = std::max<int64_t>(length / 8, 1); + + auto values = + std::static_pointer_cast<Int64Array>(rand.Int64(length, 0, max_value, 0.0)); + std::vector<int64_t> data(values->raw_values(), + values->raw_values() + values->length()); + std::ranges::sort(data); + + Int64Builder builder; + ABORT_NOT_OK(builder.AppendValues(data)); + return std::static_pointer_cast<Int64Array>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<Int64Array> BuildInt64Needles(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 1); + const auto length = NeedleLengthFromBytes(size_bytes); + const auto max_value = std::max<int64_t>(Int64LengthFromBytes(size_bytes) / 8, 1); + return std::static_pointer_cast<Int64Array>(rand.Int64(length, 0, max_value, 0.0)); +} + +std::shared_ptr<StringArray> BuildSortedStringValues(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 2); + const auto length = StringLengthFromBytes(size_bytes); + auto values = std::static_pointer_cast<StringArray>( + rand.String(length, kStringMinLength, kStringMaxLength, 0.0)); + + std::vector<std::string> data; + data.reserve(static_cast<size_t>(values->length())); + for (int64_t index = 0; index < values->length(); ++index) { + data.push_back(values->GetString(index)); + } + std::ranges::sort(data); + + StringBuilder builder; + ABORT_NOT_OK(builder.AppendValues(data)); + return std::static_pointer_cast<StringArray>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<StringArray> BuildStringNeedles(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 3); + const auto length = std::max<int64_t>(1, StringLengthFromBytes(size_bytes) / 4); + return std::static_pointer_cast<StringArray>( + rand.String(length, kStringMinLength, kStringMaxLength, 0.0)); +} + +std::shared_ptr<BinaryArray> BuildSortedBinaryValues(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 4); + const auto length = StringLengthFromBytes(size_bytes); + const auto unique = std::max<int64_t>(1, length / 8); + auto values = std::static_pointer_cast<BinaryArray>( + rand.BinaryWithRepeats(length, unique, kStringMinLength, kStringMaxLength, 0.0)); + + std::vector<std::string> data; + data.reserve(static_cast<size_t>(values->length())); + for (int64_t index = 0; index < values->length(); ++index) { + data.emplace_back(values->GetView(index)); + } + std::ranges::sort(data); + + BinaryBuilder builder; + ABORT_NOT_OK(builder.AppendValues(data)); + return std::static_pointer_cast<BinaryArray>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<BinaryArray> BuildBinaryNeedles(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 5); + const auto length = std::max<int64_t>(1, StringLengthFromBytes(size_bytes) / 4); + const auto unique = std::max<int64_t>(1, length / 2); + return std::static_pointer_cast<BinaryArray>( + rand.BinaryWithRepeats(length, unique, kStringMinLength, kStringMaxLength, 0.0)); +} + +std::shared_ptr<Int64Array> BuildRunHeavyInt64Values(int64_t logical_length, + int64_t run_length) { + Int64Builder builder; + ABORT_NOT_OK(builder.Reserve(logical_length)); + for (int64_t index = 0; index < logical_length; ++index) { + builder.UnsafeAppend(index / run_length); + } + return std::static_pointer_cast<Int64Array>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<Int64Array> BuildRunHeavyInt64NeedlesWithNullRuns(int64_t logical_length, + int64_t run_length) { + std::vector<int64_t> data(static_cast<size_t>(logical_length), 0); + std::vector<bool> is_valid(static_cast<size_t>(logical_length), true); + + for (int64_t index = 0; index < logical_length; ++index) { + const int64_t run_index = index / run_length; + if (run_index % 4 == 0) { + is_valid[static_cast<size_t>(index)] = false; + continue; + } + data[static_cast<size_t>(index)] = run_index / 2; + } + + Int64Builder builder; + ABORT_NOT_OK(builder.AppendValues(data, is_valid)); + return std::static_pointer_cast<Int64Array>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<Int64Array> BuildInt64NeedlesWithNullRuns(int64_t size_bytes, + int64_t run_length) { + return BuildRunHeavyInt64NeedlesWithNullRuns(NeedleLengthFromBytes(size_bytes), + run_length); +} + +std::shared_ptr<Array> BuildRunEndEncodedInt64Values(int64_t size_bytes, + int64_t run_length) { + auto values = BuildRunHeavyInt64Values(Int64LengthFromBytes(size_bytes), run_length); + return RunEndEncode(Datum(values), RunEndEncodeOptions{int32()}) + .ValueOrDie() + .make_array(); +} + +std::shared_ptr<Array> BuildRunEndEncodedInt64Needles(int64_t size_bytes, + int64_t run_length) { + auto needles = BuildRunHeavyInt64Values(NeedleLengthFromBytes(size_bytes), run_length); + return RunEndEncode(Datum(needles), RunEndEncodeOptions{int32()}) + .ValueOrDie() + .make_array(); +} + +std::shared_ptr<Array> BuildRunEndEncodedInt64NeedlesWithNullRuns(int64_t size_bytes, + int64_t run_length) { + auto needles = BuildRunHeavyInt64NeedlesWithNullRuns(NeedleLengthFromBytes(size_bytes), + run_length); + return RunEndEncode(Datum(needles), RunEndEncodeOptions{int32()}) + .ValueOrDie() + .make_array(); +} + +void SetBenchmarkCounters(benchmark::State& state, const Datum& values, + const Datum& needles) { + const auto values_length = values.length(); + const auto needles_length = needles.length(); + state.counters["values_length"] = static_cast<double>(values_length); + state.counters["needles_length"] = static_cast<double>(needles_length); + state.SetItemsProcessed(state.iterations() * needles_length); +} + +void RunSearchSortedBenchmark(benchmark::State& state, const Datum& values, + const Datum& needles, SearchSortedOptions::Side side) { + const SearchSortedOptions options(side); + for (auto _ : state) { + auto result = SearchSorted(values, needles, options); + ABORT_NOT_OK(result.status()); + benchmark::DoNotOptimize(result.ValueUnsafe()); + } + SetBenchmarkCounters(state, values, needles); +} + +static void BM_SearchSortedInt64ArrayNeedles(benchmark::State& state, + SearchSortedOptions::Side side) { + const Datum values(BuildSortedInt64Values(state.range(0))); + const Datum needles(BuildInt64Needles(state.range(0))); + RunSearchSortedBenchmark(state, values, needles, side); +} + +static void BM_SearchSortedInt64ScalarNeedle(benchmark::State& state, Review Comment: If there is just one needle to search for, we're just benchmarking the function call overhead rather than any significant part of the sorted search kernel, right? Is it useful? (benchmarks for other compute functions focus on array performance, not scalar performance) ########## cpp/src/arrow/compute/kernels/vector_search_sorted_test.cc: ########## @@ -0,0 +1,409 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <memory> +#include <string> +#include <vector> + +#include <gtest/gtest.h> + +#include "arrow/compute/api.h" +#include "arrow/compute/kernels/test_util_internal.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +using internal::checked_cast; + +namespace compute { +namespace { + +Result<std::shared_ptr<Array>> REEFromJSON(const std::shared_ptr<DataType>& ree_type, + const std::string& json) { + auto ree_type_ptr = checked_cast<const RunEndEncodedType*>(ree_type.get()); + auto array = ArrayFromJSON(ree_type_ptr->value_type(), json); + ARROW_ASSIGN_OR_RAISE( + auto datum, RunEndEncode(array, RunEndEncodeOptions{ree_type_ptr->run_end_type()})); + return datum.make_array(); +} + +void CheckSimpleSearchSorted(const std::shared_ptr<DataType>& type, + const std::string& values_json, + const std::string& needles_json, + const std::string& expected_left_json, + const std::string& expected_right_json) { + auto values = ArrayFromJSON(type, values_json); + auto needles = ArrayFromJSON(type, needles_json); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Right))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), expected_left_json), *left.make_array()); + AssertArraysEqual(*ArrayFromJSON(uint64(), expected_right_json), *right.make_array()); +} + +void CheckSimpleScalarSearchSorted(const std::shared_ptr<DataType>& type, + const std::string& values_json, + const std::string& needle_json, uint64_t expected_left, + uint64_t expected_right) { + auto values = ArrayFromJSON(type, values_json); + auto needle = ScalarFromJSON(type, needle_json); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needle), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needle), + SearchSortedOptions(SearchSortedOptions::Right))); + + ASSERT_TRUE(left.is_scalar()); + ASSERT_TRUE(right.is_scalar()); + ASSERT_EQ(checked_cast<const UInt64Scalar&>(*left.scalar()).value, expected_left); + ASSERT_EQ(checked_cast<const UInt64Scalar&>(*right.scalar()).value, expected_right); +} + +struct SearchSortedSmokeCase { + std::string name; + std::shared_ptr<DataType> type; + std::string values_json; + std::string needles_json; + std::string expected_left_json; + std::string expected_right_json; + std::string scalar_needle_json; + uint64_t expected_scalar_left; + uint64_t expected_scalar_right; +}; + +std::vector<SearchSortedSmokeCase> SupportedTypeSmokeCases() { + return { + {"Boolean", boolean(), "[false, false, true, true]", "[false, true]", "[0, 2]", + "[2, 4]", "true", 2, 4}, + {"Int8", int8(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"Int16", int16(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"Int32", int32(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"Int64", int64(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"UInt8", uint8(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"UInt16", uint16(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"UInt32", uint32(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"UInt64", uint64(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"Float32", float32(), "[1.0, 3.0, 3.0, 5.0]", "[0.0, 3.0, 4.0, 6.0]", + "[0, 1, 3, 4]", "[0, 3, 3, 4]", "3.0", 1, 3}, + {"Float64", float64(), "[1.0, 3.0, 3.0, 5.0]", "[0.0, 3.0, 4.0, 6.0]", + "[0, 1, 3, 4]", "[0, 3, 3, 4]", "3.0", 1, 3}, + {"Date32", date32(), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "3", 1, 3}, + {"Date64", date64(), "[86400000, 259200000, 259200000, 432000000]", + "[0, 259200000, 345600000, 518400000]", "[0, 1, 3, 4]", "[0, 3, 3, 4]", + "259200000", 1, 3}, + {"Time32", time32(TimeUnit::SECOND), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", + "[0, 3, 3, 4]", "3", 1, 3}, + {"Time64", time64(TimeUnit::NANO), "[1, 3, 3, 5]", "[0, 3, 4, 6]", "[0, 1, 3, 4]", + "[0, 3, 3, 4]", "3", 1, 3}, + {"Timestamp", timestamp(TimeUnit::SECOND), + R"(["1970-01-02", "1970-01-04", "1970-01-04", "1970-01-06"])", + R"(["1970-01-01", "1970-01-04", "1970-01-05", "1970-01-07"])", "[0, 1, 3, 4]", + "[0, 3, 3, 4]", R"("1970-01-04")", 1, 3}, + {"Duration", duration(TimeUnit::NANO), "[1, 3, 3, 5]", "[0, 3, 4, 6]", + "[0, 1, 3, 4]", "[0, 3, 3, 4]", "3", 1, 3}, + {"Binary", binary(), R"(["aa", "bb", "bb", "dd"])", R"(["a", "bb", "bc", "z"])", + "[0, 1, 3, 4]", "[0, 3, 3, 4]", R"("bb")", 1, 3}, + {"String", utf8(), R"(["aa", "bb", "bb", "dd"])", R"(["a", "bb", "bc", "z"])", + "[0, 1, 3, 4]", "[0, 3, 3, 4]", R"("bb")", 1, 3}, + {"LargeBinary", large_binary(), R"(["aa", "bb", "bb", "dd"])", + R"(["a", "bb", "bc", "z"])", "[0, 1, 3, 4]", "[0, 3, 3, 4]", R"("bb")", 1, 3}, + {"LargeString", large_utf8(), R"(["aa", "bb", "bb", "dd"])", + R"(["a", "bb", "bc", "z"])", "[0, 1, 3, 4]", "[0, 3, 3, 4]", R"("bb")", 1, 3}, + {"BinaryView", binary_view(), R"(["aa", "bb", "bb", "dd"])", + R"(["a", "bb", "bc", "z"])", "[0, 1, 3, 4]", "[0, 3, 3, 4]", R"("bb")", 1, 3}, + {"StringView", utf8_view(), R"(["aa", "bb", "bb", "dd"])", + R"(["a", "bb", "bc", "z"])", "[0, 1, 3, 4]", "[0, 3, 3, 4]", R"("bb")", 1, 3}, + }; +} + +class SearchSortedSupportedTypesTest + : public ::testing::TestWithParam<SearchSortedSmokeCase> {}; + +TEST(SearchSorted, BasicLeftRight) { + auto values = ArrayFromJSON(int64(), "[100, 200, 200, 300, 300]"); + auto needles = ArrayFromJSON(int64(), "[50, 200, 250, 400]"); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Right))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), "[0, 1, 3, 5]"), *left.make_array()); + AssertArraysEqual(*ArrayFromJSON(uint64(), "[0, 3, 3, 5]"), *right.make_array()); +} + +TEST(SearchSorted, ScalarNeedle) { + auto values = ArrayFromJSON(int32(), "[1, 3, 5, 7]"); + + ASSERT_OK_AND_ASSIGN( + auto result, SearchSorted(Datum(values), Datum(std::make_shared<Int32Scalar>(5)), + SearchSortedOptions(SearchSortedOptions::Right))); + + ASSERT_TRUE(result.is_scalar()); + ASSERT_EQ(checked_cast<const UInt64Scalar&>(*result.scalar()).value, 3); +} + +TEST(SearchSorted, ScalarStringNeedle) { + auto values = ArrayFromJSON(utf8(), R"(["aa", "bb", "bb", "cc"])"); + + ASSERT_OK_AND_ASSIGN( + auto result, + SearchSorted(Datum(values), Datum(std::make_shared<StringScalar>("bb")), + SearchSortedOptions(SearchSortedOptions::Right))); + + ASSERT_TRUE(result.is_scalar()); + ASSERT_EQ(checked_cast<const UInt64Scalar&>(*result.scalar()).value, 3); +} + +TEST(SearchSorted, EmptyHaystack) { + auto values = ArrayFromJSON(int16(), "[]"); + auto needles = ArrayFromJSON(int16(), "[1, 2, 3]"); + + ASSERT_OK_AND_ASSIGN(auto result, SearchSorted(Datum(values), Datum(needles))); + AssertArraysEqual(*ArrayFromJSON(uint64(), "[0, 0, 0]"), *result.make_array()); +} + +TEST(SearchSorted, ValuesWithLeadingNulls) { + auto values = ArrayFromJSON(int32(), "[null, 200, 300, 300]"); + auto needles = ArrayFromJSON(int32(), "[50, 200, 250, 400]"); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Right))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), "[1, 1, 2, 4]"), *left.make_array()); + AssertArraysEqual(*ArrayFromJSON(uint64(), "[1, 2, 2, 4]"), *right.make_array()); +} + +TEST(SearchSorted, ValuesAllNull) { + auto values = ArrayFromJSON(int32(), "[null, null, null]"); + auto needles = ArrayFromJSON(int32(), "[50, 200, null]"); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Right))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), "[3, 3, null]"), *left.make_array()); + AssertArraysEqual(*ArrayFromJSON(uint64(), "[3, 3, null]"), *right.make_array()); +} + +TEST(SearchSorted, ValuesWithTrailingNulls) { + auto values = ArrayFromJSON(int32(), "[200, 300, 300, null, null]"); + auto needles = ArrayFromJSON(int32(), "[50, 200, 250, 400]"); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Right))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), "[0, 0, 1, 3]"), *left.make_array()); + AssertArraysEqual(*ArrayFromJSON(uint64(), "[0, 1, 1, 3]"), *right.make_array()); +} + +TEST(SearchSorted, NullNeedlesEmitNull) { + auto values = ArrayFromJSON(int32(), "[null, 200, 300, 300]"); + auto needles = ArrayFromJSON(int32(), "[null, 50, 200, null, 400]"); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Right))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), "[null, 1, 1, null, 4]"), + *left.make_array()); + AssertArraysEqual(*ArrayFromJSON(uint64(), "[null, 1, 2, null, 4]"), + *right.make_array()); + + ASSERT_OK_AND_ASSIGN(auto scalar_result, + SearchSorted(Datum(values), Datum(std::make_shared<Int32Scalar>()), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_TRUE(scalar_result.is_scalar()); + ASSERT_FALSE(scalar_result.scalar()->is_valid); + ASSERT_TRUE(scalar_result.scalar()->type->Equals(uint64())); +} + +TEST(SearchSorted, RejectUnclusteredNullValues) { + auto values = ArrayFromJSON(int32(), "[null, 1, null, 3]"); + auto needles = ArrayFromJSON(int32(), "[2]"); + + ASSERT_RAISES(Invalid, SearchSorted(Datum(values), Datum(needles))); +} + +TEST(SearchSorted, RunEndEncodedNulls) { + auto values_type = run_end_encoded(int16(), int32()); + ASSERT_OK_AND_ASSIGN(auto ree_values, + REEFromJSON(values_type, "[null, null, 2, 4, 4]")); + auto needles_type = run_end_encoded(int16(), int32()); + ASSERT_OK_AND_ASSIGN(auto ree_needles, + REEFromJSON(needles_type, "[null, null, 1, 4, 4, null, 8]")); + + ASSERT_OK_AND_ASSIGN(auto result, + SearchSorted(Datum(ree_values), Datum(ree_needles), + SearchSortedOptions(SearchSortedOptions::Left))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), "[null, null, 2, 3, 3, null, 5]"), + *result.make_array()); +} + +TEST(SearchSorted, RunEndEncodedNeedlesWithNullRuns) { + auto values = ArrayFromJSON(int32(), "[1, 1, 3, 5, 8]"); + auto needles_type = run_end_encoded(int32(), int32()); + ASSERT_OK_AND_ASSIGN( + auto ree_needles, + REEFromJSON(needles_type, "[null, null, 0, 0, 0, 1, 1, 4, 4, 4, null, 9, 9]")); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(ree_needles), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(ree_needles), + SearchSortedOptions(SearchSortedOptions::Right))); + + AssertArraysEqual( + *ArrayFromJSON(uint64(), "[null, null, 0, 0, 0, 0, 0, 3, 3, 3, null, 5, 5]"), + *left.make_array()); + AssertArraysEqual( + *ArrayFromJSON(uint64(), "[null, null, 0, 0, 0, 2, 2, 3, 3, 3, null, 5, 5]"), + *right.make_array()); +} + +TEST(SearchSorted, RunEndEncodedAllNullValues) { + auto values_type = run_end_encoded(int16(), int32()); + ASSERT_OK_AND_ASSIGN(auto ree_values, + REEFromJSON(values_type, "[null, null, null, null]")); + auto needles = ArrayFromJSON(int32(), "[null, 1, 8]"); + + ASSERT_OK_AND_ASSIGN(auto result, + SearchSorted(Datum(ree_values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), "[null, 4, 4]"), *result.make_array()); +} + +TEST(SearchSorted, RejectMismatchedTypes) { + auto values = ArrayFromJSON(int32(), "[1, 2, 3]"); + auto needles = ArrayFromJSON(int64(), "[2]"); + + ASSERT_RAISES(TypeError, SearchSorted(Datum(values), Datum(needles))); +} + +TEST(SearchSorted, RunEndEncodedValues) { + auto values_type = run_end_encoded(int16(), int32()); + ASSERT_OK_AND_ASSIGN(auto ree_values, REEFromJSON(values_type, "[1, 1, 1, 3, 3, 5]")); + auto needles = ArrayFromJSON(int32(), "[0, 1, 2, 3, 4, 5, 6]"); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(ree_values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(ree_values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Right))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), "[0, 0, 3, 3, 5, 5, 6]"), + *left.make_array()); + AssertArraysEqual(*ArrayFromJSON(uint64(), "[0, 3, 3, 5, 5, 6, 6]"), + *right.make_array()); +} + +TEST(SearchSorted, RunEndEncodedNeedles) { + auto values = ArrayFromJSON(int32(), "[1, 1, 3, 5, 8]"); + auto needles_type = run_end_encoded(int32(), int32()); + ASSERT_OK_AND_ASSIGN(auto ree_needles, + REEFromJSON(needles_type, "[0, 0, 1, 1, 4, 4, 9]")); + + ASSERT_OK_AND_ASSIGN(auto result, + SearchSorted(Datum(values), Datum(ree_needles), + SearchSortedOptions(SearchSortedOptions::Right))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), "[0, 0, 2, 2, 3, 3, 5]"), + *result.make_array()); +} + +TEST(SearchSorted, SlicedRunEndEncodedValues) { + auto values_type = run_end_encoded(int32(), int32()); + ASSERT_OK_AND_ASSIGN(auto ree_values, + REEFromJSON(values_type, "[0, 0, 1, 1, 1, 4, 4, 9]")); + auto sliced = ree_values->Slice(2, 5); + auto needles = ArrayFromJSON(int32(), "[0, 1, 2, 4, 9]"); + + ASSERT_OK_AND_ASSIGN(auto result, + SearchSorted(Datum(sliced), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), "[0, 0, 3, 3, 5]"), *result.make_array()); +} + +TEST(SearchSorted, BinaryValues) { + auto values = ArrayFromJSON(utf8(), R"(["aa", "bb", "bb", "cc"])"); + auto needles = ArrayFromJSON(utf8(), R"(["a", "bb", "bc", "z"])"); + + ASSERT_OK_AND_ASSIGN(auto result, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + + AssertArraysEqual(*ArrayFromJSON(uint64(), "[0, 1, 3, 4]"), *result.make_array()); +} + Review Comment: Can we add tests for chunked arrays? (and, if not currently supported, support them :-)) ########## cpp/src/arrow/compute/kernels/vector_search_sorted_test.cc: ########## @@ -0,0 +1,409 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <memory> +#include <string> +#include <vector> + +#include <gtest/gtest.h> + +#include "arrow/compute/api.h" +#include "arrow/compute/kernels/test_util_internal.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +using internal::checked_cast; + +namespace compute { +namespace { + +Result<std::shared_ptr<Array>> REEFromJSON(const std::shared_ptr<DataType>& ree_type, + const std::string& json) { + auto ree_type_ptr = checked_cast<const RunEndEncodedType*>(ree_type.get()); + auto array = ArrayFromJSON(ree_type_ptr->value_type(), json); + ARROW_ASSIGN_OR_RAISE( + auto datum, RunEndEncode(array, RunEndEncodeOptions{ree_type_ptr->run_end_type()})); + return datum.make_array(); +} + +void CheckSimpleSearchSorted(const std::shared_ptr<DataType>& type, + const std::string& values_json, + const std::string& needles_json, + const std::string& expected_left_json, + const std::string& expected_right_json) { + auto values = ArrayFromJSON(type, values_json); + auto needles = ArrayFromJSON(type, needles_json); + + ASSERT_OK_AND_ASSIGN(auto left, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Left))); + ASSERT_OK_AND_ASSIGN(auto right, + SearchSorted(Datum(values), Datum(needles), + SearchSortedOptions(SearchSortedOptions::Right))); Review Comment: Let's call `ValidateFull()` on both results? ########## cpp/src/arrow/compute/kernels/vector_search_sorted_benchmark.cc: ########## @@ -0,0 +1,382 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include <algorithm> +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/compute/api_vector.h" +#include "arrow/datum.h" +#include "arrow/scalar.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/util/benchmark_util.h" + +namespace arrow { +namespace compute { + +constexpr auto kSeed = 0x5EA4C42; +constexpr int64_t kNeedleToValueRatio = 4; +constexpr int64_t kValuesRunLength = 16; +constexpr int64_t kNeedlesRunLength = 8; +constexpr int32_t kStringMinLength = 8; +constexpr int32_t kStringMaxLength = 24; + +void SetSearchSortedQuickArgs(benchmark::internal::Benchmark* bench) { + bench->Unit(benchmark::kMicrosecond); + for (const auto size : std::vector<int64_t>{kL1Size, kL2Size}) { + bench->Arg(size); + } +} + +void SetSearchSortedArgs(benchmark::internal::Benchmark* bench) { + bench->Unit(benchmark::kMicrosecond); + for (const auto size : kMemorySizes) { Review Comment: Which essentially means that all benchmarks become "quick" as per the definition of quick here :) ########## cpp/src/arrow/compute/kernels/vector_search_sorted_benchmark.cc: ########## @@ -0,0 +1,382 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include <algorithm> +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/compute/api_vector.h" +#include "arrow/datum.h" +#include "arrow/scalar.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/util/benchmark_util.h" + +namespace arrow { +namespace compute { + +constexpr auto kSeed = 0x5EA4C42; +constexpr int64_t kNeedleToValueRatio = 4; +constexpr int64_t kValuesRunLength = 16; +constexpr int64_t kNeedlesRunLength = 8; +constexpr int32_t kStringMinLength = 8; +constexpr int32_t kStringMaxLength = 24; + +void SetSearchSortedQuickArgs(benchmark::internal::Benchmark* bench) { + bench->Unit(benchmark::kMicrosecond); + for (const auto size : std::vector<int64_t>{kL1Size, kL2Size}) { + bench->Arg(size); + } +} + +void SetSearchSortedArgs(benchmark::internal::Benchmark* bench) { + bench->Unit(benchmark::kMicrosecond); + for (const auto size : kMemorySizes) { Review Comment: Unfortunately the largest size produces rather slow benchmark iterations, can we just keep `{kL1Size, kL2Size}}`? ########## cpp/src/arrow/compute/kernels/vector_search_sorted_benchmark.cc: ########## @@ -0,0 +1,382 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include <algorithm> +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/compute/api_vector.h" +#include "arrow/datum.h" +#include "arrow/scalar.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/util/benchmark_util.h" + +namespace arrow { +namespace compute { + +constexpr auto kSeed = 0x5EA4C42; +constexpr int64_t kNeedleToValueRatio = 4; +constexpr int64_t kValuesRunLength = 16; +constexpr int64_t kNeedlesRunLength = 8; +constexpr int32_t kStringMinLength = 8; +constexpr int32_t kStringMaxLength = 24; + +void SetSearchSortedQuickArgs(benchmark::internal::Benchmark* bench) { + bench->Unit(benchmark::kMicrosecond); + for (const auto size : std::vector<int64_t>{kL1Size, kL2Size}) { + bench->Arg(size); + } +} + +void SetSearchSortedArgs(benchmark::internal::Benchmark* bench) { + bench->Unit(benchmark::kMicrosecond); + for (const auto size : kMemorySizes) { + bench->Arg(size); + } +} + +int64_t Int64LengthFromBytes(int64_t size_bytes) { + return std::max<int64_t>(1, size_bytes / static_cast<int64_t>(sizeof(int64_t))); +} + +int64_t NeedleLengthFromBytes(int64_t size_bytes) { + return std::max<int64_t>(1, Int64LengthFromBytes(size_bytes) / kNeedleToValueRatio); +} + +int64_t StringLengthFromBytes(int64_t size_bytes) { + const int64_t average_length = (kStringMinLength + kStringMaxLength) / 2; + return std::max<int64_t>(1, size_bytes / average_length); +} + +std::shared_ptr<Int64Array> BuildSortedInt64Values(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed); + const auto length = Int64LengthFromBytes(size_bytes); + const auto max_value = std::max<int64_t>(length / 8, 1); + + auto values = + std::static_pointer_cast<Int64Array>(rand.Int64(length, 0, max_value, 0.0)); + std::vector<int64_t> data(values->raw_values(), + values->raw_values() + values->length()); + std::ranges::sort(data); + + Int64Builder builder; + ABORT_NOT_OK(builder.AppendValues(data)); + return std::static_pointer_cast<Int64Array>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<Int64Array> BuildInt64Needles(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 1); + const auto length = NeedleLengthFromBytes(size_bytes); + const auto max_value = std::max<int64_t>(Int64LengthFromBytes(size_bytes) / 8, 1); + return std::static_pointer_cast<Int64Array>(rand.Int64(length, 0, max_value, 0.0)); +} + +std::shared_ptr<StringArray> BuildSortedStringValues(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 2); + const auto length = StringLengthFromBytes(size_bytes); + auto values = std::static_pointer_cast<StringArray>( + rand.String(length, kStringMinLength, kStringMaxLength, 0.0)); + + std::vector<std::string> data; + data.reserve(static_cast<size_t>(values->length())); + for (int64_t index = 0; index < values->length(); ++index) { + data.push_back(values->GetString(index)); + } + std::ranges::sort(data); + + StringBuilder builder; + ABORT_NOT_OK(builder.AppendValues(data)); + return std::static_pointer_cast<StringArray>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<StringArray> BuildStringNeedles(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 3); + const auto length = std::max<int64_t>(1, StringLengthFromBytes(size_bytes) / 4); + return std::static_pointer_cast<StringArray>( + rand.String(length, kStringMinLength, kStringMaxLength, 0.0)); +} + +std::shared_ptr<BinaryArray> BuildSortedBinaryValues(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 4); + const auto length = StringLengthFromBytes(size_bytes); + const auto unique = std::max<int64_t>(1, length / 8); + auto values = std::static_pointer_cast<BinaryArray>( + rand.BinaryWithRepeats(length, unique, kStringMinLength, kStringMaxLength, 0.0)); + + std::vector<std::string> data; + data.reserve(static_cast<size_t>(values->length())); + for (int64_t index = 0; index < values->length(); ++index) { + data.emplace_back(values->GetView(index)); + } + std::ranges::sort(data); + + BinaryBuilder builder; + ABORT_NOT_OK(builder.AppendValues(data)); + return std::static_pointer_cast<BinaryArray>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<BinaryArray> BuildBinaryNeedles(int64_t size_bytes) { Review Comment: I don't think it's worth benchmarking both string and binary, as they are expected to perform similarly. ########## cpp/src/arrow/compute/kernels/vector_search_sorted_benchmark.cc: ########## @@ -0,0 +1,382 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include <algorithm> +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/compute/api_vector.h" +#include "arrow/datum.h" +#include "arrow/scalar.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/util/benchmark_util.h" + +namespace arrow { +namespace compute { + +constexpr auto kSeed = 0x5EA4C42; +constexpr int64_t kNeedleToValueRatio = 4; +constexpr int64_t kValuesRunLength = 16; +constexpr int64_t kNeedlesRunLength = 8; +constexpr int32_t kStringMinLength = 8; +constexpr int32_t kStringMaxLength = 24; + +void SetSearchSortedQuickArgs(benchmark::internal::Benchmark* bench) { + bench->Unit(benchmark::kMicrosecond); + for (const auto size : std::vector<int64_t>{kL1Size, kL2Size}) { + bench->Arg(size); + } +} + +void SetSearchSortedArgs(benchmark::internal::Benchmark* bench) { + bench->Unit(benchmark::kMicrosecond); + for (const auto size : kMemorySizes) { + bench->Arg(size); + } +} + +int64_t Int64LengthFromBytes(int64_t size_bytes) { + return std::max<int64_t>(1, size_bytes / static_cast<int64_t>(sizeof(int64_t))); +} + +int64_t NeedleLengthFromBytes(int64_t size_bytes) { + return std::max<int64_t>(1, Int64LengthFromBytes(size_bytes) / kNeedleToValueRatio); +} + +int64_t StringLengthFromBytes(int64_t size_bytes) { + const int64_t average_length = (kStringMinLength + kStringMaxLength) / 2; + return std::max<int64_t>(1, size_bytes / average_length); +} + +std::shared_ptr<Int64Array> BuildSortedInt64Values(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed); + const auto length = Int64LengthFromBytes(size_bytes); + const auto max_value = std::max<int64_t>(length / 8, 1); + + auto values = + std::static_pointer_cast<Int64Array>(rand.Int64(length, 0, max_value, 0.0)); + std::vector<int64_t> data(values->raw_values(), + values->raw_values() + values->length()); + std::ranges::sort(data); + + Int64Builder builder; + ABORT_NOT_OK(builder.AppendValues(data)); + return std::static_pointer_cast<Int64Array>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<Int64Array> BuildInt64Needles(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 1); + const auto length = NeedleLengthFromBytes(size_bytes); + const auto max_value = std::max<int64_t>(Int64LengthFromBytes(size_bytes) / 8, 1); + return std::static_pointer_cast<Int64Array>(rand.Int64(length, 0, max_value, 0.0)); +} + +std::shared_ptr<StringArray> BuildSortedStringValues(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 2); + const auto length = StringLengthFromBytes(size_bytes); + auto values = std::static_pointer_cast<StringArray>( + rand.String(length, kStringMinLength, kStringMaxLength, 0.0)); + + std::vector<std::string> data; + data.reserve(static_cast<size_t>(values->length())); + for (int64_t index = 0; index < values->length(); ++index) { + data.push_back(values->GetString(index)); + } + std::ranges::sort(data); + + StringBuilder builder; + ABORT_NOT_OK(builder.AppendValues(data)); + return std::static_pointer_cast<StringArray>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<StringArray> BuildStringNeedles(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 3); + const auto length = std::max<int64_t>(1, StringLengthFromBytes(size_bytes) / 4); + return std::static_pointer_cast<StringArray>( + rand.String(length, kStringMinLength, kStringMaxLength, 0.0)); +} + +std::shared_ptr<BinaryArray> BuildSortedBinaryValues(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 4); + const auto length = StringLengthFromBytes(size_bytes); + const auto unique = std::max<int64_t>(1, length / 8); + auto values = std::static_pointer_cast<BinaryArray>( + rand.BinaryWithRepeats(length, unique, kStringMinLength, kStringMaxLength, 0.0)); + + std::vector<std::string> data; + data.reserve(static_cast<size_t>(values->length())); + for (int64_t index = 0; index < values->length(); ++index) { + data.emplace_back(values->GetView(index)); + } + std::ranges::sort(data); + + BinaryBuilder builder; + ABORT_NOT_OK(builder.AppendValues(data)); + return std::static_pointer_cast<BinaryArray>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<BinaryArray> BuildBinaryNeedles(int64_t size_bytes) { + random::RandomArrayGenerator rand(kSeed + 5); + const auto length = std::max<int64_t>(1, StringLengthFromBytes(size_bytes) / 4); + const auto unique = std::max<int64_t>(1, length / 2); + return std::static_pointer_cast<BinaryArray>( + rand.BinaryWithRepeats(length, unique, kStringMinLength, kStringMaxLength, 0.0)); +} + +std::shared_ptr<Int64Array> BuildRunHeavyInt64Values(int64_t logical_length, + int64_t run_length) { + Int64Builder builder; + ABORT_NOT_OK(builder.Reserve(logical_length)); + for (int64_t index = 0; index < logical_length; ++index) { + builder.UnsafeAppend(index / run_length); + } + return std::static_pointer_cast<Int64Array>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<Int64Array> BuildRunHeavyInt64NeedlesWithNullRuns(int64_t logical_length, + int64_t run_length) { + std::vector<int64_t> data(static_cast<size_t>(logical_length), 0); + std::vector<bool> is_valid(static_cast<size_t>(logical_length), true); + + for (int64_t index = 0; index < logical_length; ++index) { + const int64_t run_index = index / run_length; + if (run_index % 4 == 0) { + is_valid[static_cast<size_t>(index)] = false; + continue; + } + data[static_cast<size_t>(index)] = run_index / 2; + } + + Int64Builder builder; + ABORT_NOT_OK(builder.AppendValues(data, is_valid)); + return std::static_pointer_cast<Int64Array>(builder.Finish().ValueOrDie()); +} + +std::shared_ptr<Int64Array> BuildInt64NeedlesWithNullRuns(int64_t size_bytes, + int64_t run_length) { + return BuildRunHeavyInt64NeedlesWithNullRuns(NeedleLengthFromBytes(size_bytes), + run_length); +} + +std::shared_ptr<Array> BuildRunEndEncodedInt64Values(int64_t size_bytes, + int64_t run_length) { + auto values = BuildRunHeavyInt64Values(Int64LengthFromBytes(size_bytes), run_length); + return RunEndEncode(Datum(values), RunEndEncodeOptions{int32()}) + .ValueOrDie() + .make_array(); +} + +std::shared_ptr<Array> BuildRunEndEncodedInt64Needles(int64_t size_bytes, + int64_t run_length) { + auto needles = BuildRunHeavyInt64Values(NeedleLengthFromBytes(size_bytes), run_length); + return RunEndEncode(Datum(needles), RunEndEncodeOptions{int32()}) + .ValueOrDie() + .make_array(); +} + +std::shared_ptr<Array> BuildRunEndEncodedInt64NeedlesWithNullRuns(int64_t size_bytes, + int64_t run_length) { + auto needles = BuildRunHeavyInt64NeedlesWithNullRuns(NeedleLengthFromBytes(size_bytes), + run_length); + return RunEndEncode(Datum(needles), RunEndEncodeOptions{int32()}) + .ValueOrDie() + .make_array(); +} + +void SetBenchmarkCounters(benchmark::State& state, const Datum& values, + const Datum& needles) { + const auto values_length = values.length(); + const auto needles_length = needles.length(); + state.counters["values_length"] = static_cast<double>(values_length); + state.counters["needles_length"] = static_cast<double>(needles_length); + state.SetItemsProcessed(state.iterations() * needles_length); +} + +void RunSearchSortedBenchmark(benchmark::State& state, const Datum& values, + const Datum& needles, SearchSortedOptions::Side side) { + const SearchSortedOptions options(side); + for (auto _ : state) { + auto result = SearchSorted(values, needles, options); + ABORT_NOT_OK(result.status()); + benchmark::DoNotOptimize(result.ValueUnsafe()); + } + SetBenchmarkCounters(state, values, needles); +} + +static void BM_SearchSortedInt64ArrayNeedles(benchmark::State& state, + SearchSortedOptions::Side side) { + const Datum values(BuildSortedInt64Values(state.range(0))); + const Datum needles(BuildInt64Needles(state.range(0))); + RunSearchSortedBenchmark(state, values, needles, side); +} + +static void BM_SearchSortedInt64ScalarNeedle(benchmark::State& state, + SearchSortedOptions::Side side) { + const auto values_array = BuildSortedInt64Values(state.range(0)); + const auto scalar_index = values_array->length() / 2; + const Datum values(values_array); + const Datum needles(std::make_shared<Int64Scalar>(values_array->Value(scalar_index))); + RunSearchSortedBenchmark(state, values, needles, side); +} + +static void BM_SearchSortedRunEndEncodedValues(benchmark::State& state, + SearchSortedOptions::Side side) { + const Datum values(BuildRunEndEncodedInt64Values(state.range(0), kValuesRunLength)); + const Datum needles(BuildInt64Needles(state.range(0))); + RunSearchSortedBenchmark(state, values, needles, side); +} + +static void BM_SearchSortedRunEndEncodedValuesAndNeedles(benchmark::State& state, + SearchSortedOptions::Side side) { + const Datum values(BuildRunEndEncodedInt64Values(state.range(0), kValuesRunLength)); + const Datum needles(BuildRunEndEncodedInt64Needles(state.range(0), kNeedlesRunLength)); + RunSearchSortedBenchmark(state, values, needles, side); +} + +static void BM_SearchSortedInt64NeedlesWithNullRuns(benchmark::State& state, + SearchSortedOptions::Side side) { + const Datum values(BuildSortedInt64Values(state.range(0))); + const Datum needles(BuildInt64NeedlesWithNullRuns(state.range(0), kNeedlesRunLength)); + RunSearchSortedBenchmark(state, values, needles, side); +} + +static void BM_SearchSortedRunEndEncodedNeedlesWithNullRuns( + benchmark::State& state, SearchSortedOptions::Side side) { + const Datum values(BuildRunEndEncodedInt64Values(state.range(0), kValuesRunLength)); + const Datum needles( + BuildRunEndEncodedInt64NeedlesWithNullRuns(state.range(0), kNeedlesRunLength)); + RunSearchSortedBenchmark(state, values, needles, side); +} + +static void BM_SearchSortedStringArrayNeedles(benchmark::State& state, + SearchSortedOptions::Side side) { + const Datum values(BuildSortedStringValues(state.range(0))); + const Datum needles(BuildStringNeedles(state.range(0))); + RunSearchSortedBenchmark(state, values, needles, side); +} + +static void BM_SearchSortedStringScalarNeedle(benchmark::State& state, + SearchSortedOptions::Side side) { + const auto values_array = BuildSortedStringValues(state.range(0)); + const auto scalar_index = values_array->length() / 2; + const Datum values(values_array); + const Datum needles( + std::make_shared<StringScalar>(values_array->GetString(scalar_index))); + RunSearchSortedBenchmark(state, values, needles, side); +} + +static void BM_SearchSortedBinaryScalarNeedle(benchmark::State& state, + SearchSortedOptions::Side side) { + const auto values_array = BuildSortedBinaryValues(state.range(0)); + const auto scalar_index = values_array->length() / 2; + const Datum values(values_array); + const Datum needles( + std::make_shared<BinaryScalar>(std::string(values_array->GetView(scalar_index)))); + RunSearchSortedBenchmark(state, values, needles, side); +} + +static void BM_SearchSortedInt64ArrayNeedlesQuick(benchmark::State& state, + SearchSortedOptions::Side side) { + BM_SearchSortedInt64ArrayNeedles(state, side); +} + +static void BM_SearchSortedRunEndEncodedValuesAndNeedlesQuick( + benchmark::State& state, SearchSortedOptions::Side side) { + BM_SearchSortedRunEndEncodedValuesAndNeedles(state, side); +} + +static void BM_SearchSortedInt64NeedlesWithNullRunsQuick(benchmark::State& state, + SearchSortedOptions::Side side) { + BM_SearchSortedInt64NeedlesWithNullRuns(state, side); +} + +static void BM_SearchSortedRunEndEncodedNeedlesWithNullRunsQuick( + benchmark::State& state, SearchSortedOptions::Side side) { + BM_SearchSortedRunEndEncodedNeedlesWithNullRuns(state, side); +} + +// Primitive-array and REE cases are the main baselines for the kernel TODOs around +// SIMD batched search, vectorized REE writeback, and future parallel needle traversal. + +BENCHMARK_CAPTURE(BM_SearchSortedInt64ArrayNeedles, left, SearchSortedOptions::Left) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedInt64ArrayNeedles, right, SearchSortedOptions::Right) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedInt64ScalarNeedle, left, SearchSortedOptions::Left) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedInt64ScalarNeedle, right, SearchSortedOptions::Right) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedRunEndEncodedValues, left, SearchSortedOptions::Left) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedRunEndEncodedValues, right, SearchSortedOptions::Right) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedRunEndEncodedValuesAndNeedles, left, + SearchSortedOptions::Left) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedRunEndEncodedValuesAndNeedles, right, + SearchSortedOptions::Right) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedInt64NeedlesWithNullRuns, left, + SearchSortedOptions::Left) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedInt64NeedlesWithNullRuns, right, + SearchSortedOptions::Right) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedRunEndEncodedNeedlesWithNullRuns, left, + SearchSortedOptions::Left) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedRunEndEncodedNeedlesWithNullRuns, right, + SearchSortedOptions::Right) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedStringArrayNeedles, left, SearchSortedOptions::Left) + ->Apply(SetSearchSortedArgs); +BENCHMARK_CAPTURE(BM_SearchSortedStringArrayNeedles, right, SearchSortedOptions::Right) + ->Apply(SetSearchSortedArgs); + +// String and binary scalar cases specifically exercise the direct scalar fast path that +// avoids boxing a scalar needle into a temporary one-element array. Review Comment: Even if we wanted to benchmark this (which I doubt we do), we would only need a single benchmark IMHO. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
