pitrou commented on code in PR #13487:
URL: https://github.com/apache/arrow/pull/13487#discussion_r964908299
##########
cpp/src/arrow/compute/api_scalar.cc:
##########
@@ -858,6 +858,12 @@ Result<Datum> MapLookup(const Datum& arg, MapLookupOptions
options, ExecContext*
return CallFunction("map_lookup", {arg}, &options, ctx);
}
+// ----------------------------------------------------------------------
+// Hash functions
+Result<Datum> FastHash64(const Datum& input_array, ExecContext* ctx) {
+ return CallFunction("fast_hash_64", {input_array}, ctx);
Review Comment:
The prefix "fast" doesn't add any useful information here, and you'll be
embarassed if you want to propose another faster hash function. Let's just call
it "hash_64"?
##########
cpp/src/arrow/util/hashing_benchmark.cc:
##########
@@ -111,13 +123,238 @@ static void HashLargeStrings(benchmark::State& state) {
// NOLINT non-const ref
BenchmarkStringHashing(state, values);
}
+static void HashIntegers32(benchmark::State& state) { // NOLINT non-const
reference
+ const std::vector<int32_t> values = MakeIntegers<int32_t>(10000);
+
+ while (state.KeepRunning()) {
+ hash_t total = 0;
+ for (const int32_t v : values) {
+ total += ScalarHelper<int32_t, 0>::ComputeHash(v);
+ total += ScalarHelper<int32_t, 1>::ComputeHash(v);
+ }
+ benchmark::DoNotOptimize(total);
+ }
+ state.SetBytesProcessed(2 * state.iterations() * values.size() *
sizeof(int32_t));
+ state.SetItemsProcessed(2 * state.iterations() * values.size());
+}
+
+static void HashIntegers64(benchmark::State& state) { // NOLINT non-const
reference
+ const std::vector<int64_t> values = MakeIntegers<int64_t>(10000);
+
+ while (state.KeepRunning()) {
+ hash_t total = 0;
+ for (const int64_t v : values) {
+ total += ScalarHelper<int64_t, 0>::ComputeHash(v);
+ total += ScalarHelper<int64_t, 1>::ComputeHash(v);
+ }
+ benchmark::DoNotOptimize(total);
+ }
+ state.SetBytesProcessed(2 * state.iterations() * values.size() *
sizeof(int64_t));
+ state.SetItemsProcessed(2 * state.iterations() * values.size());
+}
+
+static void KeyHashIntegers32(benchmark::State& state) { // NOLINT non-const
reference
+ auto test_vals = hashing_rng.Int32(10000, 0,
std::numeric_limits<int32_t>::max());
+
+ // initialize the stack allocator
+ util::TempVectorStack stack_memallocator;
+ ASSERT_OK(
+ stack_memallocator.Init(compute::default_exec_context()->memory_pool(),
+ 3 * sizeof(int32_t) *
util::MiniBatch::kMiniBatchLength));
+
+ // prepare the execution context for Hashing32
+ compute::LightContext hash_ctx;
+ hash_ctx.hardware_flags =
compute::default_exec_context()->cpu_info()->hardware_flags();
+ hash_ctx.stack = &stack_memallocator;
+
+ // allocate memory for results
+ ASSERT_OK_AND_ASSIGN(std::unique_ptr<Buffer> hash_buffer,
+ AllocateBuffer(test_vals->length() * sizeof(int32_t)));
+
+ // run the benchmark
+ while (state.KeepRunning()) {
+ // Prepare input data structure for propagation to hash function
+ ASSERT_OK_AND_ASSIGN(
+ compute::KeyColumnArray input_keycol,
+ compute::ColumnArrayFromArrayData(test_vals->data(), 0,
test_vals->length()));
+
+ compute::Hashing32::HashMultiColumn(
+ {input_keycol}, &hash_ctx,
+ reinterpret_cast<uint32_t*>(hash_buffer->mutable_data()));
+
+ // benchmark::DoNotOptimize(hash_buffer);
+ }
+
+ state.SetBytesProcessed(state.iterations() * test_vals->length() *
sizeof(int32_t));
+ state.SetItemsProcessed(state.iterations() * test_vals->length());
+}
+
+static void KeyHashIntegers64(benchmark::State& state) { // NOLINT non-const
reference
+ auto test_vals = hashing_rng.Int64(10000, 0,
std::numeric_limits<int64_t>::max());
+
+ // initialize the stack allocator
+ util::TempVectorStack stack_memallocator;
+ ASSERT_OK(
+ stack_memallocator.Init(compute::default_exec_context()->memory_pool(),
+ 3 * sizeof(int32_t) *
util::MiniBatch::kMiniBatchLength));
+
+ // prepare the execution context for Hashing32
+ compute::LightContext hash_ctx;
+ hash_ctx.hardware_flags =
compute::default_exec_context()->cpu_info()->hardware_flags();
+ hash_ctx.stack = &stack_memallocator;
+
+ // allocate memory for results
+ ASSERT_OK_AND_ASSIGN(std::unique_ptr<Buffer> hash_buffer,
+ AllocateBuffer(test_vals->length() * sizeof(int64_t)));
+
+ // run the benchmark
+ while (state.KeepRunning()) {
+ // Prepare input data structure for propagation to hash function
+ ASSERT_OK_AND_ASSIGN(
+ compute::KeyColumnArray input_keycol,
+ compute::ColumnArrayFromArrayData(test_vals->data(), 0,
test_vals->length()));
+
+ compute::Hashing64::HashMultiColumn(
+ {input_keycol}, &hash_ctx,
+ reinterpret_cast<uint64_t*>(hash_buffer->mutable_data()));
+
+ // benchmark::DoNotOptimize(hash_buffer);
+ }
+
+ state.SetBytesProcessed(state.iterations() * test_vals->length() *
sizeof(int64_t));
+ state.SetItemsProcessed(state.iterations() * test_vals->length());
+}
+
+static void FastHash64Int64(benchmark::State& state) { // NOLINT non-const
reference
+ auto test_vals = hashing_rng.Int64(10000, 0,
std::numeric_limits<int64_t>::max());
+
+ while (state.KeepRunning()) {
+ ASSERT_OK_AND_ASSIGN(Datum hash_result,
+ compute::CallFunction("fast_hash_64", {test_vals}));
+ benchmark::DoNotOptimize(hash_result);
+ }
+
+ state.SetBytesProcessed(state.iterations() * test_vals->length() *
sizeof(int64_t));
+ state.SetItemsProcessed(state.iterations() * test_vals->length());
+}
+
+static void FastHash64StructSmallStrings(
+ benchmark::State& state) { // NOLINT non-const reference
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<StructArray> values_array,
+ MakeStructArray(10000, 2, 20));
+
+ // 2nd column (index 1) is a string column, which has offset type of int32_t
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> values_second,
+ values_array->GetFlattenedField(1));
+ std::shared_ptr<StringArray> str_vals =
+ std::static_pointer_cast<StringArray>(values_second);
+ int32_t total_string_size = str_vals->total_values_length();
+
+ while (state.KeepRunning()) {
+ ASSERT_OK_AND_ASSIGN(Datum hash_result,
+ compute::CallFunction("fast_hash_64",
{values_array}));
+ benchmark::DoNotOptimize(hash_result);
+ }
+
+ state.SetBytesProcessed(state.iterations() *
+ ((values_array->length() * sizeof(int64_t)) +
+ (total_string_size) +
+ (values_array->length() * sizeof(int64_t))));
+ state.SetItemsProcessed(state.iterations() * 3 * values_array->length());
+}
+
+static void FastHash64StructMediumStrings(
+ benchmark::State& state) { // NOLINT non-const reference
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<StructArray> values_array,
+ MakeStructArray(10000, 20, 120));
+
+ // 2nd column (index 1) is a string column, which has offset type of int32_t
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> values_second,
+ values_array->GetFlattenedField(1));
+ std::shared_ptr<StringArray> str_vals =
+ std::static_pointer_cast<StringArray>(values_second);
+ int32_t total_string_size = str_vals->total_values_length();
+
+ while (state.KeepRunning()) {
+ ASSERT_OK_AND_ASSIGN(Datum hash_result,
+ compute::CallFunction("fast_hash_64",
{values_array}));
+ benchmark::DoNotOptimize(hash_result);
+ }
+
+ state.SetBytesProcessed(state.iterations() *
+ ((values_array->length() * sizeof(int64_t)) +
+ (total_string_size) +
+ (values_array->length() * sizeof(int64_t))));
+ state.SetItemsProcessed(state.iterations() * 3 * values_array->length());
+}
+
+static void FastHash64StructLargeStrings(
+ benchmark::State& state) { // NOLINT non-const reference
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<StructArray> values_array,
+ MakeStructArray(10000, 120, 2000));
+
+ // 2nd column (index 1) is a string column, which has offset type of int32_t
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> values_second,
+ values_array->GetFlattenedField(1));
+ std::shared_ptr<StringArray> str_vals =
+ std::static_pointer_cast<StringArray>(values_second);
+ int32_t total_string_size = str_vals->total_values_length();
+
+ while (state.KeepRunning()) {
+ ASSERT_OK_AND_ASSIGN(Datum hash_result,
+ compute::CallFunction("fast_hash_64",
{values_array}));
+ benchmark::DoNotOptimize(hash_result);
+ }
+
+ state.SetBytesProcessed(state.iterations() *
+ ((values_array->length() * sizeof(int64_t)) +
+ (total_string_size) +
+ (values_array->length() * sizeof(int64_t))));
+ state.SetItemsProcessed(state.iterations() * 3 * values_array->length());
+}
+
+static void FastHash64Map(benchmark::State& state) { // NOLINT non-const
reference
+ constexpr int64_t test_size = 10000;
+ auto test_keys = hashing_rng.String(test_size, 2, 20,
/*null_probability=*/0);
+ auto test_vals = hashing_rng.Int64(test_size, 0,
std::numeric_limits<int64_t>::max());
+ auto test_keyvals = hashing_rng.Map(test_keys, test_vals, test_size);
+
+ auto key_arr = std::static_pointer_cast<StringArray>(test_keys);
+ int32_t total_key_size = key_arr->total_values_length();
+ int32_t total_val_size = test_size * sizeof(int64_t);
+
+ while (state.KeepRunning()) {
+ ASSERT_OK_AND_ASSIGN(Datum hash_result,
+ compute::CallFunction("fast_hash_64",
{test_keyvals}));
+ benchmark::DoNotOptimize(hash_result);
+ }
+
+ state.SetBytesProcessed(state.iterations() * (total_key_size +
total_val_size));
+ state.SetItemsProcessed(state.iterations() * 2 * test_size);
+}
+
// ----------------------------------------------------------------------
// Benchmark declarations
-BENCHMARK(HashIntegers);
+// Directly uses "Hashing" hash functions from hashing.h (xxHash)
BENCHMARK(HashSmallStrings);
BENCHMARK(HashMediumStrings);
BENCHMARK(HashLargeStrings);
+BENCHMARK(HashIntegers32);
Review Comment:
So, you shouldn't benchmark compute functions outside the `arrow/compute`
directory (these benchmarks must be buildable with compute disabled). Meaning,
you probably need to create a separate
`src/arrow/compute/kernels/scalar_hash_benchmark.cc`.
##########
cpp/src/arrow/compute/kernels/scalar_hash_test.cc:
##########
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/chunked_array.h"
+#include "arrow/compute/api.h"
+#include "arrow/compute/kernels/test_util.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/matchers.h"
+#include "arrow/util/key_value_metadata.h"
+
+namespace arrow {
+namespace compute {
+
+TEST(TestScalarHash, FastHash64Primitive) {
+ for (auto input_dtype : {int32(), uint32(), int8(), uint8()}) {
+ auto input_arr = ArrayFromJSON(input_dtype, "[3, null, 2, 0, 127, 64]");
+
+ ASSERT_OK_AND_ASSIGN(Datum hash_result, CallFunction("fast_hash_64",
{input_arr}));
Review Comment:
I would like to see some basic validation of the result values in addition
to this.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]