[
https://issues.apache.org/jira/browse/ARROW-1844?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16270968#comment-16270968
]
ASF GitHub Bot commented on ARROW-1844:
---------------------------------------
wesm closed pull request #1370: ARROW-1844: [C++] Add initial Unique benchmarks
for int64, variable-length strings
URL: https://github.com/apache/arrow/pull/1370
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/cpp/src/arrow/compute/compute-benchmark.cc
b/cpp/src/arrow/compute/compute-benchmark.cc
index 974fffcd6..aa7d899c8 100644
--- a/cpp/src/arrow/compute/compute-benchmark.cc
+++ b/cpp/src/arrow/compute/compute-benchmark.cc
@@ -81,8 +81,131 @@ static void BM_BuildStringDictionary(
state.SetBytesProcessed(state.iterations() * total_bytes);
}
-BENCHMARK(BM_BuildDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildStringDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
+template <typename Type>
+struct HashParams {
+ using T = typename Type::c_type;
+
+ double null_percent;
+
+ void GenerateTestData(const int64_t length, const int64_t num_unique,
+ std::shared_ptr<Array>* arr) const {
+ std::vector<int64_t> draws;
+ std::vector<T> values;
+ std::vector<bool> is_valid;
+ test::randint<int64_t>(length, 0, num_unique, &draws);
+ for (int64_t draw : draws) {
+ values.push_back(draw);
+ }
+
+ if (this->null_percent > 0) {
+ test::random_is_valid(length, this->null_percent, &is_valid);
+ ArrayFromVector<Type, T>(is_valid, values, arr);
+ } else {
+ ArrayFromVector<Type, T>(values, arr);
+ }
+ }
+
+ int64_t GetBytesProcessed(int64_t length) const { return length * sizeof(T);
}
+};
+
+template <>
+struct HashParams<StringType> {
+ double null_percent;
+ int32_t byte_width;
+ void GenerateTestData(const int64_t length, const int64_t num_unique,
+ std::shared_ptr<Array>* arr) const {
+ std::vector<int64_t> draws;
+ test::randint<int64_t>(length, 0, num_unique, &draws);
+
+ const int64_t total_bytes = this->byte_width * num_unique;
+ std::vector<uint8_t> uniques(total_bytes);
+ const uint32_t seed = 0;
+ test::random_bytes(total_bytes, seed, uniques.data());
+
+ std::vector<bool> is_valid;
+ if (this->null_percent > 0) {
+ test::random_is_valid(length, this->null_percent, &is_valid);
+ }
+
+ StringBuilder builder;
+ for (int64_t i = 0; i < length; ++i) {
+ if (this->null_percent == 0 || is_valid[i]) {
+ ABORT_NOT_OK(builder.Append(uniques.data() + this->byte_width *
draws[i],
+ this->byte_width));
+ } else {
+ ABORT_NOT_OK(builder.AppendNull());
+ }
+ }
+ ABORT_NOT_OK(builder.Finish(arr));
+ }
+
+ int64_t GetBytesProcessed(int64_t length) const { return length *
byte_width; }
+};
+
+template <typename ParamType>
+void BenchUnique(benchmark::State& state, const ParamType& params, int64_t
length,
+ int64_t num_unique) {
+ std::shared_ptr<Array> arr;
+ params.GenerateTestData(length, num_unique, &arr);
+
+ FunctionContext ctx;
+ while (state.KeepRunning()) {
+ std::shared_ptr<Array> out;
+ ABORT_NOT_OK(Unique(&ctx, Datum(arr), &out));
+ }
+ state.SetBytesProcessed(state.iterations() *
params.GetBytesProcessed(length));
+}
+
+template <typename ParamType>
+void BenchDictionaryEncode(benchmark::State& state, const ParamType& params,
+ int64_t length, int64_t num_unique) {
+ std::shared_ptr<Array> arr;
+ params.GenerateTestData(length, num_unique, &arr);
+
+ FunctionContext ctx;
+ while (state.KeepRunning()) {
+ Datum out;
+ ABORT_NOT_OK(DictionaryEncode(&ctx, Datum(arr), &out));
+ }
+ state.SetBytesProcessed(state.iterations() *
params.GetBytesProcessed(length));
+}
+
+static void BM_UniqueInt64NoNulls(benchmark::State& state) {
+ BenchUnique(state, HashParams<Int64Type>{0}, state.range(0), state.range(1));
+}
+
+static void BM_UniqueInt64WithNulls(benchmark::State& state) {
+ BenchUnique(state, HashParams<Int64Type>{0.05}, state.range(0),
state.range(1));
+}
+
+static void BM_UniqueString10bytes(benchmark::State& state) {
+ // Byte strings with 10 bytes each
+ BenchUnique(state, HashParams<StringType>{0.05, 10}, state.range(0),
state.range(1));
+}
+
+static void BM_UniqueString100bytes(benchmark::State& state) {
+ // Byte strings with 100 bytes each
+ BenchUnique(state, HashParams<StringType>{0.05, 100}, state.range(0),
state.range(1));
+}
+
+BENCHMARK(BM_BuildDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_BuildStringDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+
+constexpr int64_t kHashBenchmarkLength = 1 << 24;
+
+#define ADD_HASH_ARGS(WHAT) \
+ WHAT->Args({kHashBenchmarkLength, 50}) \
+ ->Args({kHashBenchmarkLength, 1 << 10}) \
+ ->Args({kHashBenchmarkLength, 10 * 1 << 10}) \
+ ->Args({kHashBenchmarkLength, 1 << 20}) \
+ ->MinTime(1.0) \
+ ->Unit(benchmark::kMicrosecond) \
+ ->UseRealTime()
+
+ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64NoNulls));
+ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64WithNulls));
+ADD_HASH_ARGS(BENCHMARK(BM_UniqueString10bytes));
+ADD_HASH_ARGS(BENCHMARK(BM_UniqueString100bytes));
} // namespace compute
} // namespace arrow
diff --git a/cpp/src/arrow/compute/compute-test.cc
b/cpp/src/arrow/compute/compute-test.cc
index c73bfa309..84af8f7c6 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -869,8 +869,8 @@ TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) {
return;
}
- const int64_t kTotalValues = 10000;
- const int64_t kRepeats = 10;
+ const int64_t kTotalValues = 1000000;
+ const int64_t kRepeats = 5;
vector<T> values;
vector<T> uniques;
diff --git a/cpp/src/arrow/compute/kernels/hash.cc
b/cpp/src/arrow/compute/kernels/hash.cc
index 66c907369..750f1d36a 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -43,7 +43,7 @@ typedef int32_t hash_slot_t;
static constexpr hash_slot_t kHashSlotEmpty =
std::numeric_limits<int32_t>::max();
// The maximum load factor for the hash table before resizing.
-static constexpr double kMaxHashTableLoad = 0.7;
+static constexpr double kMaxHashTableLoad = 0.5;
enum class SIMDMode : char { NOSIMD, SSE4, AVX2 };
@@ -260,7 +260,7 @@ struct HashDictionary<Type, enable_if_has_c_type<Type>> {
COMPUTE_HASH;
\
while (kHashSlotEmpty != new_hash_slots[j]) {
\
++j;
\
- if (ARROW_PREDICT_FALSE(j == hash_table_size_)) {
\
+ if (ARROW_PREDICT_FALSE(j == new_size)) {
\
j = 0;
\
}
\
}
\
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [C++] Basic benchmark suite for hash kernels
> --------------------------------------------
>
> Key: ARROW-1844
> URL: https://issues.apache.org/jira/browse/ARROW-1844
> Project: Apache Arrow
> Issue Type: New Feature
> Components: C++
> Reporter: Wes McKinney
> Assignee: Wes McKinney
> Labels: pull-request-available
> Fix For: 0.8.0
>
>
> * Integers, small cardinality and large cardinality
> * Short strings, small/large cardinality
> * Long strings, small/large cardinality
> These benchmarks will enable us to refactor without fear, and to experiment
> with faster hash functions
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)