This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new aa3328d  ARROW-1844: [C++] Add initial Unique benchmarks for int64, variable-length strings
aa3328d is described below

commit aa3328d53dc9aa0447097d0173990ff4ac41ce75
Author: Wes McKinney <wes.mckinney@twosigma.com>
AuthorDate: Wed Nov 29 10:43:20 2017 -0500

    ARROW-1844: [C++] Add initial Unique benchmarks for int64, variable-length strings
    
    I also fixed a bug this surfaced in the hash table resize (unit test coverage was not adequate)
    
    Now we have
    
    ```
    $ ./release/compute-benchmark
    Run on (8 X 4200.16 MHz CPU s)
    2017-11-28 18:33:53
    Benchmark                                                           Time       CPU Iterations
    -------------------------------------------------------------------------------------------------
    BM_BuildDictionary/min_time:1.000                                1352 us       1352 us       1038   2.88639GB/s
    BM_BuildStringDictionary/min_time:1.000                          3994 us       3994 us        351   75.5809MB/s
    BM_UniqueInt64NoNulls/16M/50/min_time:1.000/real_time           35814 us      35816 us         39   3.49023GB/s
    BM_UniqueInt64NoNulls/16M/1024/min_time:1.000/real_time        119656 us     119660 us         12   1069.73MB/s
    BM_UniqueInt64NoNulls/16M/10k/min_time:1.000/real_time         174924 us     174930 us          8   731.747MB/s
    BM_UniqueInt64NoNulls/16M/1024k/min_time:1.000/real_time       448425 us     448440 us          3   285.443MB/s
    BM_UniqueInt64WithNulls/16M/50/min_time:1.000/real_time         49511 us      49513 us         29   2.52468GB/s
    BM_UniqueInt64WithNulls/16M/1024/min_time:1.000/real_time      134519 us     134523 us         10   951.541MB/s
    BM_UniqueInt64WithNulls/16M/10k/min_time:1.000/real_time       191331 us     191336 us          7   668.999MB/s
    BM_UniqueInt64WithNulls/16M/1024k/min_time:1.000/real_time     533597 us     533613 us          3   239.882MB/s
    BM_UniqueString10bytes/16M/50/min_time:1.000/real_time         150731 us     150736 us          9    1061.5MB/s
    BM_UniqueString10bytes/16M/1024/min_time:1.000/real_time       256929 us     256938 us          5   622.739MB/s
    BM_UniqueString10bytes/16M/10k/min_time:1.000/real_time        414412 us     414426 us          3    386.09MB/s
    BM_UniqueString10bytes/16M/1024k/min_time:1.000/real_time     1744253 us    1744308 us          1   91.7298MB/s
    BM_UniqueString100bytes/16M/50/min_time:1.000/real_time        563890 us     563909 us          2   2.77093GB/s
    BM_UniqueString100bytes/16M/1024/min_time:1.000/real_time      704695 us     704720 us          2   2.21727GB/s
    BM_UniqueString100bytes/16M/10k/min_time:1.000/real_time       995685 us     995721 us          2   1.56927GB/s
    BM_UniqueString100bytes/16M/1024k/min_time:1.000/real_time    3584108 us    3584230 us          1   446.415MB/s
    ```
    
    We can also refactor the hash table implementations without worrying too much about whether we're making things slower
    
    Author: Wes McKinney <wes.mckinney@twosigma.com>
    
    Closes #1370 from wesm/ARROW-1844 and squashes the following commits:
    
    638f1a11 [Wes McKinney] Decrease resize load factor to 0.5
    2885c645 [Wes McKinney] Multiply bytes processed by state.iterations()
    f7b36194 [Wes McKinney] Add initial Unique benchmarks for int64, strings
---
 cpp/src/arrow/compute/compute-benchmark.cc | 127 ++++++++++++++++++++++++++++-
 cpp/src/arrow/compute/compute-test.cc      |   4 +-
 cpp/src/arrow/compute/kernels/hash.cc      |   4 +-
 3 files changed, 129 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/compute/compute-benchmark.cc b/cpp/src/arrow/compute/compute-benchmark.cc
index 974fffc..aa7d899 100644
--- a/cpp/src/arrow/compute/compute-benchmark.cc
+++ b/cpp/src/arrow/compute/compute-benchmark.cc
@@ -81,8 +81,131 @@ static void BM_BuildStringDictionary(
   state.SetBytesProcessed(state.iterations() * total_bytes);
 }
 
-BENCHMARK(BM_BuildDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildStringDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
+template <typename Type>
+struct HashParams {
+  using T = typename Type::c_type;
+
+  double null_percent;
+
+  void GenerateTestData(const int64_t length, const int64_t num_unique,
+                        std::shared_ptr<Array>* arr) const {
+    std::vector<int64_t> draws;
+    std::vector<T> values;
+    std::vector<bool> is_valid;
+    test::randint<int64_t>(length, 0, num_unique, &draws);
+    for (int64_t draw : draws) {
+      values.push_back(draw);
+    }
+
+    if (this->null_percent > 0) {
+      test::random_is_valid(length, this->null_percent, &is_valid);
+      ArrayFromVector<Type, T>(is_valid, values, arr);
+    } else {
+      ArrayFromVector<Type, T>(values, arr);
+    }
+  }
+
+  int64_t GetBytesProcessed(int64_t length) const { return length * sizeof(T); }
+};
+
+template <>
+struct HashParams<StringType> {
+  double null_percent;
+  int32_t byte_width;
+  void GenerateTestData(const int64_t length, const int64_t num_unique,
+                        std::shared_ptr<Array>* arr) const {
+    std::vector<int64_t> draws;
+    test::randint<int64_t>(length, 0, num_unique, &draws);
+
+    const int64_t total_bytes = this->byte_width * num_unique;
+    std::vector<uint8_t> uniques(total_bytes);
+    const uint32_t seed = 0;
+    test::random_bytes(total_bytes, seed, uniques.data());
+
+    std::vector<bool> is_valid;
+    if (this->null_percent > 0) {
+      test::random_is_valid(length, this->null_percent, &is_valid);
+    }
+
+    StringBuilder builder;
+    for (int64_t i = 0; i < length; ++i) {
+      if (this->null_percent == 0 || is_valid[i]) {
+        ABORT_NOT_OK(builder.Append(uniques.data() + this->byte_width * draws[i],
+                                    this->byte_width));
+      } else {
+        ABORT_NOT_OK(builder.AppendNull());
+      }
+    }
+    ABORT_NOT_OK(builder.Finish(arr));
+  }
+
+  int64_t GetBytesProcessed(int64_t length) const { return length * byte_width; }
+};
+
+template <typename ParamType>
+void BenchUnique(benchmark::State& state, const ParamType& params, int64_t length,
+                 int64_t num_unique) {
+  std::shared_ptr<Array> arr;
+  params.GenerateTestData(length, num_unique, &arr);
+
+  FunctionContext ctx;
+  while (state.KeepRunning()) {
+    std::shared_ptr<Array> out;
+    ABORT_NOT_OK(Unique(&ctx, Datum(arr), &out));
+  }
+  state.SetBytesProcessed(state.iterations() * params.GetBytesProcessed(length));
+}
+
+template <typename ParamType>
+void BenchDictionaryEncode(benchmark::State& state, const ParamType& params,
+                           int64_t length, int64_t num_unique) {
+  std::shared_ptr<Array> arr;
+  params.GenerateTestData(length, num_unique, &arr);
+
+  FunctionContext ctx;
+  while (state.KeepRunning()) {
+    Datum out;
+    ABORT_NOT_OK(DictionaryEncode(&ctx, Datum(arr), &out));
+  }
+  state.SetBytesProcessed(state.iterations() * params.GetBytesProcessed(length));
+}
+
+static void BM_UniqueInt64NoNulls(benchmark::State& state) {
+  BenchUnique(state, HashParams<Int64Type>{0}, state.range(0), state.range(1));
+}
+
+static void BM_UniqueInt64WithNulls(benchmark::State& state) {
+  BenchUnique(state, HashParams<Int64Type>{0.05}, state.range(0), state.range(1));
+}
+
+static void BM_UniqueString10bytes(benchmark::State& state) {
+  // Byte strings with 10 bytes each
+  BenchUnique(state, HashParams<StringType>{0.05, 10}, state.range(0), state.range(1));
+}
+
+static void BM_UniqueString100bytes(benchmark::State& state) {
+  // Byte strings with 100 bytes each
+  BenchUnique(state, HashParams<StringType>{0.05, 100}, state.range(0), state.range(1));
+}
+
+BENCHMARK(BM_BuildDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_BuildStringDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+
+constexpr int64_t kHashBenchmarkLength = 1 << 24;
+
+#define ADD_HASH_ARGS(WHAT)                        \
+  WHAT->Args({kHashBenchmarkLength, 50})           \
+      ->Args({kHashBenchmarkLength, 1 << 10})      \
+      ->Args({kHashBenchmarkLength, 10 * 1 << 10}) \
+      ->Args({kHashBenchmarkLength, 1 << 20})      \
+      ->MinTime(1.0)                               \
+      ->Unit(benchmark::kMicrosecond)              \
+      ->UseRealTime()
+
+ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64NoNulls));
+ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64WithNulls));
+ADD_HASH_ARGS(BENCHMARK(BM_UniqueString10bytes));
+ADD_HASH_ARGS(BENCHMARK(BM_UniqueString100bytes));
 
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
index c73bfa3..84af8f7 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -869,8 +869,8 @@ TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) {
     return;
   }
 
-  const int64_t kTotalValues = 10000;
-  const int64_t kRepeats = 10;
+  const int64_t kTotalValues = 1000000;
+  const int64_t kRepeats = 5;
 
   vector<T> values;
   vector<T> uniques;
diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc
index 66c9073..750f1d3 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -43,7 +43,7 @@ typedef int32_t hash_slot_t;
 static constexpr hash_slot_t kHashSlotEmpty = std::numeric_limits<int32_t>::max();
 
 // The maximum load factor for the hash table before resizing.
-static constexpr double kMaxHashTableLoad = 0.7;
+static constexpr double kMaxHashTableLoad = 0.5;
 
 enum class SIMDMode : char { NOSIMD, SSE4, AVX2 };
 
@@ -260,7 +260,7 @@ struct HashDictionary<Type, enable_if_has_c_type<Type>> {
       COMPUTE_HASH;                                                              \
       while (kHashSlotEmpty != new_hash_slots[j]) {                              \
         ++j;                                                                     \
-        if (ARROW_PREDICT_FALSE(j == hash_table_size_)) {                        \
+        if (ARROW_PREDICT_FALSE(j == new_size)) {                                \
           j = 0;                                                                 \
         }                                                                        \
       }                                                                          \

-- 
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <commits@arrow.apache.org>'].

Reply via email to