This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new bed07e9  ARROW-1942: [C++] Hash table specializations for small 
integers
bed07e9 is described below

commit bed07e93b3b068ed5750946212ba2e0e6feaf61a
Author: Panchen Xue <pan.panchen....@gmail.com>
AuthorDate: Tue Feb 20 12:40:42 2018 -0500

    ARROW-1942: [C++] Hash table specializations for small integers
    
    Add hash table specializations for int8_t and uint8_t
    
    Author: Panchen Xue <pan.panchen....@gmail.com>
    
    Closes #1551 from xuepanchen/ARROW-1942 and squashes the following commits:
    
    37c42161 [Panchen Xue] Hash table specializations for small integers
---
 cpp/src/arrow/compute/compute-benchmark.cc    | 22 +++++++-
 cpp/src/arrow/compute/kernels/hash.cc         | 79 ++++++++++++++++++++++++++-
 cpp/src/arrow/compute/kernels/util-internal.h | 22 ++++++--
 3 files changed, 116 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/compute/compute-benchmark.cc 
b/cpp/src/arrow/compute/compute-benchmark.cc
index 44df441..6460105 100644
--- a/cpp/src/arrow/compute/compute-benchmark.cc
+++ b/cpp/src/arrow/compute/compute-benchmark.cc
@@ -94,7 +94,7 @@ struct HashParams {
     std::vector<bool> is_valid;
     test::randint<int64_t>(length, 0, num_unique, &draws);
     for (int64_t draw : draws) {
-      values.push_back(draw);
+      values.push_back(static_cast<T>(draw));
     }
 
     if (this->null_percent > 0) {
@@ -170,6 +170,14 @@ void BenchDictionaryEncode(benchmark::State& state, const 
ParamType& params,
   state.SetBytesProcessed(state.iterations() * 
params.GetBytesProcessed(length));
 }
 
+static void BM_UniqueUInt8NoNulls(benchmark::State& state) {
+  BenchUnique(state, HashParams<UInt8Type>{0}, state.range(0), state.range(1));
+}
+
+static void BM_UniqueUInt8WithNulls(benchmark::State& state) {
+  BenchUnique(state, HashParams<UInt8Type>{0.05}, state.range(0), 
state.range(1));
+}
+
 static void BM_UniqueInt64NoNulls(benchmark::State& state) {
   BenchUnique(state, HashParams<Int64Type>{0}, state.range(0), state.range(1));
 }
@@ -207,5 +215,17 @@ ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64WithNulls));
 ADD_HASH_ARGS(BENCHMARK(BM_UniqueString10bytes));
 ADD_HASH_ARGS(BENCHMARK(BM_UniqueString100bytes));
 
+BENCHMARK(BM_UniqueUInt8NoNulls)
+    ->Args({kHashBenchmarkLength, 200})
+    ->MinTime(1.0)
+    ->Unit(benchmark::kMicrosecond)
+    ->UseRealTime();
+
+BENCHMARK(BM_UniqueUInt8WithNulls)
+    ->Args({kHashBenchmarkLength, 200})
+    ->MinTime(1.0)
+    ->Unit(benchmark::kMicrosecond)
+    ->UseRealTime();
+
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/hash.cc 
b/cpp/src/arrow/compute/kernels/hash.cc
index da9797f..dbce6e5 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -221,7 +221,10 @@ struct HashDictionary<Type, enable_if_has_c_type<Type>> {
   }
 
 template <typename Type, typename Action>
-class HashTableKernel<Type, Action, enable_if_has_c_type<Type>> : public 
HashTable {
+class HashTableKernel<
+    Type, Action,
+    typename std::enable_if<has_c_type<Type>::value && 
!is_8bit_int<Type>::value>::type>
+    : public HashTable {
  public:
   using T = typename Type::c_type;
 
@@ -612,6 +615,80 @@ class HashTableKernel<Type, Action, 
enable_if_fixed_size_binary<Type>>
 };
 
 // ----------------------------------------------------------------------
+// Hash table pass for uint8 and int8
+
+template <typename T>
+inline int Hash8Bit(const T val) {
+  return 0;
+}
+
+template <>
+inline int Hash8Bit(const uint8_t val) {
+  return val;
+}
+
+template <>
+inline int Hash8Bit(const int8_t val) {
+  return val + 128;
+}
+
+template <typename Type, typename Action>
+class HashTableKernel<Type, Action, enable_if_8bit_int<Type>> : public 
HashTable {
+ public:
+  using T = typename Type::c_type;
+
+  HashTableKernel(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+      : HashTable(type, pool) {
+    std::fill(table_, table_ + 256, kHashSlotEmpty);
+  }
+
+  Status Append(const ArrayData& arr) override {
+    const T* values = GetValues<T>(arr, 1);
+    auto action = static_cast<Action*>(this);
+    RETURN_NOT_OK(action->Reserve(arr.length));
+
+#define HASH_INNER_LOOP()                                      \
+  const T value = values[i];                                   \
+  const int hash = Hash8Bit<T>(value);                         \
+  hash_slot_t slot = table_[hash];                             \
+                                                               \
+  if (slot == kHashSlotEmpty) {                                \
+    if (!Action::allow_expand) {                               \
+      throw HashException("Encountered new dictionary value"); \
+    }                                                          \
+                                                               \
+    slot = static_cast<hash_slot_t>(dict_.size());             \
+    table_[hash] = slot;                                       \
+    dict_.push_back(value);                                    \
+    action->ObserveNotFound(slot);                             \
+  } else {                                                     \
+    action->ObserveFound(slot);                                \
+  }
+
+    GENERIC_HASH_PASS(HASH_INNER_LOOP);
+
+#undef HASH_INNER_LOOP
+
+    return Status::OK();
+  }
+
+  Status GetDictionary(std::shared_ptr<ArrayData>* out) override {
+    using BuilderType = typename TypeTraits<Type>::BuilderType;
+    BuilderType builder(pool_);
+
+    for (const T value : dict_) {
+      RETURN_NOT_OK(builder.Append(value));
+    }
+
+    return builder.FinishInternal(out);
+  }
+
+ private:
+  hash_slot_t table_[256];
+  std::vector<T> dict_;
+};
+
+// ----------------------------------------------------------------------
 // Unique implementation
 
 template <typename Type>
diff --git a/cpp/src/arrow/compute/kernels/util-internal.h 
b/cpp/src/arrow/compute/kernels/util-internal.h
index 2f61132..bde2676 100644
--- a/cpp/src/arrow/compute/kernels/util-internal.h
+++ b/cpp/src/arrow/compute/kernels/util-internal.h
@@ -33,6 +33,22 @@ template <typename T>
 using is_number = std::is_base_of<Number, T>;
 
 template <typename T>
+struct has_c_type {
+  static constexpr bool value =
+      (std::is_base_of<PrimitiveCType, T>::value || std::is_base_of<DateType, 
T>::value ||
+       std::is_base_of<TimeType, T>::value || std::is_base_of<TimestampType, 
T>::value);
+};
+
+template <typename T>
+struct is_8bit_int {
+  static constexpr bool value =
+      (std::is_same<UInt8Type, T>::value || std::is_same<Int8Type, T>::value);
+};
+
+template <typename T>
+using enable_if_8bit_int = typename 
std::enable_if<is_8bit_int<T>::value>::type;
+
+template <typename T>
 using enable_if_primitive_ctype =
     typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value>::type;
 
@@ -47,11 +63,7 @@ using enable_if_timestamp =
     typename std::enable_if<std::is_base_of<TimestampType, T>::value>::type;
 
 template <typename T>
-using enable_if_has_c_type =
-    typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value ||
-                            std::is_base_of<DateType, T>::value ||
-                            std::is_base_of<TimeType, T>::value ||
-                            std::is_base_of<TimestampType, T>::value>::type;
+using enable_if_has_c_type = typename 
std::enable_if<has_c_type<T>::value>::type;
 
 template <typename T>
 using enable_if_null = typename std::enable_if<std::is_same<NullType, 
T>::value>::type;

-- 
To stop receiving notification emails like this one, please contact
w...@apache.org.

Reply via email to