[ 
https://issues.apache.org/jira/browse/ARROW-1942?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16370337#comment-16370337
 ] 

ASF GitHub Bot commented on ARROW-1942:
---------------------------------------

wesm closed pull request #1551: ARROW-1942: [C++] Hash table specializations 
for small integers
URL: https://github.com/apache/arrow/pull/1551
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/compute/compute-benchmark.cc 
b/cpp/src/arrow/compute/compute-benchmark.cc
index 44df44139..646010550 100644
--- a/cpp/src/arrow/compute/compute-benchmark.cc
+++ b/cpp/src/arrow/compute/compute-benchmark.cc
@@ -94,7 +94,7 @@ struct HashParams {
     std::vector<bool> is_valid;
     test::randint<int64_t>(length, 0, num_unique, &draws);
     for (int64_t draw : draws) {
-      values.push_back(draw);
+      values.push_back(static_cast<T>(draw));
     }
 
     if (this->null_percent > 0) {
@@ -170,6 +170,14 @@ void BenchDictionaryEncode(benchmark::State& state, const 
ParamType& params,
   state.SetBytesProcessed(state.iterations() * 
params.GetBytesProcessed(length));
 }
 
+static void BM_UniqueUInt8NoNulls(benchmark::State& state) {
+  BenchUnique(state, HashParams<UInt8Type>{0}, state.range(0), state.range(1));
+}
+
+static void BM_UniqueUInt8WithNulls(benchmark::State& state) {
+  BenchUnique(state, HashParams<UInt8Type>{0.05}, state.range(0), 
state.range(1));
+}
+
 static void BM_UniqueInt64NoNulls(benchmark::State& state) {
   BenchUnique(state, HashParams<Int64Type>{0}, state.range(0), state.range(1));
 }
@@ -207,5 +215,17 @@ ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64WithNulls));
 ADD_HASH_ARGS(BENCHMARK(BM_UniqueString10bytes));
 ADD_HASH_ARGS(BENCHMARK(BM_UniqueString100bytes));
 
+BENCHMARK(BM_UniqueUInt8NoNulls)
+    ->Args({kHashBenchmarkLength, 200})
+    ->MinTime(1.0)
+    ->Unit(benchmark::kMicrosecond)
+    ->UseRealTime();
+
+BENCHMARK(BM_UniqueUInt8WithNulls)
+    ->Args({kHashBenchmarkLength, 200})
+    ->MinTime(1.0)
+    ->Unit(benchmark::kMicrosecond)
+    ->UseRealTime();
+
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/hash.cc 
b/cpp/src/arrow/compute/kernels/hash.cc
index da9797f77..dbce6e561 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -221,7 +221,10 @@ struct HashDictionary<Type, enable_if_has_c_type<Type>> {
   }
 
 template <typename Type, typename Action>
-class HashTableKernel<Type, Action, enable_if_has_c_type<Type>> : public 
HashTable {
+class HashTableKernel<
+    Type, Action,
+    typename std::enable_if<has_c_type<Type>::value && 
!is_8bit_int<Type>::value>::type>
+    : public HashTable {
  public:
   using T = typename Type::c_type;
 
@@ -611,6 +614,80 @@ class HashTableKernel<Type, Action, 
enable_if_fixed_size_binary<Type>>
   int32_t dict_size_;
 };
 
+// ----------------------------------------------------------------------
+// Hash table pass for uint8 and int8
+
+template <typename T>
+inline int Hash8Bit(const T val) {
+  return 0;
+}
+
+template <>
+inline int Hash8Bit(const uint8_t val) {
+  return val;
+}
+
+template <>
+inline int Hash8Bit(const int8_t val) {
+  return val + 128;
+}
+
+template <typename Type, typename Action>
+class HashTableKernel<Type, Action, enable_if_8bit_int<Type>> : public 
HashTable {
+ public:
+  using T = typename Type::c_type;
+
+  HashTableKernel(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+      : HashTable(type, pool) {
+    std::fill(table_, table_ + 256, kHashSlotEmpty);
+  }
+
+  Status Append(const ArrayData& arr) override {
+    const T* values = GetValues<T>(arr, 1);
+    auto action = static_cast<Action*>(this);
+    RETURN_NOT_OK(action->Reserve(arr.length));
+
+#define HASH_INNER_LOOP()                                      \
+  const T value = values[i];                                   \
+  const int hash = Hash8Bit<T>(value);                         \
+  hash_slot_t slot = table_[hash];                             \
+                                                               \
+  if (slot == kHashSlotEmpty) {                                \
+    if (!Action::allow_expand) {                               \
+      throw HashException("Encountered new dictionary value"); \
+    }                                                          \
+                                                               \
+    slot = static_cast<hash_slot_t>(dict_.size());             \
+    table_[hash] = slot;                                       \
+    dict_.push_back(value);                                    \
+    action->ObserveNotFound(slot);                             \
+  } else {                                                     \
+    action->ObserveFound(slot);                                \
+  }
+
+    GENERIC_HASH_PASS(HASH_INNER_LOOP);
+
+#undef HASH_INNER_LOOP
+
+    return Status::OK();
+  }
+
+  Status GetDictionary(std::shared_ptr<ArrayData>* out) override {
+    using BuilderType = typename TypeTraits<Type>::BuilderType;
+    BuilderType builder(pool_);
+
+    for (const T value : dict_) {
+      RETURN_NOT_OK(builder.Append(value));
+    }
+
+    return builder.FinishInternal(out);
+  }
+
+ private:
+  hash_slot_t table_[256];
+  std::vector<T> dict_;
+};
+
 // ----------------------------------------------------------------------
 // Unique implementation
 
diff --git a/cpp/src/arrow/compute/kernels/util-internal.h 
b/cpp/src/arrow/compute/kernels/util-internal.h
index 2f611320a..bde267629 100644
--- a/cpp/src/arrow/compute/kernels/util-internal.h
+++ b/cpp/src/arrow/compute/kernels/util-internal.h
@@ -32,6 +32,22 @@ class FunctionContext;
 template <typename T>
 using is_number = std::is_base_of<Number, T>;
 
+template <typename T>
+struct has_c_type {
+  static constexpr bool value =
+      (std::is_base_of<PrimitiveCType, T>::value || std::is_base_of<DateType, 
T>::value ||
+       std::is_base_of<TimeType, T>::value || std::is_base_of<TimestampType, 
T>::value);
+};
+
+template <typename T>
+struct is_8bit_int {
+  static constexpr bool value =
+      (std::is_same<UInt8Type, T>::value || std::is_same<Int8Type, T>::value);
+};
+
+template <typename T>
+using enable_if_8bit_int = typename 
std::enable_if<is_8bit_int<T>::value>::type;
+
 template <typename T>
 using enable_if_primitive_ctype =
     typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value>::type;
@@ -47,11 +63,7 @@ using enable_if_timestamp =
     typename std::enable_if<std::is_base_of<TimestampType, T>::value>::type;
 
 template <typename T>
-using enable_if_has_c_type =
-    typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value ||
-                            std::is_base_of<DateType, T>::value ||
-                            std::is_base_of<TimeType, T>::value ||
-                            std::is_base_of<TimestampType, T>::value>::type;
+using enable_if_has_c_type = typename 
std::enable_if<has_c_type<T>::value>::type;
 
 template <typename T>
 using enable_if_null = typename std::enable_if<std::is_same<NullType, 
T>::value>::type;


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++] Hash table specializations for small integers
> ---------------------------------------------------
>
>                 Key: ARROW-1942
>                 URL: https://issues.apache.org/jira/browse/ARROW-1942
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: C++
>            Reporter: Wes McKinney
>            Assignee: Panchen Xue
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> There is no need to use a dynamically-sized hash table with uint8, int8, 
> since a fixed-size lookup table can be used and avoid hashing altogether



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to