This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new bed07e9 ARROW-1942: [C++] Hash table specializations for small integers bed07e9 is described below commit bed07e93b3b068ed5750946212ba2e0e6feaf61a Author: Panchen Xue <pan.panchen....@gmail.com> AuthorDate: Tue Feb 20 12:40:42 2018 -0500 ARROW-1942: [C++] Hash table specializations for small integers Add hash table specializations for int8_t and uint8_t Author: Panchen Xue <pan.panchen....@gmail.com> Closes #1551 from xuepanchen/ARROW-1942 and squashes the following commits: 37c42161 [Panchen Xue] Hash table specializations for small integers --- cpp/src/arrow/compute/compute-benchmark.cc | 22 +++++++- cpp/src/arrow/compute/kernels/hash.cc | 79 ++++++++++++++++++++++++++- cpp/src/arrow/compute/kernels/util-internal.h | 22 ++++++-- 3 files changed, 116 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/compute-benchmark.cc b/cpp/src/arrow/compute/compute-benchmark.cc index 44df441..6460105 100644 --- a/cpp/src/arrow/compute/compute-benchmark.cc +++ b/cpp/src/arrow/compute/compute-benchmark.cc @@ -94,7 +94,7 @@ struct HashParams { std::vector<bool> is_valid; test::randint<int64_t>(length, 0, num_unique, &draws); for (int64_t draw : draws) { - values.push_back(draw); + values.push_back(static_cast<T>(draw)); } if (this->null_percent > 0) { @@ -170,6 +170,14 @@ void BenchDictionaryEncode(benchmark::State& state, const ParamType& params, state.SetBytesProcessed(state.iterations() * params.GetBytesProcessed(length)); } +static void BM_UniqueUInt8NoNulls(benchmark::State& state) { + BenchUnique(state, HashParams<UInt8Type>{0}, state.range(0), state.range(1)); +} + +static void BM_UniqueUInt8WithNulls(benchmark::State& state) { + BenchUnique(state, HashParams<UInt8Type>{0.05}, state.range(0), state.range(1)); +} + static void BM_UniqueInt64NoNulls(benchmark::State& state) { BenchUnique(state, HashParams<Int64Type>{0}, state.range(0), state.range(1)); } @@ -207,5 +215,17 @@ ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64WithNulls)); ADD_HASH_ARGS(BENCHMARK(BM_UniqueString10bytes)); ADD_HASH_ARGS(BENCHMARK(BM_UniqueString100bytes)); +BENCHMARK(BM_UniqueUInt8NoNulls) + ->Args({kHashBenchmarkLength, 200}) + ->MinTime(1.0) + ->Unit(benchmark::kMicrosecond) + ->UseRealTime(); + +BENCHMARK(BM_UniqueUInt8WithNulls) + ->Args({kHashBenchmarkLength, 200}) + ->MinTime(1.0) + ->Unit(benchmark::kMicrosecond) + ->UseRealTime(); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc index da9797f..dbce6e5 100644 --- a/cpp/src/arrow/compute/kernels/hash.cc +++ b/cpp/src/arrow/compute/kernels/hash.cc @@ -221,7 +221,10 @@ struct HashDictionary<Type, enable_if_has_c_type<Type>> { } template <typename Type, typename Action> -class HashTableKernel<Type, Action, enable_if_has_c_type<Type>> : public HashTable { +class HashTableKernel< + Type, Action, + typename std::enable_if<has_c_type<Type>::value && !is_8bit_int<Type>::value>::type> + : public HashTable { public: using T = typename Type::c_type; @@ -612,6 +615,80 @@ class HashTableKernel<Type, Action, enable_if_fixed_size_binary<Type>> }; // ---------------------------------------------------------------------- +// Hash table pass for uint8 and int8 + +template <typename T> +inline int Hash8Bit(const T val) { + return 0; +} + +template <> +inline int Hash8Bit(const uint8_t val) { + return val; +} + +template <> +inline int Hash8Bit(const int8_t val) { + return val + 128; +} + +template <typename Type, typename Action> +class HashTableKernel<Type, Action, enable_if_8bit_int<Type>> : public HashTable { + public: + using T = typename Type::c_type; + + HashTableKernel(const std::shared_ptr<DataType>& type, MemoryPool* pool) + : HashTable(type, pool) { + std::fill(table_, table_ + 256, kHashSlotEmpty); + } + + Status Append(const ArrayData& arr) override { + const T* values = GetValues<T>(arr, 1); + auto action = static_cast<Action*>(this); + RETURN_NOT_OK(action->Reserve(arr.length)); + +#define HASH_INNER_LOOP() \ + const T value = values[i]; \ + const int hash = Hash8Bit<T>(value); \ + hash_slot_t slot = table_[hash]; \ + \ + if (slot == kHashSlotEmpty) { \ + if (!Action::allow_expand) { \ + throw HashException("Encountered new dictionary value"); \ + } \ + \ + slot = static_cast<hash_slot_t>(dict_.size()); \ + table_[hash] = slot; \ + dict_.push_back(value); \ + action->ObserveNotFound(slot); \ + } else { \ + action->ObserveFound(slot); \ + } + + GENERIC_HASH_PASS(HASH_INNER_LOOP); + +#undef HASH_INNER_LOOP + + return Status::OK(); + } + + Status GetDictionary(std::shared_ptr<ArrayData>* out) override { + using BuilderType = typename TypeTraits<Type>::BuilderType; + BuilderType builder(pool_); + + for (const T value : dict_) { + RETURN_NOT_OK(builder.Append(value)); + } + + return builder.FinishInternal(out); + } + + private: + hash_slot_t table_[256]; + std::vector<T> dict_; +}; + +// ---------------------------------------------------------------------- // Unique implementation template <typename Type> diff --git a/cpp/src/arrow/compute/kernels/util-internal.h b/cpp/src/arrow/compute/kernels/util-internal.h index 2f61132..bde2676 100644 --- a/cpp/src/arrow/compute/kernels/util-internal.h +++ b/cpp/src/arrow/compute/kernels/util-internal.h @@ -33,6 +33,22 @@ template <typename T> using is_number = std::is_base_of<Number, T>; template <typename T> +struct has_c_type { + static constexpr bool value = + (std::is_base_of<PrimitiveCType, T>::value || std::is_base_of<DateType, T>::value || + std::is_base_of<TimeType, T>::value || std::is_base_of<TimestampType, T>::value); +}; + +template <typename T> +struct is_8bit_int { + static constexpr bool value = + (std::is_same<UInt8Type, T>::value || std::is_same<Int8Type, T>::value); +}; + +template <typename T> +using enable_if_8bit_int = typename std::enable_if<is_8bit_int<T>::value>::type; + +template <typename T> using enable_if_primitive_ctype = typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value>::type; @@ -47,11 +63,7 @@ using enable_if_timestamp = typename std::enable_if<std::is_base_of<TimestampType, T>::value>::type; template <typename T> -using enable_if_has_c_type = - typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value || - std::is_base_of<DateType, T>::value || - std::is_base_of<TimeType, T>::value || - std::is_base_of<TimestampType, T>::value>::type; +using enable_if_has_c_type = typename std::enable_if<has_c_type<T>::value>::type; template <typename T> using enable_if_null = typename std::enable_if<std::is_same<NullType, T>::value>::type; -- To stop receiving notification emails like this one, please contact w...@apache.org.