[
https://issues.apache.org/jira/browse/ARROW-1828?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16264416#comment-16264416
]
ASF GitHub Bot commented on ARROW-1828:
---------------------------------------
wesm closed pull request #1350: ARROW-1828: [C++] Hash kernel specialization
for BooleanType
URL: https://github.com/apache/arrow/pull/1350
This PR was merged from a forked repository. Because GitHub does not show
the original diff of a foreign (forked) pull request once it has been merged,
the diff is reproduced below for the sake of provenance:
diff --git a/cpp/src/arrow/compute/compute-test.cc
b/cpp/src/arrow/compute/compute-test.cc
index fa408ae40..96edd8f01 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -843,6 +843,40 @@ TEST_F(TestHashKernel, UniqueTimeTimestamp) {
{});
}
+TEST_F(TestHashKernel, UniqueBoolean) {
+ CheckUnique<BooleanType, bool>(&this->ctx_, boolean(), {true, true, false,
true},
+ {true, false, true, true}, {true, false}, {});
+
+ CheckUnique<BooleanType, bool>(&this->ctx_, boolean(), {false, true, false,
true},
+ {true, false, true, true}, {false, true}, {});
+
+ // No nulls
+ CheckUnique<BooleanType, bool>(&this->ctx_, boolean(), {true, true, false,
true}, {},
+ {true, false}, {});
+
+ CheckUnique<BooleanType, bool>(&this->ctx_, boolean(), {false, true, false,
true}, {},
+ {false, true}, {});
+}
+
+TEST_F(TestHashKernel, DictEncodeBoolean) {
+ CheckDictEncode<BooleanType, bool>(
+ &this->ctx_, boolean(), {true, true, false, true, false},
+ {true, false, true, true, true}, {true, false}, {}, {0, 0, 1, 0, 1});
+
+ CheckDictEncode<BooleanType, bool>(
+ &this->ctx_, boolean(), {false, true, false, true, false},
+ {true, false, true, true, true}, {false, true}, {}, {0, 0, 0, 1, 0});
+
+ // No nulls
+ CheckDictEncode<BooleanType, bool>(&this->ctx_, boolean(),
+ {true, true, false, true, false}, {},
{true, false},
+ {}, {0, 0, 1, 0, 1});
+
+ CheckDictEncode<BooleanType, bool>(&this->ctx_, boolean(),
+ {false, true, false, true, false}, {},
{false, true},
+ {}, {0, 1, 0, 1, 0});
+}
+
TEST_F(TestHashKernel, UniqueBinary) {
CheckUnique<BinaryType, std::string>(&this->ctx_, binary(),
{"test", "", "test2", "test"},
diff --git a/cpp/src/arrow/compute/kernels/hash.cc
b/cpp/src/arrow/compute/kernels/hash.cc
index 95f039932..e47759d4d 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -368,6 +368,79 @@ class HashTableKernel<Type, Action,
enable_if_has_c_type<Type>> : public HashTab
HashDictionary<Type> dict_;
};
+// ----------------------------------------------------------------------
+// Hash table for boolean types
+
+template <typename Type, typename Action>
+class HashTableKernel<Type, Action, enable_if_boolean<Type>> : public
HashTable {
+ public:
+ HashTableKernel(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+ : HashTable(type, pool) {
+ std::fill(table_, table_ + 2, kHashSlotEmpty);
+ }
+
+ Status Append(const ArrayData& arr) override {
+ auto action = static_cast<Action*>(this);
+
+ RETURN_NOT_OK(action->Reserve(arr.length));
+
+ internal::BitmapReader value_reader(arr.buffers[1]->data(), arr.offset,
arr.length);
+
+#define HASH_INNER_LOOP() \
+ if (slot == kHashSlotEmpty) { \
+ if (!Action::allow_expand) { \
+ throw HashException("Encountered new dictionary value"); \
+ } \
+ table_[j] = slot = static_cast<hash_slot_t>(dict_.size()); \
+ dict_.push_back(value); \
+ action->ObserveNotFound(slot); \
+ } else { \
+ action->ObserveFound(slot); \
+ }
+
+ if (arr.null_count != 0) {
+ internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset,
arr.length);
+ for (int64_t i = 0; i < arr.length; ++i) {
+ const bool is_null = valid_reader.IsNotSet();
+ const bool value = value_reader.IsSet();
+ const int j = value ? 1 : 0;
+ hash_slot_t slot = table_[j];
+ valid_reader.Next();
+ value_reader.Next();
+ if (is_null) {
+ action->ObserveNull();
+ continue;
+ }
+ HASH_INNER_LOOP();
+ }
+ } else {
+ for (int64_t i = 0; i < arr.length; ++i) {
+ const bool value = value_reader.IsSet();
+ const int j = value ? 1 : 0;
+ hash_slot_t slot = table_[j];
+ value_reader.Next();
+ HASH_INNER_LOOP();
+ }
+ }
+
+#undef HASH_INNER_LOOP
+
+ return Status::OK();
+ }
+
+ Status GetDictionary(std::shared_ptr<ArrayData>* out) override {
+ BooleanBuilder builder(pool_);
+ for (const bool value : dict_) {
+ RETURN_NOT_OK(builder.Append(value));
+ }
+ return builder.FinishInternal(out);
+ }
+
+ private:
+ hash_slot_t table_[2];
+ std::vector<bool> dict_;
+};
+
// ----------------------------------------------------------------------
// Hash table pass for variable-length binary types
@@ -698,7 +771,7 @@ Status GetUniqueKernel(FunctionContext* ctx, const
std::shared_ptr<DataType>& ty
switch (type->id()) {
UNIQUE_CASE(NullType);
- // UNIQUE_CASE(BooleanType);
+ UNIQUE_CASE(BooleanType);
UNIQUE_CASE(UInt8Type);
UNIQUE_CASE(Int8Type);
UNIQUE_CASE(UInt16Type);
@@ -741,7 +814,7 @@ Status GetDictionaryEncodeKernel(FunctionContext* ctx,
switch (type->id()) {
DICTIONARY_ENCODE_CASE(NullType);
- // DICTIONARY_ENCODE_CASE(BooleanType);
+ DICTIONARY_ENCODE_CASE(BooleanType);
DICTIONARY_ENCODE_CASE(UInt8Type);
DICTIONARY_ENCODE_CASE(Int8Type);
DICTIONARY_ENCODE_CASE(UInt16Type);
diff --git a/cpp/src/arrow/compute/kernels/util-internal.h
b/cpp/src/arrow/compute/kernels/util-internal.h
index 70c506286..7633fed4a 100644
--- a/cpp/src/arrow/compute/kernels/util-internal.h
+++ b/cpp/src/arrow/compute/kernels/util-internal.h
@@ -59,6 +59,10 @@ template <typename T>
using enable_if_binary =
typename std::enable_if<std::is_base_of<BinaryType, T>::value>::type;
+template <typename T>
+using enable_if_boolean =
+ typename std::enable_if<std::is_same<BooleanType, T>::value>::type;
+
template <typename T>
using enable_if_fixed_size_binary =
typename std::enable_if<std::is_base_of<FixedSizeBinaryType,
T>::value>::type;
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index f1f59384b..324648178 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -209,7 +209,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
int byte_width()
int bit_width()
- cdef cppclass CDecimal128Type"
arrow::Decimal128Type"(CFixedSizeBinaryType):
+ cdef cppclass CDecimal128Type \
+ " arrow::Decimal128Type"(CFixedSizeBinaryType):
CDecimal128Type(int precision, int scale)
int precision()
int scale()
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> [C++] Implement hash kernel specialization for BooleanType
> ----------------------------------------------------------
>
> Key: ARROW-1828
> URL: https://issues.apache.org/jira/browse/ARROW-1828
> Project: Apache Arrow
> Issue Type: Improvement
> Components: C++
> Reporter: Wes McKinney
> Assignee: Wes McKinney
> Labels: pull-request-available
> Fix For: 0.8.0
>
>
> Follow up to ARROW-1559
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)