This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new b1f1ef4cda GH-38457: [C++] Support LogicalNullCount for
DictionaryArray (#38681)
b1f1ef4cda is described below
commit b1f1ef4cda17cabede37471f809e07119699fbdb
Author: Junming Chen <[email protected]>
AuthorDate: Mon Nov 27 22:32:27 2023 +0800
GH-38457: [C++] Support LogicalNullCount for DictionaryArray (#38681)
### Rationale for this change
Currently, the `null_count` only returns the number of null values in
`indices`. We want to count the real nulls as the dictionary array is decoded.
### What changes are included in this PR?
Add `DictionaryMayHaveLogicalNulls` and `ComputeLogicalNullCount` for
DictionaryArray
### Are these changes tested?
Yes
### Are there any user-facing changes?
No.
* Closes: #38457
Lead-authored-by: Junming Chen <[email protected]>
Co-authored-by: Benjamin Kietzman <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Benjamin Kietzman <[email protected]>
---
cpp/src/arrow/CMakeLists.txt | 1 +
cpp/src/arrow/array/array_test.cc | 57 +++++++++++++++++++++++++++
cpp/src/arrow/array/data.cc | 14 ++++++-
cpp/src/arrow/array/data.h | 14 +++++--
cpp/src/arrow/util/dict_util.cc | 81 +++++++++++++++++++++++++++++++++++++++
cpp/src/arrow/util/dict_util.h | 28 ++++++++++++++
6 files changed, 191 insertions(+), 4 deletions(-)
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index b64194aa6e..46a7aa9106 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -223,6 +223,7 @@ set(ARROW_SRCS
util/debug.cc
util/decimal.cc
util/delimiting.cc
+ util/dict_util.cc
util/float16.cc
util/formatting.cc
util/future.cc
diff --git a/cpp/src/arrow/array/array_test.cc
b/cpp/src/arrow/array/array_test.cc
index be54d62fd7..974eb54d2c 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -80,6 +80,22 @@ class TestArray : public ::testing::Test {
MemoryPool* pool_;
};
+void CheckDictionaryNullCount(const std::shared_ptr<DataType>& dict_type,
+ const std::string& input_dictionary_json,
+ const std::string& input_index_json,
+ const int64_t& expected_null_count,
+ const int64_t& expected_logical_null_count,
+ bool expected_may_have_nulls,
+ bool expected_may_have_logical_nulls) {
+ std::shared_ptr<arrow::Array> arr =
+ DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json);
+
+ ASSERT_EQ(arr->null_count(), expected_null_count);
+ ASSERT_EQ(arr->ComputeLogicalNullCount(), expected_logical_null_count);
+ ASSERT_EQ(arr->data()->MayHaveNulls(), expected_may_have_nulls);
+ ASSERT_EQ(arr->data()->MayHaveLogicalNulls(),
expected_may_have_logical_nulls);
+}
+
TEST_F(TestArray, TestNullCount) {
// These are placeholders
auto data = std::make_shared<Buffer>(nullptr, 0);
@@ -127,6 +143,37 @@ TEST_F(TestArray, TestNullCount) {
ASSERT_EQ(0, ree_no_nulls->ComputeLogicalNullCount());
ASSERT_FALSE(ree_no_nulls->data()->MayHaveNulls());
ASSERT_FALSE(ree_no_nulls->data()->MayHaveLogicalNulls());
+
+ // Dictionary type
+ std::shared_ptr<arrow::DataType> type;
+ std::shared_ptr<arrow::DataType> dict_type;
+
+ for (const auto& index_type : all_dictionary_index_types()) {
+ ARROW_SCOPED_TRACE("index_type = ", index_type->ToString());
+
+ type = boolean();
+ dict_type = dictionary(index_type, type);
+ // no null value
+ CheckDictionaryNullCount(dict_type, "[]", "[]", 0, 0, false, false);
+ CheckDictionaryNullCount(dict_type, "[true, false]", "[0, 1, 0]", 0, 0,
false, false);
+
+ // only indices contain null value
+ CheckDictionaryNullCount(dict_type, "[true, false]", "[null, 0, 1]", 1, 1,
true,
+ true);
+ CheckDictionaryNullCount(dict_type, "[true, false]", "[null, null]", 2, 2,
true,
+ true);
+
+ // only dictionary contains null value
+ CheckDictionaryNullCount(dict_type, "[null, true]", "[]", 0, 0, false,
true);
+ CheckDictionaryNullCount(dict_type, "[null, true, false]", "[0, 1, 0]", 0,
2, false,
+ true);
+
+ // both indices and dictionary contain null value
+ CheckDictionaryNullCount(dict_type, "[null, true, false]", "[0, 1, 0,
null]", 1, 3,
+ true, true);
+ CheckDictionaryNullCount(dict_type, "[null, true, null, false]", "[null,
1, 0, 2, 3]",
+ 1, 3, true, true);
+ }
}
TEST_F(TestArray, TestSlicePreservesAllNullCount) {
@@ -137,6 +184,16 @@ TEST_F(TestArray, TestSlicePreservesAllNullCount) {
Int32Array arr(/*length=*/100, data, null_bitmap,
/*null_count*/ 100);
EXPECT_EQ(arr.Slice(1, 99)->data()->null_count, arr.Slice(1, 99)->length());
+
+ // Dictionary type
+ std::shared_ptr<arrow::DataType> dict_type = dictionary(int64(), boolean());
+ std::shared_ptr<arrow::Array> dict_arr =
+ DictArrayFromJSON(dict_type, /*indices=*/"[null, 0, 0, 0, 0, 0, 1, 2, 0,
0]",
+ /*dictionary=*/"[null, true, false]");
+ ASSERT_EQ(dict_arr->null_count(), 1);
+ ASSERT_EQ(dict_arr->ComputeLogicalNullCount(), 8);
+ ASSERT_EQ(dict_arr->Slice(2, 8)->null_count(), 0);
+ ASSERT_EQ(dict_arr->Slice(2, 8)->ComputeLogicalNullCount(), 6);
}
TEST_F(TestArray, TestLength) {
diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc
index 3241dc5518..c002c0817b 100644
--- a/cpp/src/arrow/array/data.cc
+++ b/cpp/src/arrow/array/data.cc
@@ -33,6 +33,7 @@
#include "arrow/type_traits.h"
#include "arrow/util/binary_view_util.h"
#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/dict_util.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ree_util.h"
@@ -93,6 +94,10 @@ bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data)
{
return ArraySpan(data).MayHaveLogicalNulls();
}
+bool DictionaryMayHaveLogicalNulls(const ArrayData& data) {
+ return ArraySpan(data).MayHaveLogicalNulls();
+}
+
BufferSpan PackVariadicBuffers(util::span<const std::shared_ptr<Buffer>>
buffers) {
return {const_cast<uint8_t*>(reinterpret_cast<const
uint8_t*>(buffers.data())),
static_cast<int64_t>(buffers.size() *
sizeof(std::shared_ptr<Buffer>))};
@@ -174,7 +179,7 @@ int64_t ArrayData::GetNullCount() const {
}
int64_t ArrayData::ComputeLogicalNullCount() const {
- if (this->buffers[0]) {
+ if (this->buffers[0] && this->type->id() != Type::DICTIONARY) {
return GetNullCount();
}
return ArraySpan(*this).ComputeLogicalNullCount();
@@ -542,6 +547,9 @@ int64_t ArraySpan::ComputeLogicalNullCount() const {
if (t == Type::RUN_END_ENCODED) {
return ree_util::LogicalNullCount(*this);
}
+ if (t == Type::DICTIONARY) {
+ return dict_util::LogicalNullCount(*this);
+ }
return GetNullCount();
}
@@ -639,6 +647,10 @@ bool ArraySpan::RunEndEncodedMayHaveLogicalNulls() const {
return ree_util::ValuesArray(*this).MayHaveLogicalNulls();
}
+bool ArraySpan::DictionaryMayHaveLogicalNulls() const {
+ return this->GetNullCount() != 0 || this->dictionary().GetNullCount() != 0;
+}
+
// ----------------------------------------------------------------------
// Implement internal::GetArrayView
diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h
index 40a77640cd..4c2df83814 100644
--- a/cpp/src/arrow/array/data.h
+++ b/cpp/src/arrow/array/data.h
@@ -38,7 +38,7 @@ struct ArrayData;
namespace internal {
// ----------------------------------------------------------------------
-// Null handling for types without a validity bitmap
+// Null handling for types without a validity bitmap and the dictionary type
ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i);
ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i);
@@ -46,6 +46,7 @@ ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data,
int64_t i);
ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data);
ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data);
+ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data);
} // namespace internal
// When slicing, we do not know the null count of the sliced range without
@@ -280,7 +281,7 @@ struct ARROW_EXPORT ArrayData {
/// \brief Return true if the validity bitmap may have 0's in it, or if the
/// child arrays (in the case of types without a validity bitmap) may have
- /// nulls
+ /// nulls, or if the dictionary of dictionay array may have nulls.
///
/// This is not a drop-in replacement for MayHaveNulls, as historically
/// MayHaveNulls() has been used to check for the presence of a validity
@@ -325,6 +326,9 @@ struct ARROW_EXPORT ArrayData {
if (t == Type::RUN_END_ENCODED) {
return internal::RunEndEncodedMayHaveLogicalNulls(*this);
}
+ if (t == Type::DICTIONARY) {
+ return internal::DictionaryMayHaveLogicalNulls(*this);
+ }
return null_count.load() != 0;
}
@@ -505,7 +509,7 @@ struct ARROW_EXPORT ArraySpan {
/// \brief Return true if the validity bitmap may have 0's in it, or if the
/// child arrays (in the case of types without a validity bitmap) may have
- /// nulls
+ /// nulls, or if the dictionary of dictionay array may have nulls.
///
/// \see ArrayData::MayHaveLogicalNulls
bool MayHaveLogicalNulls() const {
@@ -519,6 +523,9 @@ struct ARROW_EXPORT ArraySpan {
if (t == Type::RUN_END_ENCODED) {
return RunEndEncodedMayHaveLogicalNulls();
}
+ if (t == Type::DICTIONARY) {
+ return DictionaryMayHaveLogicalNulls();
+ }
return null_count != 0;
}
@@ -560,6 +567,7 @@ struct ARROW_EXPORT ArraySpan {
bool UnionMayHaveLogicalNulls() const;
bool RunEndEncodedMayHaveLogicalNulls() const;
+ bool DictionaryMayHaveLogicalNulls() const;
};
namespace internal {
diff --git a/cpp/src/arrow/util/dict_util.cc b/cpp/src/arrow/util/dict_util.cc
new file mode 100644
index 0000000000..feab2324a4
--- /dev/null
+++ b/cpp/src/arrow/util/dict_util.cc
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/dict_util.h"
+#include "arrow/array/array_dict.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+
+namespace arrow {
+namespace dict_util {
+
+namespace {
+
+template <typename IndexArrowType>
+int64_t LogicalNullCount(const ArraySpan& span) {
+ const auto* indices_null_bit_map = span.buffers[0].data;
+ const auto& dictionary_span = span.dictionary();
+ const auto* dictionary_null_bit_map = dictionary_span.buffers[0].data;
+
+ using CType = typename IndexArrowType::c_type;
+ const CType* indices_data = span.GetValues<CType>(1);
+ int64_t null_count = 0;
+ for (int64_t i = 0; i < span.length; i++) {
+ if (indices_null_bit_map != nullptr &&
+ !bit_util::GetBit(indices_null_bit_map, i + span.offset)) {
+ null_count++;
+ continue;
+ }
+
+ CType current_index = indices_data[i];
+ if (!bit_util::GetBit(dictionary_null_bit_map,
+ current_index + dictionary_span.offset)) {
+ null_count++;
+ }
+ }
+ return null_count;
+}
+
+} // namespace
+
+int64_t LogicalNullCount(const ArraySpan& span) {
+ if (span.dictionary().GetNullCount() == 0 || span.length == 0) {
+ return span.GetNullCount();
+ }
+
+ const auto& dict_array_type = internal::checked_cast<const
DictionaryType&>(*span.type);
+ switch (dict_array_type.index_type()->id()) {
+ case Type::UINT8:
+ return LogicalNullCount<UInt8Type>(span);
+ case Type::INT8:
+ return LogicalNullCount<Int8Type>(span);
+ case Type::UINT16:
+ return LogicalNullCount<UInt16Type>(span);
+ case Type::INT16:
+ return LogicalNullCount<Int16Type>(span);
+ case Type::UINT32:
+ return LogicalNullCount<UInt32Type>(span);
+ case Type::INT32:
+ return LogicalNullCount<Int32Type>(span);
+ case Type::UINT64:
+ return LogicalNullCount<UInt64Type>(span);
+ default:
+ return LogicalNullCount<Int64Type>(span);
+ }
+}
+} // namespace dict_util
+} // namespace arrow
diff --git a/cpp/src/arrow/util/dict_util.h b/cpp/src/arrow/util/dict_util.h
new file mode 100644
index 0000000000..a92733ae0f
--- /dev/null
+++ b/cpp/src/arrow/util/dict_util.h
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/array/data.h"
+
+namespace arrow {
+namespace dict_util {
+
+int64_t LogicalNullCount(const ArraySpan& span);
+
+} // namespace dict_util
+} // namespace arrow