This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new b1f1ef4cda GH-38457: [C++] Support LogicalNullCount for 
DictionaryArray (#38681)
b1f1ef4cda is described below

commit b1f1ef4cda17cabede37471f809e07119699fbdb
Author: Junming Chen <[email protected]>
AuthorDate: Mon Nov 27 22:32:27 2023 +0800

    GH-38457: [C++] Support LogicalNullCount for DictionaryArray (#38681)
    
    ### Rationale for this change
    Currently, the `null_count` only returns the number of null values in 
`indices`. We want to count the real nulls as the dictionary array is decoded.
    
    ### What changes are included in this PR?
    Add `DictionaryMayHaveLogicalNulls` and `ComputeLogicalNullCount` for 
DictionaryArray
    
    ### Are these changes tested?
    Yes
    
    ### Are there any user-facing changes?
    No.
    * Closes: #38457
    
    Lead-authored-by: Junming Chen <[email protected]>
    Co-authored-by: Benjamin Kietzman <[email protected]>
    Co-authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Benjamin Kietzman <[email protected]>
---
 cpp/src/arrow/CMakeLists.txt      |  1 +
 cpp/src/arrow/array/array_test.cc | 57 +++++++++++++++++++++++++++
 cpp/src/arrow/array/data.cc       | 14 ++++++-
 cpp/src/arrow/array/data.h        | 14 +++++--
 cpp/src/arrow/util/dict_util.cc   | 81 +++++++++++++++++++++++++++++++++++++++
 cpp/src/arrow/util/dict_util.h    | 28 ++++++++++++++
 6 files changed, 191 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index b64194aa6e..46a7aa9106 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -223,6 +223,7 @@ set(ARROW_SRCS
     util/debug.cc
     util/decimal.cc
     util/delimiting.cc
+    util/dict_util.cc
     util/float16.cc
     util/formatting.cc
     util/future.cc
diff --git a/cpp/src/arrow/array/array_test.cc 
b/cpp/src/arrow/array/array_test.cc
index be54d62fd7..974eb54d2c 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -80,6 +80,22 @@ class TestArray : public ::testing::Test {
   MemoryPool* pool_;
 };
 
+void CheckDictionaryNullCount(const std::shared_ptr<DataType>& dict_type,
+                              const std::string& input_dictionary_json,
+                              const std::string& input_index_json,
+                              const int64_t& expected_null_count,
+                              const int64_t& expected_logical_null_count,
+                              bool expected_may_have_nulls,
+                              bool expected_may_have_logical_nulls) {
+  std::shared_ptr<arrow::Array> arr =
+      DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json);
+
+  ASSERT_EQ(arr->null_count(), expected_null_count);
+  ASSERT_EQ(arr->ComputeLogicalNullCount(), expected_logical_null_count);
+  ASSERT_EQ(arr->data()->MayHaveNulls(), expected_may_have_nulls);
+  ASSERT_EQ(arr->data()->MayHaveLogicalNulls(), 
expected_may_have_logical_nulls);
+}
+
 TEST_F(TestArray, TestNullCount) {
   // These are placeholders
   auto data = std::make_shared<Buffer>(nullptr, 0);
@@ -127,6 +143,37 @@ TEST_F(TestArray, TestNullCount) {
   ASSERT_EQ(0, ree_no_nulls->ComputeLogicalNullCount());
   ASSERT_FALSE(ree_no_nulls->data()->MayHaveNulls());
   ASSERT_FALSE(ree_no_nulls->data()->MayHaveLogicalNulls());
+
+  // Dictionary type
+  std::shared_ptr<arrow::DataType> type;
+  std::shared_ptr<arrow::DataType> dict_type;
+
+  for (const auto& index_type : all_dictionary_index_types()) {
+    ARROW_SCOPED_TRACE("index_type = ", index_type->ToString());
+
+    type = boolean();
+    dict_type = dictionary(index_type, type);
+    // no null value
+    CheckDictionaryNullCount(dict_type, "[]", "[]", 0, 0, false, false);
+    CheckDictionaryNullCount(dict_type, "[true, false]", "[0, 1, 0]", 0, 0, 
false, false);
+
+    // only indices contain null value
+    CheckDictionaryNullCount(dict_type, "[true, false]", "[null, 0, 1]", 1, 1, 
true,
+                             true);
+    CheckDictionaryNullCount(dict_type, "[true, false]", "[null, null]", 2, 2, 
true,
+                             true);
+
+    // only dictionary contains null value
+    CheckDictionaryNullCount(dict_type, "[null, true]", "[]", 0, 0, false, 
true);
+    CheckDictionaryNullCount(dict_type, "[null, true, false]", "[0, 1, 0]", 0, 
2, false,
+                             true);
+
+    // both indices and dictionary contain null value
+    CheckDictionaryNullCount(dict_type, "[null, true, false]", "[0, 1, 0, 
null]", 1, 3,
+                             true, true);
+    CheckDictionaryNullCount(dict_type, "[null, true, null, false]", "[null, 
1, 0, 2, 3]",
+                             1, 3, true, true);
+  }
 }
 
 TEST_F(TestArray, TestSlicePreservesAllNullCount) {
@@ -137,6 +184,16 @@ TEST_F(TestArray, TestSlicePreservesAllNullCount) {
   Int32Array arr(/*length=*/100, data, null_bitmap,
                  /*null_count*/ 100);
   EXPECT_EQ(arr.Slice(1, 99)->data()->null_count, arr.Slice(1, 99)->length());
+
+  // Dictionary type
+  std::shared_ptr<arrow::DataType> dict_type = dictionary(int64(), boolean());
+  std::shared_ptr<arrow::Array> dict_arr =
+      DictArrayFromJSON(dict_type, /*indices=*/"[null, 0, 0, 0, 0, 0, 1, 2, 0, 
0]",
+                        /*dictionary=*/"[null, true, false]");
+  ASSERT_EQ(dict_arr->null_count(), 1);
+  ASSERT_EQ(dict_arr->ComputeLogicalNullCount(), 8);
+  ASSERT_EQ(dict_arr->Slice(2, 8)->null_count(), 0);
+  ASSERT_EQ(dict_arr->Slice(2, 8)->ComputeLogicalNullCount(), 6);
 }
 
 TEST_F(TestArray, TestLength) {
diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc
index 3241dc5518..c002c0817b 100644
--- a/cpp/src/arrow/array/data.cc
+++ b/cpp/src/arrow/array/data.cc
@@ -33,6 +33,7 @@
 #include "arrow/type_traits.h"
 #include "arrow/util/binary_view_util.h"
 #include "arrow/util/bitmap_ops.h"
+#include "arrow/util/dict_util.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/ree_util.h"
@@ -93,6 +94,10 @@ bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) 
{
   return ArraySpan(data).MayHaveLogicalNulls();
 }
 
+bool DictionaryMayHaveLogicalNulls(const ArrayData& data) {
+  return ArraySpan(data).MayHaveLogicalNulls();
+}
+
 BufferSpan PackVariadicBuffers(util::span<const std::shared_ptr<Buffer>> 
buffers) {
   return {const_cast<uint8_t*>(reinterpret_cast<const 
uint8_t*>(buffers.data())),
           static_cast<int64_t>(buffers.size() * 
sizeof(std::shared_ptr<Buffer>))};
@@ -174,7 +179,7 @@ int64_t ArrayData::GetNullCount() const {
 }
 
 int64_t ArrayData::ComputeLogicalNullCount() const {
-  if (this->buffers[0]) {
+  if (this->buffers[0] && this->type->id() != Type::DICTIONARY) {
     return GetNullCount();
   }
   return ArraySpan(*this).ComputeLogicalNullCount();
@@ -542,6 +547,9 @@ int64_t ArraySpan::ComputeLogicalNullCount() const {
   if (t == Type::RUN_END_ENCODED) {
     return ree_util::LogicalNullCount(*this);
   }
+  if (t == Type::DICTIONARY) {
+    return dict_util::LogicalNullCount(*this);
+  }
   return GetNullCount();
 }
 
@@ -639,6 +647,10 @@ bool ArraySpan::RunEndEncodedMayHaveLogicalNulls() const {
   return ree_util::ValuesArray(*this).MayHaveLogicalNulls();
 }
 
+bool ArraySpan::DictionaryMayHaveLogicalNulls() const {
+  return this->GetNullCount() != 0 || this->dictionary().GetNullCount() != 0;
+}
+
 // ----------------------------------------------------------------------
 // Implement internal::GetArrayView
 
diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h
index 40a77640cd..4c2df83814 100644
--- a/cpp/src/arrow/array/data.h
+++ b/cpp/src/arrow/array/data.h
@@ -38,7 +38,7 @@ struct ArrayData;
 
 namespace internal {
 // ----------------------------------------------------------------------
-// Null handling for types without a validity bitmap
+// Null handling for types without a validity bitmap and the dictionary type
 
 ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i);
 ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i);
@@ -46,6 +46,7 @@ ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, 
int64_t i);
 
 ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data);
 ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data);
+ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data);
 }  // namespace internal
 
 // When slicing, we do not know the null count of the sliced range without
@@ -280,7 +281,7 @@ struct ARROW_EXPORT ArrayData {
 
   /// \brief Return true if the validity bitmap may have 0's in it, or if the
   /// child arrays (in the case of types without a validity bitmap) may have
-  /// nulls
+  /// nulls, or if the dictionary of dictionay array may have nulls.
   ///
   /// This is not a drop-in replacement for MayHaveNulls, as historically
   /// MayHaveNulls() has been used to check for the presence of a validity
@@ -325,6 +326,9 @@ struct ARROW_EXPORT ArrayData {
     if (t == Type::RUN_END_ENCODED) {
       return internal::RunEndEncodedMayHaveLogicalNulls(*this);
     }
+    if (t == Type::DICTIONARY) {
+      return internal::DictionaryMayHaveLogicalNulls(*this);
+    }
     return null_count.load() != 0;
   }
 
@@ -505,7 +509,7 @@ struct ARROW_EXPORT ArraySpan {
 
   /// \brief Return true if the validity bitmap may have 0's in it, or if the
   /// child arrays (in the case of types without a validity bitmap) may have
-  /// nulls
+  /// nulls, or if the dictionary of dictionay array may have nulls.
   ///
   /// \see ArrayData::MayHaveLogicalNulls
   bool MayHaveLogicalNulls() const {
@@ -519,6 +523,9 @@ struct ARROW_EXPORT ArraySpan {
     if (t == Type::RUN_END_ENCODED) {
       return RunEndEncodedMayHaveLogicalNulls();
     }
+    if (t == Type::DICTIONARY) {
+      return DictionaryMayHaveLogicalNulls();
+    }
     return null_count != 0;
   }
 
@@ -560,6 +567,7 @@ struct ARROW_EXPORT ArraySpan {
 
   bool UnionMayHaveLogicalNulls() const;
   bool RunEndEncodedMayHaveLogicalNulls() const;
+  bool DictionaryMayHaveLogicalNulls() const;
 };
 
 namespace internal {
diff --git a/cpp/src/arrow/util/dict_util.cc b/cpp/src/arrow/util/dict_util.cc
new file mode 100644
index 0000000000..feab2324a4
--- /dev/null
+++ b/cpp/src/arrow/util/dict_util.cc
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/dict_util.h"
+#include "arrow/array/array_dict.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+
+namespace arrow {
+namespace dict_util {
+
+namespace {
+
+template <typename IndexArrowType>
+int64_t LogicalNullCount(const ArraySpan& span) {
+  const auto* indices_null_bit_map = span.buffers[0].data;
+  const auto& dictionary_span = span.dictionary();
+  const auto* dictionary_null_bit_map = dictionary_span.buffers[0].data;
+
+  using CType = typename IndexArrowType::c_type;
+  const CType* indices_data = span.GetValues<CType>(1);
+  int64_t null_count = 0;
+  for (int64_t i = 0; i < span.length; i++) {
+    if (indices_null_bit_map != nullptr &&
+        !bit_util::GetBit(indices_null_bit_map, i + span.offset)) {
+      null_count++;
+      continue;
+    }
+
+    CType current_index = indices_data[i];
+    if (!bit_util::GetBit(dictionary_null_bit_map,
+                          current_index + dictionary_span.offset)) {
+      null_count++;
+    }
+  }
+  return null_count;
+}
+
+}  // namespace
+
+int64_t LogicalNullCount(const ArraySpan& span) {
+  if (span.dictionary().GetNullCount() == 0 || span.length == 0) {
+    return span.GetNullCount();
+  }
+
+  const auto& dict_array_type = internal::checked_cast<const 
DictionaryType&>(*span.type);
+  switch (dict_array_type.index_type()->id()) {
+    case Type::UINT8:
+      return LogicalNullCount<UInt8Type>(span);
+    case Type::INT8:
+      return LogicalNullCount<Int8Type>(span);
+    case Type::UINT16:
+      return LogicalNullCount<UInt16Type>(span);
+    case Type::INT16:
+      return LogicalNullCount<Int16Type>(span);
+    case Type::UINT32:
+      return LogicalNullCount<UInt32Type>(span);
+    case Type::INT32:
+      return LogicalNullCount<Int32Type>(span);
+    case Type::UINT64:
+      return LogicalNullCount<UInt64Type>(span);
+    default:
+      return LogicalNullCount<Int64Type>(span);
+  }
+}
+}  // namespace dict_util
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/dict_util.h b/cpp/src/arrow/util/dict_util.h
new file mode 100644
index 0000000000..a92733ae0f
--- /dev/null
+++ b/cpp/src/arrow/util/dict_util.h
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/array/data.h"
+
+namespace arrow {
+namespace dict_util {
+
+int64_t LogicalNullCount(const ArraySpan& span);
+
+}  // namespace dict_util
+}  // namespace arrow

Reply via email to