This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 9a005324 feat: Add ArrowArrayViewComputeNullCount (#562)
9a005324 is described below

commit 9a0053242f29106c5ca5b78db5ccb2baa3ff4f0a
Author: Benjamin Kietzman <[email protected]>
AuthorDate: Fri Jul 19 16:16:10 2024 -0500

    feat: Add ArrowArrayViewComputeNullCount (#562)
    
    Adds `ArrowArrayViewComputeNullCount()` and tests. Extracted from
    https://github.com/apache/arrow-nanoarrow/pull/555
---
 python/src/nanoarrow/_lib.pyx       |  5 +-
 src/nanoarrow/common/array_test.cc  | 93 ++++++++++++++++++++++++++++++++++++-
 src/nanoarrow/common/inline_array.h | 25 ++++++++++
 src/nanoarrow/nanoarrow.h           |  4 ++
 src/nanoarrow/nanoarrow_testing.hpp | 14 +-----
 thirdparty/zlib/CMakeLists.txt      |  1 +
 6 files changed, 124 insertions(+), 18 deletions(-)

diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index 75a168b6..7cef1d6a 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -365,10 +365,7 @@ cdef class CArrayView:
         elif validity_bits == NULL:
             self._ptr.null_count = 0
         elif self._device is DEVICE_CPU:
-            self._ptr.null_count = (
-                self._ptr.length -
-                ArrowBitCountSet(validity_bits, self.offset, self.length)
-            )
+            self._ptr.null_count = ArrowArrayViewComputeNullCount(self._ptr)
 
         return self._ptr.null_count
 
diff --git a/src/nanoarrow/common/array_test.cc 
b/src/nanoarrow/common/array_test.cc
index 7b6d37e5..e55bb6a5 100644
--- a/src/nanoarrow/common/array_test.cc
+++ b/src/nanoarrow/common/array_test.cc
@@ -1816,6 +1816,97 @@ TEST(ArrayTest, ArrayViewTestBasic) {
   ArrowArrayViewReset(&array_view);
 }
 
+TEST(ArrayTest, ArrayViewTestComputeNullCount) {
+  struct ArrowError error;
+
+  int32_t values[] = {17, 87, 23, 53};
+  uint8_t all_valid = 0b1111'1111;
+  uint8_t all_null = 0b0000'0000;
+  uint8_t half_valid = 0b1010'1010;
+  uint8_t* all_valid_because_missing = nullptr;
+
+  const void* buffers[2];
+  buffers[1] = values;
+
+  nanoarrow::UniqueArray array;
+  array->length = 4;
+  array->offset = 0;
+  array->n_buffers = 2;
+  array->n_children = 0;
+  array->buffers = buffers;
+  array->children = nullptr;
+  array->dictionary = nullptr;
+  array->release = [](struct ArrowArray*) {};
+
+  for (auto [buffer, null_count] : {
+           std::pair{&all_valid, int64_t(0)},
+           std::pair{&all_null, array->length},
+           std::pair{&half_valid, array->length / 2},
+           std::pair{all_valid_because_missing, int64_t(0)},
+       }) {
+    array->null_count = null_count;
+    buffers[0] = buffer;
+    nanoarrow::UniqueArrayView array_view;
+    ArrowArrayViewInitFromType(array_view.get(), NANOARROW_TYPE_INT32);
+    EXPECT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), 
NANOARROW_OK)
+        << error.message;
+    EXPECT_EQ(ArrowArrayViewComputeNullCount(array_view.get()), null_count);
+  }
+
+  array->length = 0;
+  array->null_count = 0;
+  buffers[0] = &all_null;
+  nanoarrow::UniqueArrayView array_view;
+  ArrowArrayViewInitFromType(array_view.get(), NANOARROW_TYPE_INT32);
+  EXPECT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), 
NANOARROW_OK)
+      << error.message;
+  EXPECT_EQ(ArrowArrayViewComputeNullCount(array_view.get()), 0);
+}
+
+TEST(ArrayTest, ArrayViewTestComputeNullCountUnion) {
+  struct ArrowError error;
+
+  // Build a simple union with one int and one string
+  nanoarrow::UniqueSchema schema;
+  ArrowSchemaInit(schema.get());
+  ASSERT_EQ(ArrowSchemaSetTypeUnion(schema.get(), NANOARROW_TYPE_DENSE_UNION, 
2),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema->children[0], NANOARROW_TYPE_INT32), 
NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema->children[1], NANOARROW_TYPE_STRING), 
NANOARROW_OK);
+
+  nanoarrow::UniqueArray array;
+  ASSERT_EQ(ArrowArrayInitFromSchema(array.get(), schema.get(), &error), 
NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStartAppending(array.get()), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendNull(array->children[0], 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishUnionElement(array.get(), 0), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendNull(array->children[1], 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishUnionElement(array.get(), 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishBuildingDefault(array.get(), &error), 
NANOARROW_OK);
+
+  nanoarrow::UniqueArrayView array_view;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), 
&error),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), 
NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewComputeNullCount(array_view.get()), 0);
+}
+
+TEST(ArrayTest, ArrayViewTestComputeNullCountNull) {
+  struct ArrowError error;
+  nanoarrow::UniqueArray array;
+  ASSERT_EQ(ArrowArrayInitFromType(array.get(), NANOARROW_TYPE_NA), 
NANOARROW_OK);
+
+  EXPECT_EQ(ArrowArrayStartAppending(array.get()), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayAppendNull(array.get(), 11), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayAppendNull(array.get(), 42), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayFinishBuildingDefault(array.get(), &error), NANOARROW_OK)
+      << error.message;
+
+  nanoarrow::UniqueArrayView array_view;
+  ArrowArrayViewInitFromType(array_view.get(), NANOARROW_TYPE_NA);
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), 
NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewComputeNullCount(array_view.get()), 53);
+}
+
 TEST(ArrayTest, ArrayViewTestMove) {
   struct ArrowArrayView array_view;
   ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_STRING);
@@ -2407,7 +2498,7 @@ TEST(ArrayTest, ArrayViewTestUnionChildIndices) {
   ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 1), NANOARROW_OK);
   ASSERT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
 
-  // The ArrayView for a union could in theroy be created without a schema.
+  // The ArrayView for a union could in theory be created without a schema.
   // Currently FULL validation will fail here since we can't guarantee that
   // these are valid.
   ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_DENSE_UNION);
diff --git a/src/nanoarrow/common/inline_array.h 
b/src/nanoarrow/common/inline_array.h
index fda778e3..9fab5b64 100644
--- a/src/nanoarrow/common/inline_array.h
+++ b/src/nanoarrow/common/inline_array.h
@@ -740,6 +740,31 @@ static inline int8_t ArrowArrayViewIsNull(const struct 
ArrowArrayView* array_vie
   }
 }
 
+static inline int64_t ArrowArrayViewComputeNullCount(
+    const struct ArrowArrayView* array_view) {
+  if (array_view->length == 0) {
+    return 0;
+  }
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_NA:
+      return array_view->length;
+    case NANOARROW_TYPE_DENSE_UNION:
+    case NANOARROW_TYPE_SPARSE_UNION:
+      // Unions are "never null" in Arrow land
+      return 0;
+    default:
+      break;
+  }
+
+  const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8;
+  if (validity_buffer == NULL) {
+    return 0;
+  }
+  return array_view->length -
+         ArrowBitCountSet(validity_buffer, array_view->offset, 
array_view->length);
+}
+
 static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* 
array_view,
                                                int64_t i) {
   switch (array_view->storage_type) {
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index 58cb877e..d19c8f9b 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -1071,6 +1071,10 @@ void ArrowArrayViewReset(struct ArrowArrayView* 
array_view);
 static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* 
array_view,
                                           int64_t i);
 
+/// \brief Compute null count for an ArrowArrayView
+static inline int64_t ArrowArrayViewComputeNullCount(
+    const struct ArrowArrayView* array_view);
+
 /// \brief Get the type id of a union array element
 static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* 
array_view,
                                                int64_t i);
diff --git a/src/nanoarrow/nanoarrow_testing.hpp 
b/src/nanoarrow/nanoarrow_testing.hpp
index 73f63298..d59bea0a 100644
--- a/src/nanoarrow/nanoarrow_testing.hpp
+++ b/src/nanoarrow/nanoarrow_testing.hpp
@@ -2013,20 +2013,8 @@ class TestingJSONReader {
       ArrowBufferView* buffer_view = array_view->buffer_views + i;
       buffer_view->data.as_uint8 = buffer->data;
       buffer_view->size_bytes = buffer->size_bytes;
-
-      // If this is a validity buffer, set the null_count
-      if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY 
&&
-          _ArrowBytesForBits(array_view->length) <= buffer_view->size_bytes) {
-        array_view->null_count =
-            array_view->length -
-            ArrowBitCountSet(buffer_view->data.as_uint8, 0, 
array_view->length);
-      }
-    }
-
-    // The null type doesn't have any buffers but we can set the null_count
-    if (array_view->storage_type == NANOARROW_TYPE_NA) {
-      array_view->null_count = array_view->length;
     }
+    array_view->null_count = ArrowArrayViewComputeNullCount(array_view);
 
     // If there is a dictionary associated with schema, parse its value into 
dictionary
     if (schema->dictionary != nullptr) {
diff --git a/thirdparty/zlib/CMakeLists.txt b/thirdparty/zlib/CMakeLists.txt
index bd95b127..5769855e 100644
--- a/thirdparty/zlib/CMakeLists.txt
+++ b/thirdparty/zlib/CMakeLists.txt
@@ -23,3 +23,4 @@ fetchcontent_makeavailable(nanoarrow_zlib)
 
 add_library(ZLIB::ZLIB ALIAS zlibstatic)
 target_include_directories(zlibstatic INTERFACE ${zlib_BINARY_DIR} 
${zlib_SOURCE_DIR})
+target_include_directories(zlib INTERFACE ${zlib_BINARY_DIR} 
${zlib_SOURCE_DIR})

Reply via email to