This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 9a005324 feat: Add ArrowArrayViewComputeNullCount (#562)
9a005324 is described below
commit 9a0053242f29106c5ca5b78db5ccb2baa3ff4f0a
Author: Benjamin Kietzman <[email protected]>
AuthorDate: Fri Jul 19 16:16:10 2024 -0500
feat: Add ArrowArrayViewComputeNullCount (#562)
Adds `ArrowArrayViewComputeNullCount()` and tests. Extracted from
https://github.com/apache/arrow-nanoarrow/pull/555
---
python/src/nanoarrow/_lib.pyx | 5 +-
src/nanoarrow/common/array_test.cc | 93 ++++++++++++++++++++++++++++++++++++-
src/nanoarrow/common/inline_array.h | 25 ++++++++++
src/nanoarrow/nanoarrow.h | 4 ++
src/nanoarrow/nanoarrow_testing.hpp | 14 +-----
thirdparty/zlib/CMakeLists.txt | 1 +
6 files changed, 124 insertions(+), 18 deletions(-)
diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index 75a168b6..7cef1d6a 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -365,10 +365,7 @@ cdef class CArrayView:
elif validity_bits == NULL:
self._ptr.null_count = 0
elif self._device is DEVICE_CPU:
- self._ptr.null_count = (
- self._ptr.length -
- ArrowBitCountSet(validity_bits, self.offset, self.length)
- )
+ self._ptr.null_count = ArrowArrayViewComputeNullCount(self._ptr)
return self._ptr.null_count
diff --git a/src/nanoarrow/common/array_test.cc
b/src/nanoarrow/common/array_test.cc
index 7b6d37e5..e55bb6a5 100644
--- a/src/nanoarrow/common/array_test.cc
+++ b/src/nanoarrow/common/array_test.cc
@@ -1816,6 +1816,97 @@ TEST(ArrayTest, ArrayViewTestBasic) {
ArrowArrayViewReset(&array_view);
}
+TEST(ArrayTest, ArrayViewTestComputeNullCount) {
+ struct ArrowError error;
+
+ int32_t values[] = {17, 87, 23, 53};
+ uint8_t all_valid = 0b1111'1111;
+ uint8_t all_null = 0b0000'0000;
+ uint8_t half_valid = 0b1010'1010;
+ uint8_t* all_valid_because_missing = nullptr;
+
+ const void* buffers[2];
+ buffers[1] = values;
+
+ nanoarrow::UniqueArray array;
+ array->length = 4;
+ array->offset = 0;
+ array->n_buffers = 2;
+ array->n_children = 0;
+ array->buffers = buffers;
+ array->children = nullptr;
+ array->dictionary = nullptr;
+ array->release = [](struct ArrowArray*) {};
+
+ for (auto [buffer, null_count] : {
+ std::pair{&all_valid, int64_t(0)},
+ std::pair{&all_null, array->length},
+ std::pair{&half_valid, array->length / 2},
+ std::pair{all_valid_because_missing, int64_t(0)},
+ }) {
+ array->null_count = null_count;
+ buffers[0] = buffer;
+ nanoarrow::UniqueArrayView array_view;
+ ArrowArrayViewInitFromType(array_view.get(), NANOARROW_TYPE_INT32);
+ EXPECT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error),
NANOARROW_OK)
+ << error.message;
+ EXPECT_EQ(ArrowArrayViewComputeNullCount(array_view.get()), null_count);
+ }
+
+ array->length = 0;
+ array->null_count = 0;
+ buffers[0] = &all_null;
+ nanoarrow::UniqueArrayView array_view;
+ ArrowArrayViewInitFromType(array_view.get(), NANOARROW_TYPE_INT32);
+ EXPECT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error),
NANOARROW_OK)
+ << error.message;
+ EXPECT_EQ(ArrowArrayViewComputeNullCount(array_view.get()), 0);
+}
+
+TEST(ArrayTest, ArrayViewTestComputeNullCountUnion) {
+ struct ArrowError error;
+
+ // Build a simple union with one int and one string
+ nanoarrow::UniqueSchema schema;
+ ArrowSchemaInit(schema.get());
+ ASSERT_EQ(ArrowSchemaSetTypeUnion(schema.get(), NANOARROW_TYPE_DENSE_UNION,
2),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetType(schema->children[0], NANOARROW_TYPE_INT32),
NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetType(schema->children[1], NANOARROW_TYPE_STRING),
NANOARROW_OK);
+
+ nanoarrow::UniqueArray array;
+ ASSERT_EQ(ArrowArrayInitFromSchema(array.get(), schema.get(), &error),
NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayStartAppending(array.get()), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendNull(array->children[0], 1), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayFinishUnionElement(array.get(), 0), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendNull(array->children[1], 1), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayFinishUnionElement(array.get(), 1), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayFinishBuildingDefault(array.get(), &error),
NANOARROW_OK);
+
+ nanoarrow::UniqueArrayView array_view;
+ ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(),
&error),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error),
NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayViewComputeNullCount(array_view.get()), 0);
+}
+
+TEST(ArrayTest, ArrayViewTestComputeNullCountNull) {
+ struct ArrowError error;
+ nanoarrow::UniqueArray array;
+ ASSERT_EQ(ArrowArrayInitFromType(array.get(), NANOARROW_TYPE_NA),
NANOARROW_OK);
+
+ EXPECT_EQ(ArrowArrayStartAppending(array.get()), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayAppendNull(array.get(), 11), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayAppendNull(array.get(), 42), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayFinishBuildingDefault(array.get(), &error), NANOARROW_OK)
+ << error.message;
+
+ nanoarrow::UniqueArrayView array_view;
+ ArrowArrayViewInitFromType(array_view.get(), NANOARROW_TYPE_NA);
+ ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error),
NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayViewComputeNullCount(array_view.get()), 53);
+}
+
TEST(ArrayTest, ArrayViewTestMove) {
struct ArrowArrayView array_view;
ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_STRING);
@@ -2407,7 +2498,7 @@ TEST(ArrayTest, ArrayViewTestUnionChildIndices) {
ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 1), NANOARROW_OK);
ASSERT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
- // The ArrayView for a union could in theroy be created without a schema.
+ // The ArrayView for a union could in theory be created without a schema.
// Currently FULL validation will fail here since we can't guarantee that
// these are valid.
ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_DENSE_UNION);
diff --git a/src/nanoarrow/common/inline_array.h
b/src/nanoarrow/common/inline_array.h
index fda778e3..9fab5b64 100644
--- a/src/nanoarrow/common/inline_array.h
+++ b/src/nanoarrow/common/inline_array.h
@@ -740,6 +740,31 @@ static inline int8_t ArrowArrayViewIsNull(const struct
ArrowArrayView* array_vie
}
}
+static inline int64_t ArrowArrayViewComputeNullCount(
+ const struct ArrowArrayView* array_view) {
+ if (array_view->length == 0) {
+ return 0;
+ }
+
+ switch (array_view->storage_type) {
+ case NANOARROW_TYPE_NA:
+ return array_view->length;
+ case NANOARROW_TYPE_DENSE_UNION:
+ case NANOARROW_TYPE_SPARSE_UNION:
+ // Unions are "never null" in Arrow land
+ return 0;
+ default:
+ break;
+ }
+
+ const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8;
+ if (validity_buffer == NULL) {
+ return 0;
+ }
+ return array_view->length -
+ ArrowBitCountSet(validity_buffer, array_view->offset,
array_view->length);
+}
+
static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView*
array_view,
int64_t i) {
switch (array_view->storage_type) {
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index 58cb877e..d19c8f9b 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -1071,6 +1071,10 @@ void ArrowArrayViewReset(struct ArrowArrayView*
array_view);
static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView*
array_view,
int64_t i);
+/// \brief Compute null count for an ArrowArrayView
+static inline int64_t ArrowArrayViewComputeNullCount(
+ const struct ArrowArrayView* array_view);
+
/// \brief Get the type id of a union array element
static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView*
array_view,
int64_t i);
diff --git a/src/nanoarrow/nanoarrow_testing.hpp
b/src/nanoarrow/nanoarrow_testing.hpp
index 73f63298..d59bea0a 100644
--- a/src/nanoarrow/nanoarrow_testing.hpp
+++ b/src/nanoarrow/nanoarrow_testing.hpp
@@ -2013,20 +2013,8 @@ class TestingJSONReader {
ArrowBufferView* buffer_view = array_view->buffer_views + i;
buffer_view->data.as_uint8 = buffer->data;
buffer_view->size_bytes = buffer->size_bytes;
-
- // If this is a validity buffer, set the null_count
- if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY
&&
- _ArrowBytesForBits(array_view->length) <= buffer_view->size_bytes) {
- array_view->null_count =
- array_view->length -
- ArrowBitCountSet(buffer_view->data.as_uint8, 0,
array_view->length);
- }
- }
-
- // The null type doesn't have any buffers but we can set the null_count
- if (array_view->storage_type == NANOARROW_TYPE_NA) {
- array_view->null_count = array_view->length;
}
+ array_view->null_count = ArrowArrayViewComputeNullCount(array_view);
// If there is a dictionary associated with schema, parse its value into
dictionary
if (schema->dictionary != nullptr) {
diff --git a/thirdparty/zlib/CMakeLists.txt b/thirdparty/zlib/CMakeLists.txt
index bd95b127..5769855e 100644
--- a/thirdparty/zlib/CMakeLists.txt
+++ b/thirdparty/zlib/CMakeLists.txt
@@ -23,3 +23,4 @@ fetchcontent_makeavailable(nanoarrow_zlib)
add_library(ZLIB::ZLIB ALIAS zlibstatic)
target_include_directories(zlibstatic INTERFACE ${zlib_BINARY_DIR}
${zlib_SOURCE_DIR})
+target_include_directories(zlib INTERFACE ${zlib_BINARY_DIR}
${zlib_SOURCE_DIR})