This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 20e5bae  feat(extensions/nanoarrow_ipc): Add option to validate arrays 
at `NANOARROW_VALIDATION_LEVEL_FULL` (#177)
20e5bae is described below

commit 20e5bae8db24737857e725801d6a965ed21a4f7a
Author: Dewey Dunnington <[email protected]>
AuthorDate: Thu Mar 30 09:49:57 2023 -0400

    feat(extensions/nanoarrow_ipc): Add option to validate arrays at 
`NANOARROW_VALIDATION_LEVEL_FULL` (#177)
    
    ...and do it by default in tests and in the reader.
    
    I am going to punt on full validation of unions with custom type IDs
    that are constructed using ArrowArraySetBuffer() (#178) because it
    involves a tiny bit of rethinking (there is currently no place to store
    custom type ids in the ArrowArray private data).
---
 .../nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc.h    | 11 ++++++++
 .../src/nanoarrow/nanoarrow_ipc_decoder.c          | 13 +++++++---
 .../src/nanoarrow/nanoarrow_ipc_decoder_test.cc    | 30 +++++++++++++++++++---
 .../src/nanoarrow/nanoarrow_ipc_reader.c           | 14 ++++++++--
 src/nanoarrow/array.c                              | 17 +++++++-----
 5 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc.h 
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc.h
index 3fa4163..85a765a 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc.h
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc.h
@@ -43,6 +43,8 @@
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArray)
 #define ArrowIpcDecoderDecodeArrayFromShared \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArrayFromShared)
+#define ArrowIpcDecoderValidateArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderValidateArray)
 #define ArrowIpcDecoderSetSchema \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderSetSchema)
 #define ArrowIpcDecoderSetEndianness \
@@ -275,6 +277,15 @@ ArrowErrorCode ArrowIpcDecoderDecodeArrayFromShared(struct 
ArrowIpcDecoder* deco
                                                     int64_t i, struct 
ArrowArray* out,
                                                     struct ArrowError* error);
 
+/// \brief Validate a decoded ArrowArray
+///
+/// Verifies buffer lengths and contents depending on validation_level. Users
+/// are reccomended to use NANOARROW_VALIDATION_LEVEL_FULL as any lesser value
+/// may result in some types of corrupted data crashing a process on read.
+ArrowErrorCode ArrowIpcDecoderValidateArray(struct ArrowArray* decoded,
+                                            enum ArrowValidationLevel 
validation_level,
+                                            struct ArrowError* error);
+
 /// \brief An user-extensible input data source
 struct ArrowIpcInputStream {
   /// \brief Read up to buf_size_bytes from stream into buf
diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c 
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
index db4169c..639422e 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
@@ -1378,10 +1378,9 @@ static ArrowErrorCode ArrowIpcDecoderDecodeArrayInternal(
     }
   }
 
-  // TODO: this performs some validation but doesn't do everything we need it 
to do
-  // notably it doesn't loop over offset buffers to look for values that will 
cause
-  // out-of-bounds buffer access on the data buffer or child arrays.
-  result = ArrowArrayFinishBuildingDefault(&temp, error);
+  // Finish building to flush internal pointers but defer validation to
+  // ArrowIpcDecoderValidateArray()
+  result = ArrowArrayFinishBuilding(&temp, NANOARROW_VALIDATION_LEVEL_NONE, 
error);
   if (result != NANOARROW_OK) {
     temp.release(&temp);
     return result;
@@ -1406,3 +1405,9 @@ ArrowErrorCode 
ArrowIpcDecoderDecodeArrayFromShared(struct ArrowIpcDecoder* deco
   return ArrowIpcDecoderDecodeArrayInternal(
       decoder, ArrowIpcBufferFactoryFromShared(body), i, out, error);
 }
+
+ArrowErrorCode ArrowIpcDecoderValidateArray(struct ArrowArray* decoded,
+                                            enum ArrowValidationLevel 
validation_level,
+                                            struct ArrowError* error) {
+  return ArrowArrayFinishBuilding(decoded, validation_level, error);
+}
diff --git 
a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc 
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
index d125bf4..824a339 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
@@ -295,6 +295,9 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeSimpleRecordBatch) 
{
   // Check full struct extract
   EXPECT_EQ(ArrowIpcDecoderDecodeArray(&decoder, body, -1, &array, nullptr),
             NANOARROW_OK);
+  EXPECT_EQ(
+      ArrowIpcDecoderValidateArray(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
nullptr),
+      NANOARROW_OK);
   EXPECT_EQ(array.length, 3);
   EXPECT_EQ(array.null_count, 0);
   ASSERT_EQ(array.n_children, 1);
@@ -309,6 +312,9 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeSimpleRecordBatch) 
{
 
   // Check field extract
   EXPECT_EQ(ArrowIpcDecoderDecodeArray(&decoder, body, 0, &array, nullptr), 
NANOARROW_OK);
+  EXPECT_EQ(
+      ArrowIpcDecoderValidateArray(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
nullptr),
+      NANOARROW_OK);
   ASSERT_EQ(array.n_buffers, 2);
   ASSERT_EQ(array.length, 3);
   EXPECT_EQ(array.null_count, 0);
@@ -478,6 +484,9 @@ TEST(NanoarrowIpcTest, 
NanoarrowIpcDecodeSimpleRecordBatchFromShared) {
   // Check full struct extract
   EXPECT_EQ(ArrowIpcDecoderDecodeArrayFromShared(&decoder, &shared, -1, 
&array, nullptr),
             NANOARROW_OK);
+  EXPECT_EQ(
+      ArrowIpcDecoderValidateArray(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
nullptr),
+      NANOARROW_OK);
 
   EXPECT_EQ(array.length, 3);
   EXPECT_EQ(array.null_count, 0);
@@ -494,6 +503,10 @@ TEST(NanoarrowIpcTest, 
NanoarrowIpcDecodeSimpleRecordBatchFromShared) {
   // Check field extract
   EXPECT_EQ(ArrowIpcDecoderDecodeArrayFromShared(&decoder, &shared, 0, &array, 
nullptr),
             NANOARROW_OK);
+  EXPECT_EQ(
+      ArrowIpcDecoderValidateArray(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
nullptr),
+      NANOARROW_OK);
+
   // Release the original shared (forthcoming array buffers should still be 
valid)
   ArrowIpcSharedBufferReset(&shared);
 
@@ -543,8 +556,12 @@ TEST(NanoarrowIpcTest, 
NanoarrowIpcSharedBufferThreadSafeDecode) {
 
   struct ArrowArray arrays[10];
   for (int i = 0; i < 10; i++) {
-    ASSERT_EQ(ArrowIpcDecoderDecodeArrayFromShared(&decoder, &shared, -1, 
arrays + i, nullptr),
-            NANOARROW_OK);
+    ASSERT_EQ(
+        ArrowIpcDecoderDecodeArrayFromShared(&decoder, &shared, -1, arrays + 
i, nullptr),
+        NANOARROW_OK);
+    ASSERT_EQ(ArrowIpcDecoderValidateArray(arrays + i, 
NANOARROW_VALIDATION_LEVEL_FULL,
+                                           nullptr),
+              NANOARROW_OK);
   }
 
   // Clean up
@@ -556,7 +573,8 @@ TEST(NanoarrowIpcTest, 
NanoarrowIpcSharedBufferThreadSafeDecode) {
   std::thread threads[10];
   for (int i = 0; i < 10; i++) {
     threads[i] = std::thread([&arrays, i, &one_two_three_le] {
-      memcmp(arrays[i].children[0]->buffers[1], one_two_three_le, 
sizeof(one_two_three_le));
+      memcmp(arrays[i].children[0]->buffers[1], one_two_three_le,
+             sizeof(one_two_three_le));
       arrays[i].release(arrays + i);
     });
   }
@@ -605,6 +623,9 @@ TEST_P(ArrowTypeParameterizedTestFixture, 
NanoarrowIpcArrowArrayRoundtrip) {
   buffer_view.size_bytes -= decoder.header_size_bytes;
   ASSERT_EQ(ArrowIpcDecoderDecodeArray(&decoder, buffer_view, -1, &array, 
nullptr),
             NANOARROW_OK);
+  ASSERT_EQ(
+      ArrowIpcDecoderValidateArray(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
nullptr),
+      NANOARROW_OK);
 
   auto maybe_batch = arrow::ImportRecordBatch(&array, dummy_schema);
   ASSERT_TRUE(maybe_batch.ok());
@@ -622,6 +643,9 @@ TEST_P(ArrowTypeParameterizedTestFixture, 
NanoarrowIpcArrowArrayRoundtrip) {
   buffer_view.size_bytes -= decoder.header_size_bytes;
   ASSERT_EQ(ArrowIpcDecoderDecodeArray(&decoder, buffer_view, -1, &array, 
nullptr),
             NANOARROW_OK);
+  ASSERT_EQ(
+      ArrowIpcDecoderValidateArray(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
nullptr),
+      NANOARROW_OK);
 
   maybe_batch = arrow::ImportRecordBatch(&array, dummy_schema);
   ASSERT_TRUE(maybe_batch.ok());
diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_reader.c 
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_reader.c
index 1b2ad87..0e457c7 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_reader.c
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_reader.c
@@ -365,11 +365,13 @@ static int ArrowIpcArrayStreamReaderGetNext(struct 
ArrowArrayStream* stream,
   // Read in the body
   NANOARROW_RETURN_NOT_OK(ArrowIpcArrayStreamReaderNextBody(private_data));
 
+  struct ArrowArray tmp;
+
   if (private_data->use_shared_buffers) {
     struct ArrowIpcSharedBuffer shared;
     NANOARROW_RETURN_NOT_OK(ArrowIpcSharedBufferInit(&shared, 
&private_data->body));
     NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArrayFromShared(
-        &private_data->decoder, &shared, private_data->field_index, out,
+        &private_data->decoder, &shared, private_data->field_index, &tmp,
         &private_data->error));
     ArrowIpcSharedBufferReset(&shared);
   } else {
@@ -378,10 +380,18 @@ static int ArrowIpcArrayStreamReaderGetNext(struct 
ArrowArrayStream* stream,
     body_view.size_bytes = private_data->body.size_bytes;
 
     NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArray(&private_data->decoder, 
body_view,
-                                                       
private_data->field_index, out,
+                                                       
private_data->field_index, &tmp,
                                                        &private_data->error));
   }
 
+  result = ArrowIpcDecoderValidateArray(&tmp, NANOARROW_VALIDATION_LEVEL_FULL,
+                                        &private_data->error);
+  if (result != NANOARROW_OK) {
+    tmp.release(&tmp);
+    return result;
+  }
+
+  ArrowArrayMove(&tmp, out);
   return NANOARROW_OK;
 }
 
diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c
index fc88fd6..30422d2 100644
--- a/src/nanoarrow/array.c
+++ b/src/nanoarrow/array.c
@@ -930,11 +930,15 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct 
ArrowArrayView* array_view,
 
   if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION ||
       array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) {
-    // Check that we have valid type ids
-    if (array_view->union_type_id_map == NULL ||
-        
_ArrowParsedUnionTypeIdsWillEqualChildIndices(array_view->union_type_id_map,
-                                                      array_view->n_children,
-                                                      array_view->n_children)) 
{
+    // Check that we have valid type ids.
+    if (array_view->union_type_id_map == NULL) {
+      // If the union_type_id map is NULL
+      // (e.g., when using ArrowArrayInitFromType() + 
ArrowArrayAllocateChildren()
+      // + ArrowArrayFinishBuilding()), we don't have enough information to 
validate
+      // this buffer (GH-178).
+    } else if (_ArrowParsedUnionTypeIdsWillEqualChildIndices(
+                   array_view->union_type_id_map, array_view->n_children,
+                   array_view->n_children)) {
       
NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8(array_view->buffer_views[0], 0,
                                                    array_view->n_children - 1, 
error));
     } else {
@@ -944,7 +948,8 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct 
ArrowArrayView* array_view,
     }
   }
 
-  if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) {
+  if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION &&
+      array_view->union_type_id_map != NULL) {
     // Check that offsets refer to child elements that actually exist
     for (int64_t i = 0; i < array_view->array->length; i++) {
       int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i);

Reply via email to