This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new be3b2b1  feat: Implement `ArrowArrayViewValidateFull()` (#174)
be3b2b1 is described below

commit be3b2b13a7e2ad977f6a1cad9a157e617bf42dcc
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed Mar 29 08:05:50 2023 -0400

    feat: Implement `ArrowArrayViewValidateFull()` (#174)
    
    This is required for IPC reading because corrupted offset and/or union
    type ID buffers could result in consumers accessing out-of-bounds
    elements. `ArrowArrayViewSetArray()` already checked the *last* element
    of offset buffers against lengths but didn't check the first element and
    didn't check for negative sequential offsets.
    
    ---------
    
    Co-authored-by: Will Jones <[email protected]>
---
 src/nanoarrow/array.c           | 179 ++++++++++++++++++++++++++++++++++++++++
 src/nanoarrow/array_inline.h    |  14 +++-
 src/nanoarrow/array_test.cc     | 140 +++++++++++++++++++++++++++++++
 src/nanoarrow/nanoarrow.h       |   6 ++
 src/nanoarrow/nanoarrow_types.h |   3 +-
 5 files changed, 337 insertions(+), 5 deletions(-)

diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c
index 976b39e..81675bb 100644
--- a/src/nanoarrow/array.c
+++ b/src/nanoarrow/array.c
@@ -622,6 +622,21 @@ ArrowErrorCode ArrowArrayViewSetArray(struct 
ArrowArrayView* array_view,
                                       struct ArrowArray* array,
                                       struct ArrowError* error) {
   array_view->array = array;
+
+  // Check length and offset
+  if (array->offset < 0) {
+    ArrowErrorSet(error, "Expected array offset >= 0 but found array offset of 
%ld",
+                  (long)array->offset);
+    return EINVAL;
+  }
+
+  if (array->length < 0) {
+    ArrowErrorSet(error, "Expected array length >= 0 but found array length of 
%ld",
+                  (long)array->length);
+    return EINVAL;
+  }
+
+  // First pass setting lengths that do not depend on the data buffer
   ArrowArrayViewSetLength(array_view, array->offset + array->length);
 
   int64_t buffers_required = 0;
@@ -648,15 +663,25 @@ ArrowErrorCode ArrowArrayViewSetArray(struct 
ArrowArrayView* array_view,
   }
 
   if (array_view->n_children != array->n_children) {
+    ArrowErrorSet(error, "Expected %ld children but found %ld children",
+                  (long)array_view->n_children, (long)array->n_children);
     return EINVAL;
   }
 
   // Check child sizes and calculate sizes that depend on data in the array 
buffers
+  int64_t first_offset;
   int64_t last_offset;
   switch (array_view->storage_type) {
     case NANOARROW_TYPE_STRING:
     case NANOARROW_TYPE_BINARY:
       if (array_view->buffer_views[1].size_bytes != 0) {
+        first_offset = array_view->buffer_views[1].data.as_int32[0];
+        if (first_offset < 0) {
+          ArrowErrorSet(error, "Expected first offset >= 0 but found %ld",
+                        (long)first_offset);
+          return EINVAL;
+        }
+
         last_offset =
             array_view->buffer_views[1].data.as_int32[array->offset + 
array->length];
         array_view->buffer_views[2].size_bytes = last_offset;
@@ -665,6 +690,13 @@ ArrowErrorCode ArrowArrayViewSetArray(struct 
ArrowArrayView* array_view,
     case NANOARROW_TYPE_LARGE_STRING:
     case NANOARROW_TYPE_LARGE_BINARY:
       if (array_view->buffer_views[1].size_bytes != 0) {
+        first_offset = array_view->buffer_views[1].data.as_int64[0];
+        if (first_offset < 0) {
+          ArrowErrorSet(error, "Expected first offset >= 0 but found %ld",
+                        (long)first_offset);
+          return EINVAL;
+        }
+
         last_offset =
             array_view->buffer_views[1].data.as_int64[array->offset + 
array->length];
         array_view->buffer_views[2].size_bytes = last_offset;
@@ -694,6 +726,13 @@ ArrowErrorCode ArrowArrayViewSetArray(struct 
ArrowArrayView* array_view,
       }
 
       if (array_view->buffer_views[1].size_bytes != 0) {
+        first_offset = array_view->buffer_views[1].data.as_int32[0];
+        if (first_offset < 0) {
+          ArrowErrorSet(error, "Expected first offset >= 0 but found %ld",
+                        (long)first_offset);
+          return EINVAL;
+        }
+
         last_offset =
             array_view->buffer_views[1].data.as_int32[array->offset + 
array->length];
         if (array->children[0]->length < last_offset) {
@@ -716,6 +755,13 @@ ArrowErrorCode ArrowArrayViewSetArray(struct 
ArrowArrayView* array_view,
       }
 
       if (array_view->buffer_views[1].size_bytes != 0) {
+        first_offset = array_view->buffer_views[1].data.as_int64[0];
+        if (first_offset < 0) {
+          ArrowErrorSet(error, "Expected first offset >= 0 but found %ld",
+                        (long)first_offset);
+          return EINVAL;
+        }
+
         last_offset =
             array_view->buffer_views[1].data.as_int64[array->offset + 
array->length];
         if (array->children[0]->length < last_offset) {
@@ -758,3 +804,136 @@ ArrowErrorCode ArrowArrayViewSetArray(struct 
ArrowArrayView* array_view,
 
   return NANOARROW_OK;
 }
+
+static int ArrowAssertIncreasingInt32(struct ArrowBufferView view,
+                                      struct ArrowError* error) {
+  if (view.size_bytes <= (int64_t)sizeof(int32_t)) {
+    return NANOARROW_OK;
+  }
+
+  for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) {
+    int32_t diff = view.data.as_int32[i] - view.data.as_int32[i - 1];
+    if (diff < 0) {
+      ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element 
size %ld",
+                    (long)i, (long)diff);
+      return EINVAL;
+    }
+  }
+
+  return NANOARROW_OK;
+}
+
+static int ArrowAssertIncreasingInt64(struct ArrowBufferView view,
+                                      struct ArrowError* error) {
+  if (view.size_bytes <= (int64_t)sizeof(int64_t)) {
+    return NANOARROW_OK;
+  }
+
+  for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) {
+    int64_t diff = view.data.as_int64[i] - view.data.as_int64[i - 1];
+    if (diff < 0) {
+      ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element 
size %ld",
+                    (long)i, (long)diff);
+      return EINVAL;
+    }
+  }
+
+  return NANOARROW_OK;
+}
+
+static int ArrowAssertRangeInt8(struct ArrowBufferView view, int8_t min_value,
+                                int8_t max_value, struct ArrowError* error) {
+  for (int64_t i = 0; i < view.size_bytes; i++) {
+    if (view.data.as_int8[i] < min_value || view.data.as_int8[i] > max_value) {
+      ArrowErrorSet(error,
+                    "[%ld] Expected buffer value between %d and %d but found 
value %d",
+                    (long)i, (int)min_value, (int)max_value, 
(int)view.data.as_int8[i]);
+      return EINVAL;
+    }
+  }
+
+  return NANOARROW_OK;
+}
+
+static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values,
+                             int64_t n_values, struct ArrowError* error) {
+  for (int64_t i = 0; i < view.size_bytes; i++) {
+    int item_found = 0;
+    for (int64_t j = 0; j < n_values; j++) {
+      if (view.data.as_int8[i] == values[j]) {
+        item_found = 1;
+        break;
+      }
+    }
+
+    if (!item_found) {
+      ArrowErrorSet(error, "[%ld] Unexpected buffer value %d", (long)i,
+                    (int)view.data.as_int8[i]);
+      return EINVAL;
+    }
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view,
+                                          struct ArrowError* error) {
+  for (int i = 0; i < 3; i++) {
+    switch (array_view->layout.buffer_type[i]) {
+      case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
+        NANOARROW_RETURN_NOT_OK(
+            ArrowAssertIncreasingInt32(array_view->buffer_views[i], error));
+        break;
+      case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
+        if (array_view->layout.element_size_bits[i] == 32) {
+          NANOARROW_RETURN_NOT_OK(
+              ArrowAssertIncreasingInt32(array_view->buffer_views[i], error));
+        } else {
+          NANOARROW_RETURN_NOT_OK(
+              ArrowAssertIncreasingInt64(array_view->buffer_views[i], error));
+        }
+        break;
+      default:
+        break;
+    }
+  }
+
+  if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION ||
+      array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) {
+    // Check that we have valid type ids
+    if (array_view->union_type_id_map == NULL ||
+        
_ArrowParsedUnionTypeIdsWillEqualChildIndices(array_view->union_type_id_map,
+                                                      array_view->n_children,
+                                                      array_view->n_children)) 
{
+      
NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8(array_view->buffer_views[0], 0,
+                                                   array_view->n_children - 1, 
error));
+    } else {
+      NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(array_view->buffer_views[0],
+                                                array_view->union_type_id_map 
+ 128,
+                                                array_view->n_children, 
error));
+    }
+  }
+
+  if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) {
+    // Check that offsets refer to child elements that actually exist
+    for (int64_t i = 0; i < array_view->array->length; i++) {
+      int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i);
+      int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i);
+      int64_t child_length = array_view->array->children[child_id]->length;
+      if (offset < 0 || offset > child_length) {
+        ArrowErrorSet(
+            error,
+            "[%ld] Expected union offset for child id %d to be between 0 and 
%ld but "
+            "found offset value %ld",
+            (long)i, (int)child_id, (long)child_length, offset);
+        return EINVAL;
+      }
+    }
+  }
+
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    
NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], 
error));
+  }
+
+  return NANOARROW_OK;
+}
diff --git a/src/nanoarrow/array_inline.h b/src/nanoarrow/array_inline.h
index c6f569a..bcb2fe7 100644
--- a/src/nanoarrow/array_inline.h
+++ b/src/nanoarrow/array_inline.h
@@ -94,10 +94,9 @@ static inline int8_t _ArrowParseUnionTypeIds(const char* 
type_ids, int8_t* out)
   return -1;
 }
 
-static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices(const char* 
type_id_str,
-                                                             int64_t 
n_children) {
-  int8_t type_ids[128];
-  int8_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids);
+static inline int8_t _ArrowParsedUnionTypeIdsWillEqualChildIndices(const 
int8_t* type_ids,
+                                                                   int64_t 
n_type_ids,
+                                                                   int64_t 
n_children) {
   if (n_type_ids != n_children) {
     return 0;
   }
@@ -111,6 +110,13 @@ static inline int8_t 
_ArrowUnionTypeIdsWillEqualChildIndices(const char* type_id
   return 1;
 }
 
+static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices(const char* 
type_id_str,
+                                                             int64_t 
n_children) {
+  int8_t type_ids[128];
+  int8_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids);
+  return _ArrowParsedUnionTypeIdsWillEqualChildIndices(type_ids, n_type_ids, 
n_children);
+}
+
 static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* 
array) {
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
diff --git a/src/nanoarrow/array_test.cc b/src/nanoarrow/array_test.cc
index 2a3fd2e..ce07da2 100644
--- a/src/nanoarrow/array_test.cc
+++ b/src/nanoarrow/array_test.cc
@@ -1267,6 +1267,7 @@ TEST(ArrayTest, ArrayViewTestBasic) {
   ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
 
   EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), NANOARROW_OK);
   EXPECT_EQ(array_view.buffer_views[0].size_bytes, 0);
   EXPECT_EQ(array_view.buffer_views[1].size_bytes, 3 * sizeof(int32_t));
   EXPECT_EQ(array_view.buffer_views[1].data.as_int32[0], 11);
@@ -1279,9 +1280,21 @@ TEST(ArrayTest, ArrayViewTestBasic) {
   ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
 
   EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), NANOARROW_OK);
   EXPECT_EQ(array_view.buffer_views[0].size_bytes, 1);
   EXPECT_EQ(array_view.buffer_views[1].size_bytes, 3 * sizeof(int32_t));
 
+  // Expect error for bad offset + length
+  array.length = -1;
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL);
+  EXPECT_STREQ(error.message, "Expected array length >= 0 but found array 
length of -1");
+  array.length = 3;
+
+  array.offset = -1;
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL);
+  EXPECT_STREQ(error.message, "Expected array offset >= 0 but found array 
offset of -1");
+  array.offset = 0;
+
   // Expect error for the wrong number of buffers
   ArrowArrayViewReset(&array_view);
   ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_STRING);
@@ -1338,6 +1351,7 @@ TEST(ArrayTest, ArrayViewTestString) {
   ASSERT_EQ(ArrowArrayInitFromType(&array, NANOARROW_TYPE_STRING), 
NANOARROW_OK);
   array.null_count = 0;
   EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), NANOARROW_OK);
   EXPECT_EQ(array_view.buffer_views[0].size_bytes, 0);
   EXPECT_EQ(array_view.buffer_views[1].size_bytes, 0);
   EXPECT_EQ(array_view.buffer_views[2].size_bytes, 0);
@@ -1351,10 +1365,25 @@ TEST(ArrayTest, ArrayViewTestString) {
   ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
 
   EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), NANOARROW_OK);
   EXPECT_EQ(array_view.buffer_views[0].size_bytes, 0);
   EXPECT_EQ(array_view.buffer_views[1].size_bytes, (1 + 1) * sizeof(int32_t));
   EXPECT_EQ(array_view.buffer_views[2].size_bytes, 4);
 
+  // Expect error for offsets that will cause bad access
+  int32_t* offsets =
+      const_cast<int32_t*>(reinterpret_cast<const int32_t*>(array.buffers[1]));
+
+  offsets[0] = -1;
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL);
+  EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1");
+  offsets[0] = 0;
+
+  offsets[1] = -1;
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), EINVAL);
+  EXPECT_STREQ(error.message, "[1] Expected element size >= 0 but found 
element size -1");
+
   array.release(&array);
   ArrowArrayViewReset(&array_view);
 }
@@ -1390,6 +1419,7 @@ TEST(ArrayTest, ArrayViewTestLargeString) {
   ASSERT_EQ(ArrowArrayInitFromType(&array, NANOARROW_TYPE_STRING), 
NANOARROW_OK);
   array.null_count = 0;
   EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), NANOARROW_OK);
   EXPECT_EQ(array_view.buffer_views[0].size_bytes, 0);
   EXPECT_EQ(array_view.buffer_views[1].size_bytes, 0);
   EXPECT_EQ(array_view.buffer_views[2].size_bytes, 0);
@@ -1403,10 +1433,25 @@ TEST(ArrayTest, ArrayViewTestLargeString) {
   ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
 
   EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), NANOARROW_OK);
   EXPECT_EQ(array_view.buffer_views[0].size_bytes, 0);
   EXPECT_EQ(array_view.buffer_views[1].size_bytes, (1 + 1) * sizeof(int64_t));
   EXPECT_EQ(array_view.buffer_views[2].size_bytes, 4);
 
+  // Expect error for offsets that will cause bad access
+  int64_t* offsets =
+      const_cast<int64_t*>(reinterpret_cast<const int64_t*>(array.buffers[1]));
+
+  offsets[0] = -1;
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL);
+  EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1");
+  offsets[0] = 0;
+
+  offsets[1] = -1;
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), EINVAL);
+  EXPECT_STREQ(error.message, "[1] Expected element size >= 0 but found 
element size -1");
+
   array.release(&array);
   ArrowArrayViewReset(&array_view);
 }
@@ -1462,6 +1507,36 @@ TEST(ArrayTest, ArrayViewTestList) {
   EXPECT_EQ(array_view.buffer_views[0].size_bytes, 1);
   EXPECT_EQ(array_view.buffer_views[1].size_bytes, (5 + 1) * sizeof(int32_t));
 
+  // Build a valid array
+  struct ArrowArray array;
+  ASSERT_EQ(ArrowArrayInitFromType(&array, NANOARROW_TYPE_LIST), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAllocateChildren(&array, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayInitFromType(array.children[0], NANOARROW_TYPE_INT32),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 1234), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
+
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), 
NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, nullptr), NANOARROW_OK);
+
+  // Expect error for offsets that will cause bad access
+  struct ArrowError error;
+  int32_t* offsets =
+      const_cast<int32_t*>(reinterpret_cast<const int32_t*>(array.buffers[1]));
+
+  offsets[0] = -1;
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL);
+  EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1");
+  offsets[0] = 0;
+
+  offsets[1] = -1;
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), EINVAL);
+  EXPECT_STREQ(error.message, "[1] Expected element size >= 0 but found 
element size -1");
+
+  array.release(&array);
   ArrowArrayViewReset(&array_view);
 }
 
@@ -1485,6 +1560,36 @@ TEST(ArrayTest, ArrayViewTestLargeList) {
   EXPECT_EQ(array_view.buffer_views[0].size_bytes, 1);
   EXPECT_EQ(array_view.buffer_views[1].size_bytes, (5 + 1) * sizeof(int64_t));
 
+  // Build a valid array
+  struct ArrowArray array;
+  ASSERT_EQ(ArrowArrayInitFromType(&array, NANOARROW_TYPE_LARGE_LIST), 
NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAllocateChildren(&array, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayInitFromType(array.children[0], NANOARROW_TYPE_INT32),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 1234), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
+
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), 
NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, nullptr), NANOARROW_OK);
+
+  // Expect error for offsets that will cause bad access
+  struct ArrowError error;
+  int64_t* offsets =
+      const_cast<int64_t*>(reinterpret_cast<const int64_t*>(array.buffers[1]));
+
+  offsets[0] = -1;
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL);
+  EXPECT_STREQ(error.message, "Expected first offset >= 0 but found -1");
+  offsets[0] = 0;
+
+  offsets[1] = -1;
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), EINVAL);
+  EXPECT_STREQ(error.message, "[1] Expected element size >= 0 but found 
element size -1");
+
+  array.release(&array);
   ArrowArrayViewReset(&array_view);
 }
 
@@ -1544,6 +1649,7 @@ TEST(ArrayTest, ArrayViewTestStructArray) {
   ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
 
   EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), NANOARROW_OK);
   EXPECT_EQ(array_view.children[0]->buffer_views[1].size_bytes, 
sizeof(int32_t));
   EXPECT_EQ(array_view.children[0]->buffer_views[1].data.as_int32[0], 123);
 
@@ -1586,6 +1692,7 @@ TEST(ArrayTest, ArrayViewTestFixedSizeListArray) {
   ASSERT_EQ(ArrowArrayFinishBuilding(&array, &error), NANOARROW_OK);
 
   EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), NANOARROW_OK);
   EXPECT_EQ(array_view.children[0]->buffer_views[1].size_bytes, 3 * 
sizeof(int32_t));
   EXPECT_EQ(array_view.children[0]->buffer_views[1].data.as_int32[0], 123);
 
@@ -1622,6 +1729,7 @@ TEST(ArrayTest, ArrayViewTestUnionChildIndices) {
   ArrowArrayViewInitFromType(array_view.children[0], NANOARROW_TYPE_INT32);
   ArrowArrayViewInitFromType(array_view.children[1], NANOARROW_TYPE_STRING);
   ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), 
NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, nullptr), NANOARROW_OK);
 
   EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 0), 0);
   EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 1), 1);
@@ -1633,12 +1741,32 @@ TEST(ArrayTest, ArrayViewTestUnionChildIndices) {
   // The test schema explicitly sets the type_ids 0,1 and this should work too
   ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), 
NANOARROW_OK);
   ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), 
NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, nullptr), NANOARROW_OK);
 
   EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 0), 0);
   EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 1), 1);
   EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 0);
   EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 1), 1);
 
+  // Check that bad type ids/offset are caught by validate full
+  struct ArrowError error;
+  int8_t* type_ids =
+      const_cast<int8_t*>(reinterpret_cast<const int8_t*>(array.buffers[0]));
+  int32_t* offsets =
+      const_cast<int32_t*>(reinterpret_cast<const int32_t*>(array.buffers[1]));
+  type_ids[0] = -1;
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), EINVAL);
+  EXPECT_STREQ(error.message,
+               "[0] Expected buffer value between 0 and 1 but found value -1");
+  type_ids[0] = 0;
+
+  offsets[0] = -1;
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), EINVAL);
+  EXPECT_STREQ(error.message,
+               "[0] Expected union offset for child id 0 to be between 0 and 1 
but found "
+               "offset value -1");
+  offsets[0] = 0;
+
   ArrowArrayViewReset(&array_view);
 
   // Reversing the type ids should result in the same type ids but
@@ -1646,12 +1774,19 @@ TEST(ArrayTest, ArrayViewTestUnionChildIndices) {
   ASSERT_EQ(ArrowSchemaSetFormat(&schema, "+ud:1,0"), NANOARROW_OK);
   ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), 
NANOARROW_OK);
   ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), 
NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, nullptr), NANOARROW_OK);
 
   EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 0), 0);
   EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 1), 1);
   EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 1);
   EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 1), 0);
 
+  // Check that bad type ids are caught by validate full
+  type_ids[0] = -1;
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), EINVAL);
+  EXPECT_STREQ(error.message, "[0] Unexpected buffer value -1");
+  type_ids[0] = 0;
+
   ArrowArrayViewReset(&array_view);
 
   // Check the raw mapping in the array view for numbers that are easier to 
check
@@ -1692,6 +1827,7 @@ TEST(ArrayTest, ArrayViewTestDenseUnionGet) {
   // Initialize the array view
   ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), 
NANOARROW_OK);
   ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), 
NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, nullptr), NANOARROW_OK);
 
   // Check the values that will be used to index into children
   EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 0);
@@ -1736,6 +1872,7 @@ TEST(ArrayTest, ArrayViewTestSparseUnionGet) {
   // Initialize the array view
   ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), 
NANOARROW_OK);
   ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), 
NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, nullptr), NANOARROW_OK);
 
   // Check the values that will be used to index into children
   EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 0);
@@ -1776,6 +1913,7 @@ void TestGetFromNumericArrayView() {
   ARROW_EXPECT_OK(ExportArray(*arrow_array, &array, &schema));
   ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, &error), 
NANOARROW_OK);
   ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), NANOARROW_OK);
 
   EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 2), 1);
   EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 3), 0);
@@ -1806,6 +1944,7 @@ void TestGetFromNumericArrayView() {
   ARROW_EXPECT_OK(ExportArray(*arrow_array, &array, &schema));
   ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, &error), 
NANOARROW_OK);
   ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), NANOARROW_OK);
 
   // We're trying to test behavior with no validity buffer, so make sure 
that's true
   ASSERT_EQ(array_view.buffer_views[0].data.data, nullptr);
@@ -1852,6 +1991,7 @@ void TestGetFromBinary(BuilderClass& builder) {
   ARROW_EXPECT_OK(ExportArray(*arrow_array, &array, &schema));
   ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, &error), 
NANOARROW_OK);
   ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewValidateFull(&array_view, &error), NANOARROW_OK);
 
   EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 2), 1);
   EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 3), 0);
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index 1d2a721..558d3d1 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -112,6 +112,8 @@
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength)
 #define ArrowArrayViewSetArray \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray)
+#define ArrowArrayViewValidateFull \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidateFull)
 #define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, 
ArrowArrayViewReset)
 #define ArrowBasicArrayStreamInit \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit)
@@ -893,6 +895,10 @@ void ArrowArrayViewSetLength(struct ArrowArrayView* 
array_view, int64_t length);
 ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view,
                                       struct ArrowArray* array, struct 
ArrowError* error);
 
+/// \brief Performs extra checks on the array that was set via 
ArrowArrayViewSetArray()
+ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view,
+                                          struct ArrowError* error);
+
 /// \brief Reset the contents of an ArrowArrayView and frees resources
 void ArrowArrayViewReset(struct ArrowArrayView* array_view);
 
diff --git a/src/nanoarrow/nanoarrow_types.h b/src/nanoarrow/nanoarrow_types.h
index 752c6d6..b9af6bb 100644
--- a/src/nanoarrow/nanoarrow_types.h
+++ b/src/nanoarrow/nanoarrow_types.h
@@ -503,7 +503,8 @@ struct ArrowArrayView {
   ///
   /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer
   /// such that child_index == union_type_id_map[type_id] and
-  /// type_id == union_type_id_map[128 + child_index]
+  /// type_id == union_type_id_map[128 + child_index]. This value may be
+  /// NULL in the case where child_id == type_id.
   int8_t* union_type_id_map;
 };
 

Reply via email to