This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 20e5bae feat(extensions/nanoarrow_ipc): Add option to validate arrays
at `NANOARROW_VALIDATION_LEVEL_FULL` (#177)
20e5bae is described below
commit 20e5bae8db24737857e725801d6a965ed21a4f7a
Author: Dewey Dunnington <[email protected]>
AuthorDate: Thu Mar 30 09:49:57 2023 -0400
feat(extensions/nanoarrow_ipc): Add option to validate arrays at
`NANOARROW_VALIDATION_LEVEL_FULL` (#177)
...and do it by default in tests and in the reader.
I am going to punt on full validation of unions with custom type IDs
that are constructed using ArrowArraySetBuffer() (#178) because it
involves a tiny bit of rethinking (there is currently no place to store
custom type ids in the ArrowArray private data).
---
.../nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc.h | 11 ++++++++
.../src/nanoarrow/nanoarrow_ipc_decoder.c | 13 +++++++---
.../src/nanoarrow/nanoarrow_ipc_decoder_test.cc | 30 +++++++++++++++++++---
.../src/nanoarrow/nanoarrow_ipc_reader.c | 14 ++++++++--
src/nanoarrow/array.c | 17 +++++++-----
5 files changed, 70 insertions(+), 15 deletions(-)
diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc.h
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc.h
index 3fa4163..85a765a 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc.h
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc.h
@@ -43,6 +43,8 @@
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArray)
#define ArrowIpcDecoderDecodeArrayFromShared \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArrayFromShared)
+#define ArrowIpcDecoderValidateArray \
+ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderValidateArray)
#define ArrowIpcDecoderSetSchema \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderSetSchema)
#define ArrowIpcDecoderSetEndianness \
@@ -275,6 +277,15 @@ ArrowErrorCode ArrowIpcDecoderDecodeArrayFromShared(struct
ArrowIpcDecoder* deco
int64_t i, struct
ArrowArray* out,
struct ArrowError* error);
+/// \brief Validate a decoded ArrowArray
+///
+/// Verifies buffer lengths and contents depending on validation_level. Users
+/// are reccomended to use NANOARROW_VALIDATION_LEVEL_FULL as any lesser value
+/// may result in some types of corrupted data crashing a process on read.
+ArrowErrorCode ArrowIpcDecoderValidateArray(struct ArrowArray* decoded,
+ enum ArrowValidationLevel
validation_level,
+ struct ArrowError* error);
+
/// \brief An user-extensible input data source
struct ArrowIpcInputStream {
/// \brief Read up to buf_size_bytes from stream into buf
diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
index db4169c..639422e 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
@@ -1378,10 +1378,9 @@ static ArrowErrorCode ArrowIpcDecoderDecodeArrayInternal(
}
}
- // TODO: this performs some validation but doesn't do everything we need it
to do
- // notably it doesn't loop over offset buffers to look for values that will
cause
- // out-of-bounds buffer access on the data buffer or child arrays.
- result = ArrowArrayFinishBuildingDefault(&temp, error);
+ // Finish building to flush internal pointers but defer validation to
+ // ArrowIpcDecoderValidateArray()
+ result = ArrowArrayFinishBuilding(&temp, NANOARROW_VALIDATION_LEVEL_NONE,
error);
if (result != NANOARROW_OK) {
temp.release(&temp);
return result;
@@ -1406,3 +1405,9 @@ ArrowErrorCode
ArrowIpcDecoderDecodeArrayFromShared(struct ArrowIpcDecoder* deco
return ArrowIpcDecoderDecodeArrayInternal(
decoder, ArrowIpcBufferFactoryFromShared(body), i, out, error);
}
+
+ArrowErrorCode ArrowIpcDecoderValidateArray(struct ArrowArray* decoded,
+ enum ArrowValidationLevel
validation_level,
+ struct ArrowError* error) {
+ return ArrowArrayFinishBuilding(decoded, validation_level, error);
+}
diff --git
a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
index d125bf4..824a339 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
@@ -295,6 +295,9 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeSimpleRecordBatch)
{
// Check full struct extract
EXPECT_EQ(ArrowIpcDecoderDecodeArray(&decoder, body, -1, &array, nullptr),
NANOARROW_OK);
+ EXPECT_EQ(
+ ArrowIpcDecoderValidateArray(&array, NANOARROW_VALIDATION_LEVEL_FULL,
nullptr),
+ NANOARROW_OK);
EXPECT_EQ(array.length, 3);
EXPECT_EQ(array.null_count, 0);
ASSERT_EQ(array.n_children, 1);
@@ -309,6 +312,9 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeSimpleRecordBatch)
{
// Check field extract
EXPECT_EQ(ArrowIpcDecoderDecodeArray(&decoder, body, 0, &array, nullptr),
NANOARROW_OK);
+ EXPECT_EQ(
+ ArrowIpcDecoderValidateArray(&array, NANOARROW_VALIDATION_LEVEL_FULL,
nullptr),
+ NANOARROW_OK);
ASSERT_EQ(array.n_buffers, 2);
ASSERT_EQ(array.length, 3);
EXPECT_EQ(array.null_count, 0);
@@ -478,6 +484,9 @@ TEST(NanoarrowIpcTest,
NanoarrowIpcDecodeSimpleRecordBatchFromShared) {
// Check full struct extract
EXPECT_EQ(ArrowIpcDecoderDecodeArrayFromShared(&decoder, &shared, -1,
&array, nullptr),
NANOARROW_OK);
+ EXPECT_EQ(
+ ArrowIpcDecoderValidateArray(&array, NANOARROW_VALIDATION_LEVEL_FULL,
nullptr),
+ NANOARROW_OK);
EXPECT_EQ(array.length, 3);
EXPECT_EQ(array.null_count, 0);
@@ -494,6 +503,10 @@ TEST(NanoarrowIpcTest,
NanoarrowIpcDecodeSimpleRecordBatchFromShared) {
// Check field extract
EXPECT_EQ(ArrowIpcDecoderDecodeArrayFromShared(&decoder, &shared, 0, &array,
nullptr),
NANOARROW_OK);
+ EXPECT_EQ(
+ ArrowIpcDecoderValidateArray(&array, NANOARROW_VALIDATION_LEVEL_FULL,
nullptr),
+ NANOARROW_OK);
+
// Release the original shared (forthcoming array buffers should still be
valid)
ArrowIpcSharedBufferReset(&shared);
@@ -543,8 +556,12 @@ TEST(NanoarrowIpcTest,
NanoarrowIpcSharedBufferThreadSafeDecode) {
struct ArrowArray arrays[10];
for (int i = 0; i < 10; i++) {
- ASSERT_EQ(ArrowIpcDecoderDecodeArrayFromShared(&decoder, &shared, -1,
arrays + i, nullptr),
- NANOARROW_OK);
+ ASSERT_EQ(
+ ArrowIpcDecoderDecodeArrayFromShared(&decoder, &shared, -1, arrays +
i, nullptr),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowIpcDecoderValidateArray(arrays + i,
NANOARROW_VALIDATION_LEVEL_FULL,
+ nullptr),
+ NANOARROW_OK);
}
// Clean up
@@ -556,7 +573,8 @@ TEST(NanoarrowIpcTest,
NanoarrowIpcSharedBufferThreadSafeDecode) {
std::thread threads[10];
for (int i = 0; i < 10; i++) {
threads[i] = std::thread([&arrays, i, &one_two_three_le] {
- memcmp(arrays[i].children[0]->buffers[1], one_two_three_le,
sizeof(one_two_three_le));
+ memcmp(arrays[i].children[0]->buffers[1], one_two_three_le,
+ sizeof(one_two_three_le));
arrays[i].release(arrays + i);
});
}
@@ -605,6 +623,9 @@ TEST_P(ArrowTypeParameterizedTestFixture,
NanoarrowIpcArrowArrayRoundtrip) {
buffer_view.size_bytes -= decoder.header_size_bytes;
ASSERT_EQ(ArrowIpcDecoderDecodeArray(&decoder, buffer_view, -1, &array,
nullptr),
NANOARROW_OK);
+ ASSERT_EQ(
+ ArrowIpcDecoderValidateArray(&array, NANOARROW_VALIDATION_LEVEL_FULL,
nullptr),
+ NANOARROW_OK);
auto maybe_batch = arrow::ImportRecordBatch(&array, dummy_schema);
ASSERT_TRUE(maybe_batch.ok());
@@ -622,6 +643,9 @@ TEST_P(ArrowTypeParameterizedTestFixture,
NanoarrowIpcArrowArrayRoundtrip) {
buffer_view.size_bytes -= decoder.header_size_bytes;
ASSERT_EQ(ArrowIpcDecoderDecodeArray(&decoder, buffer_view, -1, &array,
nullptr),
NANOARROW_OK);
+ ASSERT_EQ(
+ ArrowIpcDecoderValidateArray(&array, NANOARROW_VALIDATION_LEVEL_FULL,
nullptr),
+ NANOARROW_OK);
maybe_batch = arrow::ImportRecordBatch(&array, dummy_schema);
ASSERT_TRUE(maybe_batch.ok());
diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_reader.c
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_reader.c
index 1b2ad87..0e457c7 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_reader.c
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_reader.c
@@ -365,11 +365,13 @@ static int ArrowIpcArrayStreamReaderGetNext(struct
ArrowArrayStream* stream,
// Read in the body
NANOARROW_RETURN_NOT_OK(ArrowIpcArrayStreamReaderNextBody(private_data));
+ struct ArrowArray tmp;
+
if (private_data->use_shared_buffers) {
struct ArrowIpcSharedBuffer shared;
NANOARROW_RETURN_NOT_OK(ArrowIpcSharedBufferInit(&shared,
&private_data->body));
NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArrayFromShared(
- &private_data->decoder, &shared, private_data->field_index, out,
+ &private_data->decoder, &shared, private_data->field_index, &tmp,
&private_data->error));
ArrowIpcSharedBufferReset(&shared);
} else {
@@ -378,10 +380,18 @@ static int ArrowIpcArrayStreamReaderGetNext(struct
ArrowArrayStream* stream,
body_view.size_bytes = private_data->body.size_bytes;
NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArray(&private_data->decoder,
body_view,
-
private_data->field_index, out,
+
private_data->field_index, &tmp,
&private_data->error));
}
+ result = ArrowIpcDecoderValidateArray(&tmp, NANOARROW_VALIDATION_LEVEL_FULL,
+ &private_data->error);
+ if (result != NANOARROW_OK) {
+ tmp.release(&tmp);
+ return result;
+ }
+
+ ArrowArrayMove(&tmp, out);
return NANOARROW_OK;
}
diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c
index fc88fd6..30422d2 100644
--- a/src/nanoarrow/array.c
+++ b/src/nanoarrow/array.c
@@ -930,11 +930,15 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct
ArrowArrayView* array_view,
if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION ||
array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) {
- // Check that we have valid type ids
- if (array_view->union_type_id_map == NULL ||
-
_ArrowParsedUnionTypeIdsWillEqualChildIndices(array_view->union_type_id_map,
- array_view->n_children,
- array_view->n_children))
{
+ // Check that we have valid type ids.
+ if (array_view->union_type_id_map == NULL) {
+ // If the union_type_id map is NULL
+ // (e.g., when using ArrowArrayInitFromType() +
ArrowArrayAllocateChildren()
+ // + ArrowArrayFinishBuilding()), we don't have enough information to
validate
+ // this buffer (GH-178).
+ } else if (_ArrowParsedUnionTypeIdsWillEqualChildIndices(
+ array_view->union_type_id_map, array_view->n_children,
+ array_view->n_children)) {
NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8(array_view->buffer_views[0], 0,
array_view->n_children - 1,
error));
} else {
@@ -944,7 +948,8 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct
ArrowArrayView* array_view,
}
}
- if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) {
+ if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION &&
+ array_view->union_type_id_map != NULL) {
// Check that offsets refer to child elements that actually exist
for (int64_t i = 0; i < array_view->array->length; i++) {
int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i);