This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new a431ba5 chore(extensions/nanoarrow_ipc): Test files from the
arrow-testing repo (#176)
a431ba5 is described below
commit a431ba56b417fda652b6811d8d666bc8e1111276
Author: Dewey Dunnington <[email protected]>
AuthorDate: Thu Mar 30 12:58:44 2023 -0400
chore(extensions/nanoarrow_ipc): Test files from the arrow-testing repo
(#176)
This PR adds file tests for the integration files in
https://github.com/apache/arrow-testing/tree/master/data/arrow-ipc-stream/integration
(and fixed a few bugs identified as a result of reading them). The added
tests ensure that the data is read and that the data read is identical
to what Arrow C++ would read as well (or that files fail with a
particular error code and message).
I also tested the fuzzed files and didn't get any crashes; however I'd
like to spend more time on that and perhaps generate some more targeted
fuzzed files (for example, none of the fuzzed files tripped a failed
flatbuffer verification or a ValidateFull failure).
The Arrow IPC file tests run on CI, as a part of release verification,
and locally if you set the NANOARROW_ARROW_TESTING_DIR environment
variable to your favourite arrow-testing checkout.
---
.github/workflows/build-and-test-ipc.yaml | 8 +
.github/workflows/verify.yaml | 1 +
dev/release/verify-release-candidate.sh | 16 ++
extensions/nanoarrow_ipc/CMakeLists.txt | 5 +
.../src/nanoarrow/nanoarrow_ipc_decoder.c | 24 +-
.../src/nanoarrow/nanoarrow_ipc_decoder_test.cc | 1 -
.../src/nanoarrow/nanoarrow_ipc_files_test.cc | 271 +++++++++++++++++++++
src/nanoarrow/array.c | 4 -
8 files changed, 324 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/build-and-test-ipc.yaml
b/.github/workflows/build-and-test-ipc.yaml
index 62aea46..cbb23ba 100644
--- a/.github/workflows/build-and-test-ipc.yaml
+++ b/.github/workflows/build-and-test-ipc.yaml
@@ -48,6 +48,7 @@ jobs:
env:
SUBDIR: 'extensions/nanoarrow_ipc'
+ NANOARROW_ARROW_TESTING_DIR: '${{ github.workspace }}/arrow-testing'
steps:
- name: Checkout repo
@@ -55,6 +56,13 @@ jobs:
with:
fetch-depth: 0
+ - name: Checkout arrow-testing
+ uses: actions/checkout@v3
+ with:
+ repository: apache/arrow-testing
+ fetch-depth: 0
+ path: arrow-testing
+
- name: Install dependencies
run: |
sudo apt-get update
diff --git a/.github/workflows/verify.yaml b/.github/workflows/verify.yaml
index 2cad7ca..68e08d3 100644
--- a/.github/workflows/verify.yaml
+++ b/.github/workflows/verify.yaml
@@ -145,6 +145,7 @@ jobs:
- {platform: "fedora", arch: "amd64"}
- {platform: "archlinux", arch: "amd64"}
- {platform: "alpine", arch: "amd64"}
+ - {platform: "centos7", arch: "amd64"}
- {
platform: "ubuntu",
arch: "amd64",
diff --git a/dev/release/verify-release-candidate.sh
b/dev/release/verify-release-candidate.sh
index 5019094..6617061 100755
--- a/dev/release/verify-release-candidate.sh
+++ b/dev/release/verify-release-candidate.sh
@@ -27,6 +27,10 @@
# - CTEST_BIN: Command to use for ctest (e.g., ctest3 on Centos7)
# - NANOARROW_CMAKE_OPTIONS (e.g., to help cmake find Arrow C++)
# - R_HOME: Path to the desired R installation. Defaults to `R` on PATH.
+# - NANOARROW_ARROW_TESTING_DIR: A path to a checkout of apache/arrow-testing.
+# If unset, the script will check out a version into NANOARROW_TMPDIR.
+# - NANOARROW_TMPDIR: Use to specify a persistent directory such that
verification
+# results are more easily retrieved.
# - TEST_SOURCE: Set to 0 to selectively run component verification.
# - TEST_C: Builds C libraries and tests using the default CMake
# configuration. Defaults to the value of TEST_SOURCE.
@@ -177,6 +181,17 @@ setup_tempdir() {
echo "Working in sandbox ${NANOARROW_TMPDIR}"
}
+setup_arrow_testing() {
+ show_header "Setting up arrow-testing"
+
+ if [ -z "${NANOARROW_ARROW_TESTING_DIR}" ]; then
+ export NANOARROW_ARROW_TESTING_DIR="${NANOARROW_TMPDIR}/arrow-testing"
+ git clone --depth=1 https://github.com/apache/arrow-testing
${NANOARROW_ARROW_TESTING_DIR}
+ fi
+
+ echo "Using arrow-testing at '${NANOARROW_ARROW_TESTING_DIR}'"
+}
+
# Usage: test_cmake_project build-dir src-dir extra-config-arg1
extra-config-arg...
test_cmake_project() {
if [ -z "${CMAKE_BIN}" ]; then
@@ -329,6 +344,7 @@ test_source_distribution() {
TEST_SUCCESS=no
setup_tempdir
+setup_arrow_testing
ensure_source_directory
test_source_distribution
diff --git a/extensions/nanoarrow_ipc/CMakeLists.txt
b/extensions/nanoarrow_ipc/CMakeLists.txt
index 1d3b16a..33640f1 100644
--- a/extensions/nanoarrow_ipc/CMakeLists.txt
+++ b/extensions/nanoarrow_ipc/CMakeLists.txt
@@ -211,6 +211,7 @@ if (NANOARROW_IPC_BUILD_TESTS)
add_executable(nanoarrow_ipc_decoder_test
src/nanoarrow/nanoarrow_ipc_decoder_test.cc)
add_executable(nanoarrow_ipc_reader_test
src/nanoarrow/nanoarrow_ipc_reader_test.cc)
+ add_executable(nanoarrow_ipc_files_test
src/nanoarrow/nanoarrow_ipc_files_test.cc)
if(NANOARROW_IPC_CODE_COVERAGE)
target_compile_options(ipc_coverage_config INTERFACE -O0 -g --coverage)
@@ -232,10 +233,14 @@ if (NANOARROW_IPC_BUILD_TESTS)
target_link_libraries(
nanoarrow_ipc_reader_test
nanoarrow_ipc nanoarrow gtest_main)
+ target_link_libraries(
+ nanoarrow_ipc_files_test
+ nanoarrow_ipc nanoarrow ${NANOARROW_IPC_ARROW_TARGET} gtest_main)
include(GoogleTest)
gtest_discover_tests(nanoarrow_ipc_decoder_test)
gtest_discover_tests(nanoarrow_ipc_reader_test)
+ gtest_discover_tests(nanoarrow_ipc_files_test)
endif()
if (NANOARROW_IPC_BUILD_APPS)
diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
index 639422e..1d78e96 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
@@ -20,15 +20,25 @@
#include <string.h>
// For thread safe shared buffers we need C11 + stdatomic.h
+// Can compile with -DNANOARROW_IPC_USE_STDATOMIC=0 or 1 to override
+// automatic detection
#if !defined(NANOARROW_IPC_USE_STDATOMIC)
#define NANOARROW_IPC_USE_STDATOMIC 0
+
+// Check for C11
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+
+// Check for GCC 4.8, which doesn't include stdatomic.h but does
+// not define __STDC_NO_ATOMICS__
+#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ >= 5
+
#if !defined(__STDC_NO_ATOMICS__)
#include <stdatomic.h>
#undef NANOARROW_IPC_USE_STDATOMIC
#define NANOARROW_IPC_USE_STDATOMIC 1
#endif
#endif
+#endif
#endif
@@ -126,6 +136,11 @@ static void ArrowIpcSharedBufferFree(struct
ArrowBufferAllocator* allocator, uin
ArrowErrorCode ArrowIpcSharedBufferInit(struct ArrowIpcSharedBuffer* shared,
struct ArrowBuffer* src) {
+ if (src->data == NULL) {
+ ArrowBufferMove(src, &shared->private_src);
+ return NANOARROW_OK;
+ }
+
struct ArrowIpcSharedBufferPrivate* private_data =
(struct ArrowIpcSharedBufferPrivate*)ArrowMalloc(
sizeof(struct ArrowIpcSharedBufferPrivate));
@@ -149,6 +164,13 @@ ArrowErrorCode ArrowIpcSharedBufferInit(struct
ArrowIpcSharedBuffer* shared,
static void ArrowIpcSharedBufferClone(struct ArrowIpcSharedBuffer* shared,
struct ArrowBuffer* shared_out) {
+ if (shared->private_src.data == NULL) {
+ ArrowBufferInit(shared_out);
+ shared_out->size_bytes = shared_out->size_bytes;
+ shared_out->capacity_bytes = shared_out->capacity_bytes;
+ return;
+ }
+
struct ArrowIpcSharedBufferPrivate* private_data =
(struct
ArrowIpcSharedBufferPrivate*)shared->private_src.allocator.private_data;
ArrowIpcSharedBufferUpdate(private_data, 1);
@@ -706,7 +728,7 @@ static int ArrowIpcDecoderSetField(struct ArrowSchema*
schema, ns(Field_table_t)
struct ArrowError* error) {
// No dictionary support yet
if (ns(Field_dictionary_is_present(field))) {
- ArrowErrorSet(error, "Field DictionaryEncoding not supported");
+ ArrowErrorSet(error, "Schema message field with DictionaryEncoding not
supported");
return ENOTSUP;
}
diff --git
a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
index 824a339..ad989fc 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
@@ -15,7 +15,6 @@
// specific language governing permissions and limitations
// under the License.
-#include <stdexcept>
#include <thread>
#include <arrow/array.h>
diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_files_test.cc
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_files_test.cc
new file mode 100644
index 0000000..6c6081d
--- /dev/null
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_files_test.cc
@@ -0,0 +1,271 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+#include <fstream>
+#include <sstream>
+
+#include <arrow/buffer.h>
+#include <arrow/c/bridge.h>
+#include <arrow/io/api.h>
+#include <arrow/ipc/api.h>
+#include <arrow/table.h>
+#include <gtest/gtest.h>
+
+#include "nanoarrow.hpp"
+#include "nanoarrow_ipc.h"
+
+#include "flatcc/portable/pendian_detect.h"
+
+using namespace arrow;
+
+// Utility to test an IPC stream written a as a file (where path does not
include a
+// prefix that might be specific where a specific system has arrow-testing
checked out).
+// This helper also checks a read that is supposed to be valid against what
Arrow C++
+// would read.
+class TestFile {
+ public:
+ TestFile(std::string path, int expected_return_code, std::string
expected_error_message)
+ : path_(path),
+ expected_return_code_(expected_return_code),
+ expected_error_message_(expected_error_message) {}
+
+ TestFile(std::string path) : TestFile(path, NANOARROW_OK, "") {}
+
+ TestFile() : TestFile("") {}
+
+ static TestFile OK(std::string path) { return TestFile(path); }
+
+ static TestFile Err(int code, std::string path, std::string message =
"__any__") {
+ return TestFile(path, code, message);
+ }
+
+ static TestFile ErrorAny(std::string path) {
+ return Err(std::numeric_limits<int>::max(), path);
+ }
+
+ static TestFile Invalid(std::string path, std::string message = "__any__") {
+ return Err(EINVAL, path, message);
+ }
+
+ static TestFile NotSupported(std::string path, std::string message =
"__any__") {
+ return Err(ENOTSUP, path, message);
+ }
+
+ static TestFile NoData(std::string path, std::string message = "__any__") {
+ return Err(ENODATA, path, message);
+ }
+
+ void Test(std::string dir_prefix) {
+ std::stringstream path_builder;
+ path_builder << dir_prefix << "/" << path_;
+
+ // Read the whole file into an ArrowBuffer. We need the whole thing in
memory
+ // to avoid requiring Arrow C++ with filesystem.
+ std::ifstream infile(path_builder.str(), std::ios::in | std::ios::binary);
+ nanoarrow::UniqueBuffer content;
+ do {
+ content->size_bytes += infile.gcount();
+ ASSERT_EQ(ArrowBufferReserve(content.get(), 8096), NANOARROW_OK);
+ } while (
+ infile.read(reinterpret_cast<char*>(content->data +
content->size_bytes), 8096));
+ content->size_bytes += infile.gcount();
+
+ // Make a copy into another buffer so we can wrap it in something Arrow C++
+ // understands
+ nanoarrow::UniqueBuffer content_copy;
+ ASSERT_EQ(ArrowBufferAppend(content_copy.get(), content->data,
content->size_bytes),
+ NANOARROW_OK);
+
+ struct ArrowIpcInputStream input;
+ nanoarrow::UniqueArrayStream stream;
+ ASSERT_EQ(ArrowIpcInputStreamInitBuffer(&input, content.get()),
NANOARROW_OK);
+ ASSERT_EQ(ArrowIpcArrayStreamReaderInit(stream.get(), &input, nullptr),
NANOARROW_OK);
+
+ nanoarrow::UniqueSchema schema;
+ int result = stream->get_schema(stream.get(), schema.get());
+ if (result != NANOARROW_OK) {
+ std::string err(stream->get_last_error(stream.get()));
+ if (Check(result, err)) {
+ return;
+ }
+
+ GTEST_FAIL() << MakeError(result, err);
+ }
+
+ std::vector<nanoarrow::UniqueArray> arrays;
+ while (true) {
+ nanoarrow::UniqueArray array;
+
+ result = stream->get_next(stream.get(), array.get());
+ if (result != NANOARROW_OK) {
+ std::string err(stream->get_last_error(stream.get()));
+ if (Check(result, err)) {
+ return;
+ }
+
+ GTEST_FAIL() << MakeError(result, err);
+ }
+
+ if (array->release == nullptr) {
+ break;
+ }
+
+ arrays.push_back(std::move(array));
+ }
+
+ // If the file was supposed to fail the read but did not, fail here
+ if (expected_return_code_ != NANOARROW_OK) {
+ GTEST_FAIL() << MakeError(NANOARROW_OK, "");
+ }
+
+ // Read the same data with Arrow C++.
+ auto content_copy_wrapped =
+ Buffer::Wrap<uint8_t>(content_copy->data, content_copy->size_bytes);
+ auto buffer_reader =
std::make_shared<io::BufferReader>(content_copy_wrapped);
+
+ // Support Arrow 9.0.0 for Fedora and Centos7 images
+#if ARROW_VERSION_MAJOR >= 10
+ auto maybe_input_stream =
+ io::RandomAccessFile::GetStream(buffer_reader, 0,
content_copy_wrapped->size());
+ if (!maybe_input_stream.ok()) {
+ GTEST_FAIL() << maybe_input_stream.status().message();
+ }
+
+ std::shared_ptr<io::InputStream> input_stream =
maybe_input_stream.ValueUnsafe();
+#else
+ std::shared_ptr<io::InputStream> input_stream =
+ io::RandomAccessFile::GetStream(buffer_reader, 0,
content_copy_wrapped->size());
+#endif
+
+ auto maybe_reader =
+ ipc::RecordBatchStreamReader::Open(input_stream);
+ if (!maybe_reader.ok()) {
+ GTEST_FAIL() << maybe_reader.status().message();
+ }
+
+ auto maybe_table_arrow = maybe_reader.ValueUnsafe()->ToTable();
+ if (!maybe_table_arrow.ok()) {
+ GTEST_FAIL() << maybe_table_arrow.status().message();
+ }
+
+ // Make a Table from the our vector of arrays
+ auto maybe_schema = ImportSchema(schema.get());
+ if (!maybe_schema.ok()) {
+ GTEST_FAIL() << maybe_schema.status().message();
+ }
+
+
ASSERT_TRUE(maybe_table_arrow.ValueUnsafe()->schema()->Equals(**maybe_schema,
true));
+
+ std::vector<std::shared_ptr<RecordBatch>> batches;
+ for (auto& array : arrays) {
+ auto maybe_batch = ImportRecordBatch(array.get(), *maybe_schema);
+ batches.push_back(std::move(*maybe_batch));
+ }
+
+ auto maybe_table = Table::FromRecordBatches(*maybe_schema, batches);
+ EXPECT_TRUE(maybe_table.ValueUnsafe()->Equals(**maybe_table_arrow, true));
+ }
+
+ bool Check(int result, std::string msg) {
+ return (expected_return_code_ == std::numeric_limits<int>::max() &&
+ result != NANOARROW_OK) ||
+ (result == expected_return_code_ && msg == expected_error_message_)
||
+ (result == expected_return_code_ && expected_error_message_ ==
"__any__");
+ }
+
+ std::string MakeError(int result, std::string msg) {
+ std::stringstream err;
+ err << "Expected file '" << path_ << "' to return code " <<
expected_return_code_
+ << " and error message '" << expected_error_message_ << "' but got
return code "
+ << result << " and error message '" << msg << "'";
+ return err.str();
+ }
+
+ std::string path_;
+ int expected_return_code_;
+ std::string expected_error_message_;
+};
+
+std::ostream& operator<<(std::ostream& os, const TestFile& obj) {
+ os << obj.path_;
+ return os;
+}
+
+class ArrowTestingPathParameterizedTestFixture
+ : public ::testing::TestWithParam<TestFile> {
+ protected:
+ TestFile test_file;
+};
+
+TEST_P(ArrowTestingPathParameterizedTestFixture,
NanoarrowIpcTestFileNativeEndian) {
+ const char* testing_dir = getenv("NANOARROW_ARROW_TESTING_DIR");
+ if (testing_dir == nullptr || strlen(testing_dir) == 0) {
+ GTEST_SKIP() << "NANOARROW_ARROW_TESTING_DIR environment variable not set";
+ }
+
+ std::stringstream dir_builder;
+
+#if defined(__BIG_ENDIAN__)
+ dir_builder << testing_dir <<
"/data/arrow-ipc-stream/integration/1.0.0-bigendian";
+#else
+ dir_builder << testing_dir <<
"/data/arrow-ipc-stream/integration/1.0.0-littleendian";
+#endif
+ TestFile param = GetParam();
+ param.Test(dir_builder.str());
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ NanoarrowIpcTest, ArrowTestingPathParameterizedTestFixture,
+ ::testing::Values(
+ // Files in data/arrow-ipc-stream/integration/1.0.0-(little|big)endian/
+ // should read without error and the data should match Arrow C++'s read
+ TestFile::OK("generated_custom_metadata.stream"),
+ TestFile::OK("generated_datetime.stream"),
+ TestFile::OK("generated_decimal.stream"),
+ TestFile::OK("generated_decimal256.stream"),
+
+ TestFile::OK("generated_duplicate_fieldnames.stream"),
+ TestFile::OK("generated_interval.stream"),
+ TestFile::OK("generated_map_non_canonical.stream"),
+ TestFile::OK("generated_map.stream"),
+ TestFile::OK("generated_nested_large_offsets.stream"),
+ TestFile::OK("generated_nested.stream"),
+ TestFile::OK("generated_null_trivial.stream"),
+ TestFile::OK("generated_null.stream"),
+ TestFile::OK("generated_primitive_large_offsets.stream"),
+ TestFile::OK("generated_primitive_no_batches.stream"),
+ TestFile::OK("generated_primitive_zerolength.stream"),
+ TestFile::OK("generated_primitive.stream"),
+ TestFile::OK("generated_recursive_nested.stream"),
+ TestFile::OK("generated_union.stream"),
+
+ // Files with features that are not yet supported (Dictionary encoding)
+ TestFile::NotSupported(
+ "generated_dictionary_unsigned.stream",
+ "Schema message field with DictionaryEncoding not supported"),
+ TestFile::NotSupported(
+ "generated_dictionary.stream",
+ "Schema message field with DictionaryEncoding not supported"),
+ TestFile::NotSupported(
+ "generated_nested_dictionary.stream",
+ "Schema message field with DictionaryEncoding not supported"),
+ TestFile::NotSupported(
+ "generated_extension.stream",
+ "Schema message field with DictionaryEncoding not supported")
+ // Comment to keep last line from wrapping
+ ));
diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c
index 30422d2..ae1db4d 100644
--- a/src/nanoarrow/array.c
+++ b/src/nanoarrow/array.c
@@ -910,10 +910,6 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct
ArrowArrayView* array_view,
struct ArrowError* error) {
for (int i = 0; i < 3; i++) {
switch (array_view->layout.buffer_type[i]) {
- case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
- NANOARROW_RETURN_NOT_OK(
- ArrowAssertIncreasingInt32(array_view->buffer_views[i], error));
- break;
case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
if (array_view->layout.element_size_bits[i] == 32) {
NANOARROW_RETURN_NOT_OK(