This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new a431ba5  chore(extensions/nanoarrow_ipc): Test files from the 
arrow-testing repo (#176)
a431ba5 is described below

commit a431ba56b417fda652b6811d8d666bc8e1111276
Author: Dewey Dunnington <[email protected]>
AuthorDate: Thu Mar 30 12:58:44 2023 -0400

    chore(extensions/nanoarrow_ipc): Test files from the arrow-testing repo 
(#176)
    
    This PR adds file tests for the integration files in
    
https://github.com/apache/arrow-testing/tree/master/data/arrow-ipc-stream/integration
    (and fixed a few bugs identified as a result of reading them). The added
    tests ensure that the data is read and that the data read is identical
    to what Arrow C++ would read as well (or that files fail with a
    particular error code and message).
    
    I also tested the fuzzed files and didn't get any crashes; however I'd
    like to spend more time on that and perhaps generate some more targeted
    fuzzed files (for example, none of the fuzzed files tripped a failed
    flatbuffer verification or a ValidateFull failure).
    
    The Arrow IPC file tests run on CI, as a part of release verification,
    and locally if you set the NANOARROW_ARROW_TESTING_DIR environment
    variable to your favourite arrow-testing checkout.
---
 .github/workflows/build-and-test-ipc.yaml          |   8 +
 .github/workflows/verify.yaml                      |   1 +
 dev/release/verify-release-candidate.sh            |  16 ++
 extensions/nanoarrow_ipc/CMakeLists.txt            |   5 +
 .../src/nanoarrow/nanoarrow_ipc_decoder.c          |  24 +-
 .../src/nanoarrow/nanoarrow_ipc_decoder_test.cc    |   1 -
 .../src/nanoarrow/nanoarrow_ipc_files_test.cc      | 271 +++++++++++++++++++++
 src/nanoarrow/array.c                              |   4 -
 8 files changed, 324 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build-and-test-ipc.yaml 
b/.github/workflows/build-and-test-ipc.yaml
index 62aea46..cbb23ba 100644
--- a/.github/workflows/build-and-test-ipc.yaml
+++ b/.github/workflows/build-and-test-ipc.yaml
@@ -48,6 +48,7 @@ jobs:
 
     env:
       SUBDIR: 'extensions/nanoarrow_ipc'
+      NANOARROW_ARROW_TESTING_DIR: '${{ github.workspace }}/arrow-testing'
 
     steps:
       - name: Checkout repo
@@ -55,6 +56,13 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: Checkout arrow-testing
+        uses: actions/checkout@v3
+        with:
+          repository: apache/arrow-testing
+          fetch-depth: 0
+          path: arrow-testing
+
       - name: Install dependencies
         run: |
           sudo apt-get update
diff --git a/.github/workflows/verify.yaml b/.github/workflows/verify.yaml
index 2cad7ca..68e08d3 100644
--- a/.github/workflows/verify.yaml
+++ b/.github/workflows/verify.yaml
@@ -145,6 +145,7 @@ jobs:
           - {platform: "fedora", arch: "amd64"}
           - {platform: "archlinux", arch: "amd64"}
           - {platform: "alpine", arch: "amd64"}
+          - {platform: "centos7", arch: "amd64"}
           - {
               platform: "ubuntu",
               arch: "amd64",
diff --git a/dev/release/verify-release-candidate.sh 
b/dev/release/verify-release-candidate.sh
index 5019094..6617061 100755
--- a/dev/release/verify-release-candidate.sh
+++ b/dev/release/verify-release-candidate.sh
@@ -27,6 +27,10 @@
 # - CTEST_BIN: Command to use for ctest (e.g., ctest3 on Centos7)
 # - NANOARROW_CMAKE_OPTIONS (e.g., to help cmake find Arrow C++)
 # - R_HOME: Path to the desired R installation. Defaults to `R` on PATH.
+# - NANOARROW_ARROW_TESTING_DIR: A path to a checkout of apache/arrow-testing.
+#   If unset, the script will check out a version into NANOARROW_TMPDIR.
+# - NANOARROW_TMPDIR: Use to specify a persistent directory such that 
verification
+#   results are more easily retrieved.
 # - TEST_SOURCE: Set to 0 to selectively run component verification.
 # - TEST_C: Builds C libraries and tests using the default CMake
 #   configuration. Defaults to the value of TEST_SOURCE.
@@ -177,6 +181,17 @@ setup_tempdir() {
   echo "Working in sandbox ${NANOARROW_TMPDIR}"
 }
 
+setup_arrow_testing() {
+  show_header "Setting up arrow-testing"
+
+  if [ -z "${NANOARROW_ARROW_TESTING_DIR}" ]; then
+    export NANOARROW_ARROW_TESTING_DIR="${NANOARROW_TMPDIR}/arrow-testing"
+    git clone --depth=1 https://github.com/apache/arrow-testing 
${NANOARROW_ARROW_TESTING_DIR}
+  fi
+
+  echo "Using arrow-testing at '${NANOARROW_ARROW_TESTING_DIR}'"
+}
+
 # Usage: test_cmake_project build-dir src-dir extra-config-arg1 
extra-config-arg...
 test_cmake_project() {
   if [ -z "${CMAKE_BIN}" ]; then
@@ -329,6 +344,7 @@ test_source_distribution() {
 TEST_SUCCESS=no
 
 setup_tempdir
+setup_arrow_testing
 ensure_source_directory
 test_source_distribution
 
diff --git a/extensions/nanoarrow_ipc/CMakeLists.txt 
b/extensions/nanoarrow_ipc/CMakeLists.txt
index 1d3b16a..33640f1 100644
--- a/extensions/nanoarrow_ipc/CMakeLists.txt
+++ b/extensions/nanoarrow_ipc/CMakeLists.txt
@@ -211,6 +211,7 @@ if (NANOARROW_IPC_BUILD_TESTS)
 
   add_executable(nanoarrow_ipc_decoder_test 
src/nanoarrow/nanoarrow_ipc_decoder_test.cc)
   add_executable(nanoarrow_ipc_reader_test 
src/nanoarrow/nanoarrow_ipc_reader_test.cc)
+  add_executable(nanoarrow_ipc_files_test 
src/nanoarrow/nanoarrow_ipc_files_test.cc)
 
   if(NANOARROW_IPC_CODE_COVERAGE)
     target_compile_options(ipc_coverage_config INTERFACE -O0 -g --coverage)
@@ -232,10 +233,14 @@ if (NANOARROW_IPC_BUILD_TESTS)
   target_link_libraries(
     nanoarrow_ipc_reader_test
     nanoarrow_ipc nanoarrow gtest_main)
+  target_link_libraries(
+    nanoarrow_ipc_files_test
+    nanoarrow_ipc nanoarrow ${NANOARROW_IPC_ARROW_TARGET} gtest_main)
 
   include(GoogleTest)
   gtest_discover_tests(nanoarrow_ipc_decoder_test)
   gtest_discover_tests(nanoarrow_ipc_reader_test)
+  gtest_discover_tests(nanoarrow_ipc_files_test)
 endif()
 
 if (NANOARROW_IPC_BUILD_APPS)
diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c 
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
index 639422e..1d78e96 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
@@ -20,15 +20,25 @@
 #include <string.h>
 
 // For thread safe shared buffers we need C11 + stdatomic.h
+// Can compile with -DNANOARROW_IPC_USE_STDATOMIC=0 or 1 to override
+// automatic detection
 #if !defined(NANOARROW_IPC_USE_STDATOMIC)
 #define NANOARROW_IPC_USE_STDATOMIC 0
+
+// Check for C11
 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+
+// Check for GCC 4.8, which doesn't include stdatomic.h but does
+// not define __STDC_NO_ATOMICS__
+#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ >= 5
+
 #if !defined(__STDC_NO_ATOMICS__)
 #include <stdatomic.h>
 #undef NANOARROW_IPC_USE_STDATOMIC
 #define NANOARROW_IPC_USE_STDATOMIC 1
 #endif
 #endif
+#endif
 
 #endif
 
@@ -126,6 +136,11 @@ static void ArrowIpcSharedBufferFree(struct 
ArrowBufferAllocator* allocator, uin
 
 ArrowErrorCode ArrowIpcSharedBufferInit(struct ArrowIpcSharedBuffer* shared,
                                         struct ArrowBuffer* src) {
+  if (src->data == NULL) {
+    ArrowBufferMove(src, &shared->private_src);
+    return NANOARROW_OK;
+  }
+
   struct ArrowIpcSharedBufferPrivate* private_data =
       (struct ArrowIpcSharedBufferPrivate*)ArrowMalloc(
           sizeof(struct ArrowIpcSharedBufferPrivate));
@@ -149,6 +164,13 @@ ArrowErrorCode ArrowIpcSharedBufferInit(struct 
ArrowIpcSharedBuffer* shared,
 
 static void ArrowIpcSharedBufferClone(struct ArrowIpcSharedBuffer* shared,
                                       struct ArrowBuffer* shared_out) {
+  if (shared->private_src.data == NULL) {
+    ArrowBufferInit(shared_out);
+    shared_out->size_bytes = shared_out->size_bytes;
+    shared_out->capacity_bytes = shared_out->capacity_bytes;
+    return;
+  }
+
   struct ArrowIpcSharedBufferPrivate* private_data =
       (struct 
ArrowIpcSharedBufferPrivate*)shared->private_src.allocator.private_data;
   ArrowIpcSharedBufferUpdate(private_data, 1);
@@ -706,7 +728,7 @@ static int ArrowIpcDecoderSetField(struct ArrowSchema* 
schema, ns(Field_table_t)
                                    struct ArrowError* error) {
   // No dictionary support yet
   if (ns(Field_dictionary_is_present(field))) {
-    ArrowErrorSet(error, "Field DictionaryEncoding not supported");
+    ArrowErrorSet(error, "Schema message field with DictionaryEncoding not 
supported");
     return ENOTSUP;
   }
 
diff --git 
a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc 
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
index 824a339..ad989fc 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder_test.cc
@@ -15,7 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <stdexcept>
 #include <thread>
 
 #include <arrow/array.h>
diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_files_test.cc 
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_files_test.cc
new file mode 100644
index 0000000..6c6081d
--- /dev/null
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_files_test.cc
@@ -0,0 +1,271 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+#include <fstream>
+#include <sstream>
+
+#include <arrow/buffer.h>
+#include <arrow/c/bridge.h>
+#include <arrow/io/api.h>
+#include <arrow/ipc/api.h>
+#include <arrow/table.h>
+#include <gtest/gtest.h>
+
+#include "nanoarrow.hpp"
+#include "nanoarrow_ipc.h"
+
+#include "flatcc/portable/pendian_detect.h"
+
+using namespace arrow;
+
+// Utility to test an IPC stream written a as a file (where path does not 
include a
+// prefix that might be specific where a specific system has arrow-testing 
checked out).
+// This helper also checks a read that is supposed to be valid against what 
Arrow C++
+// would read.
+class TestFile {
+ public:
+  TestFile(std::string path, int expected_return_code, std::string 
expected_error_message)
+      : path_(path),
+        expected_return_code_(expected_return_code),
+        expected_error_message_(expected_error_message) {}
+
+  TestFile(std::string path) : TestFile(path, NANOARROW_OK, "") {}
+
+  TestFile() : TestFile("") {}
+
+  static TestFile OK(std::string path) { return TestFile(path); }
+
+  static TestFile Err(int code, std::string path, std::string message = 
"__any__") {
+    return TestFile(path, code, message);
+  }
+
+  static TestFile ErrorAny(std::string path) {
+    return Err(std::numeric_limits<int>::max(), path);
+  }
+
+  static TestFile Invalid(std::string path, std::string message = "__any__") {
+    return Err(EINVAL, path, message);
+  }
+
+  static TestFile NotSupported(std::string path, std::string message = 
"__any__") {
+    return Err(ENOTSUP, path, message);
+  }
+
+  static TestFile NoData(std::string path, std::string message = "__any__") {
+    return Err(ENODATA, path, message);
+  }
+
+  void Test(std::string dir_prefix) {
+    std::stringstream path_builder;
+    path_builder << dir_prefix << "/" << path_;
+
+    // Read the whole file into an ArrowBuffer. We need the whole thing in 
memory
+    // to avoid requiring Arrow C++ with filesystem.
+    std::ifstream infile(path_builder.str(), std::ios::in | std::ios::binary);
+    nanoarrow::UniqueBuffer content;
+    do {
+      content->size_bytes += infile.gcount();
+      ASSERT_EQ(ArrowBufferReserve(content.get(), 8096), NANOARROW_OK);
+    } while (
+        infile.read(reinterpret_cast<char*>(content->data + 
content->size_bytes), 8096));
+    content->size_bytes += infile.gcount();
+
+    // Make a copy into another buffer so we can wrap it in something Arrow C++
+    // understands
+    nanoarrow::UniqueBuffer content_copy;
+    ASSERT_EQ(ArrowBufferAppend(content_copy.get(), content->data, 
content->size_bytes),
+              NANOARROW_OK);
+
+    struct ArrowIpcInputStream input;
+    nanoarrow::UniqueArrayStream stream;
+    ASSERT_EQ(ArrowIpcInputStreamInitBuffer(&input, content.get()), 
NANOARROW_OK);
+    ASSERT_EQ(ArrowIpcArrayStreamReaderInit(stream.get(), &input, nullptr), 
NANOARROW_OK);
+
+    nanoarrow::UniqueSchema schema;
+    int result = stream->get_schema(stream.get(), schema.get());
+    if (result != NANOARROW_OK) {
+      std::string err(stream->get_last_error(stream.get()));
+      if (Check(result, err)) {
+        return;
+      }
+
+      GTEST_FAIL() << MakeError(result, err);
+    }
+
+    std::vector<nanoarrow::UniqueArray> arrays;
+    while (true) {
+      nanoarrow::UniqueArray array;
+
+      result = stream->get_next(stream.get(), array.get());
+      if (result != NANOARROW_OK) {
+        std::string err(stream->get_last_error(stream.get()));
+        if (Check(result, err)) {
+          return;
+        }
+
+        GTEST_FAIL() << MakeError(result, err);
+      }
+
+      if (array->release == nullptr) {
+        break;
+      }
+
+      arrays.push_back(std::move(array));
+    }
+
+    // If the file was supposed to fail the read but did not, fail here
+    if (expected_return_code_ != NANOARROW_OK) {
+      GTEST_FAIL() << MakeError(NANOARROW_OK, "");
+    }
+
+    // Read the same data with Arrow C++.
+    auto content_copy_wrapped =
+        Buffer::Wrap<uint8_t>(content_copy->data, content_copy->size_bytes);
+    auto buffer_reader = 
std::make_shared<io::BufferReader>(content_copy_wrapped);
+
+    // Support Arrow 9.0.0 for Fedora and Centos7 images
+#if ARROW_VERSION_MAJOR >= 10
+    auto maybe_input_stream =
+        io::RandomAccessFile::GetStream(buffer_reader, 0, 
content_copy_wrapped->size());
+    if (!maybe_input_stream.ok()) {
+      GTEST_FAIL() << maybe_input_stream.status().message();
+    }
+
+    std::shared_ptr<io::InputStream> input_stream = 
maybe_input_stream.ValueUnsafe();
+#else
+    std::shared_ptr<io::InputStream> input_stream =
+        io::RandomAccessFile::GetStream(buffer_reader, 0, 
content_copy_wrapped->size());
+#endif
+
+    auto maybe_reader =
+        ipc::RecordBatchStreamReader::Open(input_stream);
+    if (!maybe_reader.ok()) {
+      GTEST_FAIL() << maybe_reader.status().message();
+    }
+
+    auto maybe_table_arrow = maybe_reader.ValueUnsafe()->ToTable();
+    if (!maybe_table_arrow.ok()) {
+      GTEST_FAIL() << maybe_table_arrow.status().message();
+    }
+
+    // Make a Table from the our vector of arrays
+    auto maybe_schema = ImportSchema(schema.get());
+    if (!maybe_schema.ok()) {
+      GTEST_FAIL() << maybe_schema.status().message();
+    }
+
+    
ASSERT_TRUE(maybe_table_arrow.ValueUnsafe()->schema()->Equals(**maybe_schema, 
true));
+
+    std::vector<std::shared_ptr<RecordBatch>> batches;
+    for (auto& array : arrays) {
+      auto maybe_batch = ImportRecordBatch(array.get(), *maybe_schema);
+      batches.push_back(std::move(*maybe_batch));
+    }
+
+    auto maybe_table = Table::FromRecordBatches(*maybe_schema, batches);
+    EXPECT_TRUE(maybe_table.ValueUnsafe()->Equals(**maybe_table_arrow, true));
+  }
+
+  bool Check(int result, std::string msg) {
+    return (expected_return_code_ == std::numeric_limits<int>::max() &&
+            result != NANOARROW_OK) ||
+           (result == expected_return_code_ && msg == expected_error_message_) 
||
+           (result == expected_return_code_ && expected_error_message_ == 
"__any__");
+  }
+
+  std::string MakeError(int result, std::string msg) {
+    std::stringstream err;
+    err << "Expected file '" << path_ << "' to return code " << 
expected_return_code_
+        << " and error message '" << expected_error_message_ << "' but got 
return code "
+        << result << " and error message '" << msg << "'";
+    return err.str();
+  }
+
+  std::string path_;
+  int expected_return_code_;
+  std::string expected_error_message_;
+};
+
+std::ostream& operator<<(std::ostream& os, const TestFile& obj) {
+  os << obj.path_;
+  return os;
+}
+
+class ArrowTestingPathParameterizedTestFixture
+    : public ::testing::TestWithParam<TestFile> {
+ protected:
+  TestFile test_file;
+};
+
+TEST_P(ArrowTestingPathParameterizedTestFixture, 
NanoarrowIpcTestFileNativeEndian) {
+  const char* testing_dir = getenv("NANOARROW_ARROW_TESTING_DIR");
+  if (testing_dir == nullptr || strlen(testing_dir) == 0) {
+    GTEST_SKIP() << "NANOARROW_ARROW_TESTING_DIR environment variable not set";
+  }
+
+  std::stringstream dir_builder;
+
+#if defined(__BIG_ENDIAN__)
+  dir_builder << testing_dir << 
"/data/arrow-ipc-stream/integration/1.0.0-bigendian";
+#else
+  dir_builder << testing_dir << 
"/data/arrow-ipc-stream/integration/1.0.0-littleendian";
+#endif
+  TestFile param = GetParam();
+  param.Test(dir_builder.str());
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    NanoarrowIpcTest, ArrowTestingPathParameterizedTestFixture,
+    ::testing::Values(
+        // Files in data/arrow-ipc-stream/integration/1.0.0-(little|big)endian/
+        // should read without error and the data should match Arrow C++'s read
+        TestFile::OK("generated_custom_metadata.stream"),
+        TestFile::OK("generated_datetime.stream"),
+        TestFile::OK("generated_decimal.stream"),
+        TestFile::OK("generated_decimal256.stream"),
+
+        TestFile::OK("generated_duplicate_fieldnames.stream"),
+        TestFile::OK("generated_interval.stream"),
+        TestFile::OK("generated_map_non_canonical.stream"),
+        TestFile::OK("generated_map.stream"),
+        TestFile::OK("generated_nested_large_offsets.stream"),
+        TestFile::OK("generated_nested.stream"),
+        TestFile::OK("generated_null_trivial.stream"),
+        TestFile::OK("generated_null.stream"),
+        TestFile::OK("generated_primitive_large_offsets.stream"),
+        TestFile::OK("generated_primitive_no_batches.stream"),
+        TestFile::OK("generated_primitive_zerolength.stream"),
+        TestFile::OK("generated_primitive.stream"),
+        TestFile::OK("generated_recursive_nested.stream"),
+        TestFile::OK("generated_union.stream"),
+
+        // Files with features that are not yet supported (Dictionary encoding)
+        TestFile::NotSupported(
+            "generated_dictionary_unsigned.stream",
+            "Schema message field with DictionaryEncoding not supported"),
+        TestFile::NotSupported(
+            "generated_dictionary.stream",
+            "Schema message field with DictionaryEncoding not supported"),
+        TestFile::NotSupported(
+            "generated_nested_dictionary.stream",
+            "Schema message field with DictionaryEncoding not supported"),
+        TestFile::NotSupported(
+            "generated_extension.stream",
+            "Schema message field with DictionaryEncoding not supported")
+        // Comment to keep last line from wrapping
+        ));
diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c
index 30422d2..ae1db4d 100644
--- a/src/nanoarrow/array.c
+++ b/src/nanoarrow/array.c
@@ -910,10 +910,6 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct 
ArrowArrayView* array_view,
                                           struct ArrowError* error) {
   for (int i = 0; i < 3; i++) {
     switch (array_view->layout.buffer_type[i]) {
-      case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
-        NANOARROW_RETURN_NOT_OK(
-            ArrowAssertIncreasingInt32(array_view->buffer_views[i], error));
-        break;
       case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
         if (array_view->layout.element_size_bits[i] == 32) {
           NANOARROW_RETURN_NOT_OK(

Reply via email to