(arrow-nanoarrow) branch main updated: feat: Add C Data integration test shared library (#337)

paleolimbot Tue, 19 Dec 2023 06:14:54 -0800

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 12be163  feat: Add C Data integration test shared library (#337)
12be163 is described below

commit 12be163269f3c1b27a09c58c58ba388acbd36b86
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Dec 19 10:14:12 2023 -0400

    feat: Add C Data integration test shared library (#337)
    
    This PR adds the shared library target required by the archery
    integration tester, based on
    
    https://github.com/apache/arrow/blob/main/cpp/src/arrow/integration/c_data_integration_internal.cc
    .
    
    I haven't tested this via archery because I have no idea how to do so
    (the implementation names and file locations seem hard-coded?); however,
    it does add a googletest file with some minimal examples to at least
    make sure everything is wired up.
    
    ---------
    
    Co-authored-by: Antoine Pitrou <[email protected]>
---
 CMakeLists.txt                                     |  36 ++--
 .../src/nanoarrow/nanoarrow_ipc_files_test.cc      |   4 +-
 src/nanoarrow/integration/c_data_integration.cc    | 224 +++++++++++++++++++++
 src/nanoarrow/integration/c_data_integration.h     |  90 +++++++++
 .../integration/c_data_integration_test.cc         | 160 +++++++++++++++
 src/nanoarrow/nanoarrow_testing.hpp                |  47 ++++-
 src/nanoarrow/nanoarrow_testing_test.cc            |   8 +-
 7 files changed, 551 insertions(+), 18 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a48769f..1e44b6f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,8 @@ set(NANOARROW_VERSION_MINOR "${nanoarrow_VERSION_MINOR}")
 set(NANOARROW_VERSION_PATCH "${nanoarrow_VERSION_PATCH}")
 
 option(NANOARROW_BUILD_TESTS "Build tests" OFF)
+option(NANOARROW_BUILD_INTEGRATION_TESTS
+       "Build cross-implementation Arrow integration tests" OFF)
 option(NANOARROW_BUNDLE "Create bundled nanoarrow.h and nanoarrow.c" OFF)
 option(NANOARROW_BUNDLE_AS_CPP "Bundle nanoarrow source file as nanoarrow.cc" 
OFF)
 option(NANOARROW_NAMESPACE "A prefix for exported symbols" OFF)
@@ -151,10 +153,25 @@ else()
           DESTINATION include/nanoarrow)
 endif()
 
-if(NANOARROW_BUILD_TESTS)
-  # For testing we use GTest + Arrow C++
+# Always build integration test if building tests
+if(NANOARROW_BUILD_TESTS OR NANOARROW_BUILD_INTEGRATION_TESTS)
   include(FetchContent)
 
+  fetchcontent_declare(nlohmann_json
+                       URL 
https://github.com/nlohmann/json/archive/refs/tags/v3.11.2.zip
+                       URL_HASH 
SHA256=95651d7d1fcf2e5c3163c3d37df6d6b3e9e5027299e6bd050d157322ceda9ac9
+  )
+  fetchcontent_makeavailable(nlohmann_json)
+
+  add_library(nanoarrow_c_data_integration SHARED
+              src/nanoarrow/integration/c_data_integration.cc)
+  target_include_directories(nanoarrow_c_data_integration
+                             PUBLIC 
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
+                                    $<INSTALL_INTERFACE:include>)
+  target_link_libraries(nanoarrow_c_data_integration PRIVATE nanoarrow 
nlohmann_json)
+endif()
+
+if(NANOARROW_BUILD_TESTS)
   set(MEMORYCHECK_COMMAND_OPTIONS
       "--leak-check=full 
--suppressions=${CMAKE_CURRENT_LIST_DIR}/valgrind.supp --error-exitcode=1"
   )
@@ -203,16 +220,6 @@ if(NANOARROW_BUILD_TESTS)
 
   fetchcontent_makeavailable(googletest)
 
-  # JSON library for integration testing
-  # Also used by some versions of Arrow, so check if this is already available
-  if(NOT TARGET nlohmann_json::nlohmann_json)
-    fetchcontent_declare(nlohmann_json
-                         URL 
https://github.com/nlohmann/json/archive/refs/tags/v3.11.2.zip
-                         URL_HASH 
SHA256=95651d7d1fcf2e5c3163c3d37df6d6b3e9e5027299e6bd050d157322ceda9ac9
-    )
-    fetchcontent_makeavailable(nlohmann_json)
-  endif()
-
   add_executable(utils_test src/nanoarrow/utils_test.cc)
   add_executable(buffer_test src/nanoarrow/buffer_test.cc)
   add_executable(array_test src/nanoarrow/array_test.cc)
@@ -220,6 +227,8 @@ if(NANOARROW_BUILD_TESTS)
   add_executable(array_stream_test src/nanoarrow/array_stream_test.cc)
   add_executable(nanoarrow_hpp_test src/nanoarrow/nanoarrow_hpp_test.cc)
   add_executable(nanoarrow_testing_test 
src/nanoarrow/nanoarrow_testing_test.cc)
+  add_executable(c_data_integration_test
+                 src/nanoarrow/integration/c_data_integration_test.cc)
 
   if(NANOARROW_CODE_COVERAGE)
     target_compile_options(coverage_config INTERFACE -O0 -g --coverage)
@@ -250,6 +259,8 @@ if(NANOARROW_BUILD_TESTS)
                         gtest_main
                         nlohmann_json::nlohmann_json
                         coverage_config)
+  target_link_libraries(c_data_integration_test nanoarrow 
nanoarrow_c_data_integration
+                        gtest_main)
 
   include(GoogleTest)
   # Some users have reported a timeout with the default value of 5
@@ -262,4 +273,5 @@ if(NANOARROW_BUILD_TESTS)
   gtest_discover_tests(array_stream_test DISCOVERY_TIMEOUT 10)
   gtest_discover_tests(nanoarrow_hpp_test DISCOVERY_TIMEOUT 10)
   gtest_discover_tests(nanoarrow_testing_test DISCOVERY_TIMEOUT 10)
+  gtest_discover_tests(c_data_integration_test DISCOVERY_TIMEOUT 10)
 endif()
diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_files_test.cc 
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_files_test.cc
index 7bbd5e7..b7d922c 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_files_test.cc
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_files_test.cc
@@ -122,7 +122,9 @@ class TestFile {
 
     // Use testing util to populate the array stream
     nanoarrow::testing::TestingJSONReader reader;
-    NANOARROW_RETURN_NOT_OK(reader.ReadDataFile(json_string, out, error));
+    NANOARROW_RETURN_NOT_OK(reader.ReadDataFile(
+        json_string, out, 
nanoarrow::testing::TestingJSONReader::kNumBatchReadAll,
+        error));
     return NANOARROW_OK;
   }
 
diff --git a/src/nanoarrow/integration/c_data_integration.cc 
b/src/nanoarrow/integration/c_data_integration.cc
new file mode 100644
index 0000000..6c391ec
--- /dev/null
+++ b/src/nanoarrow/integration/c_data_integration.cc
@@ -0,0 +1,224 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <nanoarrow/nanoarrow_testing.hpp>
+#include "c_data_integration.h"
+
+static int64_t kBytesAllocated = 0;
+
+static uint8_t* IntegrationTestReallocate(ArrowBufferAllocator* allocator, 
uint8_t* ptr,
+                                          int64_t old_size, int64_t new_size) {
+  ArrowBufferAllocator default_allocator = ArrowBufferAllocatorDefault();
+  kBytesAllocated -= old_size;
+  uint8_t* out =
+      default_allocator.reallocate(&default_allocator, ptr, old_size, 
new_size);
+  if (out != nullptr) {
+    kBytesAllocated += new_size;
+  }
+
+  return out;
+}
+
+static void IntegrationTestFree(struct ArrowBufferAllocator* allocator, 
uint8_t* ptr,
+                                int64_t size) {
+  ArrowBufferAllocator default_allocator = ArrowBufferAllocatorDefault();
+  kBytesAllocated -= size;
+  default_allocator.free(&default_allocator, ptr, size);
+}
+
+static ArrowBufferAllocator IntegrationTestAllocator() {
+  ArrowBufferAllocator allocator;
+  allocator.reallocate = &IntegrationTestReallocate;
+  allocator.free = &IntegrationTestFree;
+  allocator.private_data = nullptr;
+  return allocator;
+}
+
+static ArrowErrorCode ReadFileString(std::ostream& out, const std::string& 
file_path) {
+  std::ifstream infile(file_path, std::ios::in | std::ios::binary);
+  char buf[8096];
+  do {
+    infile.read(buf, sizeof(buf));
+    out << std::string(buf, infile.gcount());
+  } while (infile.gcount() > 0);
+
+  infile.close();
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrayStreamFromJsonFilePath(const std::string& json_path,
+                                                  ArrowArrayStream* out, int 
num_batch,
+                                                  ArrowError* error) {
+  std::stringstream ss;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(ReadFileString(ss, json_path), error);
+
+  nanoarrow::testing::TestingJSONReader reader(IntegrationTestAllocator());
+  NANOARROW_RETURN_NOT_OK(reader.ReadDataFile(ss.str(), out, num_batch, 
error));
+  return NANOARROW_OK;
+}
+
+struct MaterializedArrayStream {
+  nanoarrow::UniqueSchema schema;
+  std::vector<nanoarrow::UniqueArray> arrays;
+};
+
+static ArrowErrorCode MaterializeJsonFilePath(const std::string& json_path,
+                                              MaterializedArrayStream* out, 
int num_batch,
+                                              ArrowError* error) {
+  nanoarrow::UniqueArrayStream stream;
+  NANOARROW_RETURN_NOT_OK(
+      ArrayStreamFromJsonFilePath(json_path, stream.get(), num_batch, error));
+
+  int result = stream->get_schema(stream.get(), out->schema.get());
+  if (result != NANOARROW_OK) {
+    const char* err = stream->get_last_error(stream.get());
+    if (err != nullptr) {
+      ArrowErrorSet(error, "%s", err);
+    }
+  }
+
+  nanoarrow::UniqueArray tmp;
+  do {
+    tmp.reset();
+    int result = stream->get_next(stream.get(), tmp.get());
+    if (result != NANOARROW_OK) {
+      const char* err = stream->get_last_error(stream.get());
+      if (err != nullptr) {
+        ArrowErrorSet(error, "%s", err);
+      }
+
+      return result;
+    }
+
+    if (tmp->release == nullptr) {
+      break;
+    }
+
+    out->arrays.emplace_back(tmp.get());
+  } while (true);
+
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ExportSchemaFromJson(const char* json_path, ArrowSchema* 
out,
+                                           ArrowError* error) {
+  MaterializedArrayStream data;
+  NANOARROW_RETURN_NOT_OK(MaterializeJsonFilePath(
+      json_path, &data, 
nanoarrow::testing::TestingJSONReader::kNumBatchOnlySchema,
+      error));
+  ArrowSchemaMove(data.schema.get(), out);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ImportSchemaAndCompareToJson(const char* json_path,
+                                                   ArrowSchema* schema,
+                                                   ArrowError* error) {
+  nanoarrow::UniqueSchema actual(schema);
+
+  MaterializedArrayStream data;
+  NANOARROW_RETURN_NOT_OK(MaterializeJsonFilePath(
+      json_path, &data, 
nanoarrow::testing::TestingJSONReader::kNumBatchOnlySchema,
+      error));
+
+  nanoarrow::testing::TestingJSONComparison comparison;
+  NANOARROW_RETURN_NOT_OK(
+      comparison.CompareSchema(actual.get(), data.schema.get(), error));
+  if (comparison.num_differences() > 0) {
+    std::stringstream ss;
+    comparison.WriteDifferences(ss);
+    ArrowErrorSet(error, "Found %d differences:\n%s",
+                  static_cast<int>(comparison.num_differences()), 
ss.str().c_str());
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ExportBatchFromJson(const char* json_path, int num_batch,
+                                          ArrowArray* out, ArrowError* error) {
+  MaterializedArrayStream data;
+  NANOARROW_RETURN_NOT_OK(MaterializeJsonFilePath(json_path, &data, num_batch, 
error));
+
+  ArrowArrayMove(data.arrays[num_batch].get(), out);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ImportBatchAndCompareToJson(const char* json_path, int 
num_batch,
+                                                  ArrowArray* batch, 
ArrowError* error) {
+  nanoarrow::UniqueArray actual(batch);
+
+  MaterializedArrayStream data;
+  NANOARROW_RETURN_NOT_OK(MaterializeJsonFilePath(json_path, &data, num_batch, 
error));
+
+  nanoarrow::testing::TestingJSONComparison comparison;
+  NANOARROW_RETURN_NOT_OK(comparison.SetSchema(data.schema.get(), error));
+  NANOARROW_RETURN_NOT_OK(
+      comparison.CompareBatch(actual.get(), data.arrays[num_batch].get(), 
error));
+  if (comparison.num_differences() > 0) {
+    std::stringstream ss;
+    comparison.WriteDifferences(ss);
+    ArrowErrorSet(error, "Found %d differences:\n%s",
+                  static_cast<int>(comparison.num_differences()), 
ss.str().c_str());
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+static ArrowError global_error;
+
+static const char* ConvertError(ArrowErrorCode errno_code) {
+  if (errno_code == NANOARROW_OK) {
+    return nullptr;
+  } else {
+    return global_error.message;
+  }
+}
+
+int64_t nanoarrow_BytesAllocated() { return kBytesAllocated; }
+
+const char* nanoarrow_CDataIntegration_ExportSchemaFromJson(const char* 
json_path,
+                                                            ArrowSchema* out) {
+  ArrowErrorInit(&global_error);
+  return ConvertError(ExportSchemaFromJson(json_path, out, &global_error));
+}
+
+const char* nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson(const 
char* json_path,
+                                                                    
ArrowSchema* schema) {
+  ArrowErrorInit(&global_error);
+  return ConvertError(ImportSchemaAndCompareToJson(json_path, schema, 
&global_error));
+}
+
+const char* nanoarrow_CDataIntegration_ExportBatchFromJson(const char* 
json_path,
+                                                           int num_batch,
+                                                           ArrowArray* out) {
+  ArrowErrorInit(&global_error);
+  return ConvertError(ExportBatchFromJson(json_path, num_batch, out, 
&global_error));
+}
+
+const char* nanoarrow_CDataIntegration_ImportBatchAndCompareToJson(const char* 
json_path,
+                                                                   int 
num_batch,
+                                                                   ArrowArray* 
batch) {
+  ArrowErrorInit(&global_error);
+  return ConvertError(
+      ImportBatchAndCompareToJson(json_path, num_batch, batch, &global_error));
+}
diff --git a/src/nanoarrow/integration/c_data_integration.h 
b/src/nanoarrow/integration/c_data_integration.h
new file mode 100644
index 0000000..cf76c1b
--- /dev/null
+++ b/src/nanoarrow/integration/c_data_integration.h
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_INTEGRATION_C_DATA_INTEGRATION_H_INCLUDED
+#define NANOARROW_INTEGRATION_C_DATA_INTEGRATION_H_INCLUDED
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Extra guard for versions of Arrow without the canonical guard
+#ifndef ARROW_FLAG_DICTIONARY_ORDERED
+
+#ifndef ARROW_C_DATA_INTERFACE
+#define ARROW_C_DATA_INTERFACE
+
+#define ARROW_FLAG_DICTIONARY_ORDERED 1
+#define ARROW_FLAG_NULLABLE 2
+#define ARROW_FLAG_MAP_KEYS_SORTED 4
+
+struct ArrowSchema {
+  // Array type description
+  const char* format;
+  const char* name;
+  const char* metadata;
+  int64_t flags;
+  int64_t n_children;
+  struct ArrowSchema** children;
+  struct ArrowSchema* dictionary;
+
+  // Release callback
+  void (*release)(struct ArrowSchema*);
+  // Opaque producer-specific data
+  void* private_data;
+};
+
+struct ArrowArray {
+  // Array data description
+  int64_t length;
+  int64_t null_count;
+  int64_t offset;
+  int64_t n_buffers;
+  int64_t n_children;
+  const void** buffers;
+  struct ArrowArray** children;
+  struct ArrowArray* dictionary;
+
+  // Release callback
+  void (*release)(struct ArrowArray*);
+  // Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DATA_INTERFACE
+#endif  // ARROW_FLAG_DICTIONARY_ORDERED
+
+const char* nanoarrow_CDataIntegration_ExportSchemaFromJson(const char* 
json_path,
+                                                            struct 
ArrowSchema* out);
+
+const char* nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson(
+    const char* json_path, struct ArrowSchema* schema);
+
+const char* nanoarrow_CDataIntegration_ExportBatchFromJson(const char* 
json_path,
+                                                           int num_batch,
+                                                           struct ArrowArray* 
out);
+
+const char* nanoarrow_CDataIntegration_ImportBatchAndCompareToJson(
+    const char* json_path, int num_batch, struct ArrowArray* batch);
+
+int64_t nanoarrow_BytesAllocated(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/nanoarrow/integration/c_data_integration_test.cc 
b/src/nanoarrow/integration/c_data_integration_test.cc
new file mode 100644
index 0000000..533dd70
--- /dev/null
+++ b/src/nanoarrow/integration/c_data_integration_test.cc
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdio>
+#include <fstream>
+
+#include <gtest/gtest.h>
+
+#include <nanoarrow/nanoarrow.hpp>
+
+#include "c_data_integration.h"
+
+// Not a true tempfile (writes to working directory), but is possibly more
+// portable than mkstemp()
+class TempFile {
+ public:
+  const char* name() { return name_; }
+
+  ~TempFile() {
+    if (std::remove(name_) != 0) {
+      std::cerr << "Failed to remove '" << name_ << "'\n";
+    }
+  }
+
+ private:
+  const char* name_ = "c_data_integration.tmp.json";
+};
+
+ArrowErrorCode WriteFileString(const std::string& content, const std::string& 
path) {
+  std::ofstream outfile(path, std::ios::out | std::ios::binary);
+  outfile.write(content.data(), content.size());
+  outfile.close();
+  return NANOARROW_OK;
+}
+
+TEST(NanoarrowIntegrationTest, NanoarrowIntegrationTestSchema) {
+  TempFile temp;
+  nanoarrow::UniqueSchema schema;
+
+  // Check error on export
+  ASSERT_EQ(WriteFileString("this is not valid JSON", temp.name()), 
NANOARROW_OK);
+  const char* err =
+      nanoarrow_CDataIntegration_ExportSchemaFromJson(temp.name(), 
schema.get());
+  ASSERT_NE(err, nullptr);
+  ASSERT_EQ(std::string(err).substr(0, 9), "Exception");
+
+  // Check valid roundtrip
+  ASSERT_EQ(WriteFileString(R"({"schema": {"fields": []}, "batches": []})", 
temp.name()),
+            NANOARROW_OK);
+
+  err = nanoarrow_CDataIntegration_ExportSchemaFromJson(temp.name(), 
schema.get());
+  ASSERT_EQ(err, nullptr) << err;
+  ASSERT_NE(schema->release, nullptr);
+
+  err =
+      nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson(temp.name(), 
schema.get());
+  ASSERT_EQ(err, nullptr) << err;
+  ASSERT_EQ(schema->release, nullptr);
+
+  // Check roundtrip with differences
+  ASSERT_EQ(WriteFileString(R"({"schema": {"fields": []}, "batches": []})", 
temp.name()),
+            NANOARROW_OK);
+
+  err = nanoarrow_CDataIntegration_ExportSchemaFromJson(temp.name(), 
schema.get());
+  ASSERT_EQ(err, nullptr) << err;
+  ASSERT_NE(schema->release, nullptr);
+
+  // Change underlying JSON so we get differences
+  ASSERT_EQ(
+      WriteFileString(
+          R"({"schema": {"fields": [{"name": "col1", "nullable": true, "type": 
{"name": "null"}, "children": []}]}, "batches": []})",
+          temp.name()),
+      NANOARROW_OK);
+  err =
+      nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson(temp.name(), 
schema.get());
+  ASSERT_NE(err, nullptr);
+  ASSERT_EQ(std::string(err).substr(0, 19), "Found 1 differences") << err;
+  ASSERT_EQ(schema->release, nullptr);
+}
+
+TEST(NanoarrowIntegrationTest, NanoarrowIntegrationTestBatch) {
+  TempFile temp;
+  nanoarrow::UniqueArray array;
+  int64_t bytes_allocated_start = nanoarrow_BytesAllocated();
+
+  // Check error on export
+  ASSERT_EQ(WriteFileString("this is not valid JSON", temp.name()), 
NANOARROW_OK);
+  const char* err =
+      nanoarrow_CDataIntegration_ExportBatchFromJson(temp.name(), 0, 
array.get());
+  ASSERT_NE(err, nullptr);
+  ASSERT_EQ(std::string(err).substr(0, 9), "Exception") << err;
+
+  // Check error for invalid batch id
+  ASSERT_EQ(
+      WriteFileString(
+          R"({"schema": {)"
+          R"("fields": [{"name": "col1", "nullable": true, "type": {"name": 
"utf8"}, "children": []}]}, )"
+          R"("batches": [{"count": 1, "columns": [{"name": "col1", "count": 1, 
"VALIDITY": [1], "OFFSET": [0, 3], "DATA": ["abc"]}]}]})",
+          temp.name()),
+      NANOARROW_OK);
+  err = nanoarrow_CDataIntegration_ExportBatchFromJson(temp.name(), 1, 
array.get());
+  ASSERT_EQ(array->release, nullptr);
+  ASSERT_NE(err, nullptr);
+  ASSERT_STREQ(err, "Expected num_batch between 0 and 0 but got 1") << err;
+
+  err = nanoarrow_CDataIntegration_ExportBatchFromJson(temp.name(), 0, 
array.get());
+  ASSERT_EQ(err, nullptr) << err;
+  ASSERT_NE(array->release, nullptr);
+  err =
+      nanoarrow_CDataIntegration_ImportBatchAndCompareToJson(temp.name(), 1, 
array.get());
+  ASSERT_EQ(array->release, nullptr);
+  ASSERT_STREQ(err, "Expected num_batch between 0 and 0 but got 1") << err;
+
+  // Check valid roundtrip
+  err = nanoarrow_CDataIntegration_ExportBatchFromJson(temp.name(), 0, 
array.get());
+  ASSERT_NE(array->release, nullptr);
+  ASSERT_EQ(err, nullptr);
+  ASSERT_GT(nanoarrow_BytesAllocated(), bytes_allocated_start);
+
+  err =
+      nanoarrow_CDataIntegration_ImportBatchAndCompareToJson(temp.name(), 0, 
array.get());
+  ASSERT_EQ(array->release, nullptr);
+  ASSERT_EQ(err, nullptr) << err;
+  ASSERT_EQ(nanoarrow_BytesAllocated(), bytes_allocated_start);
+
+  // Check roundtrip with differences
+  err = nanoarrow_CDataIntegration_ExportBatchFromJson(temp.name(), 0, 
array.get());
+  ASSERT_NE(array->release, nullptr);
+  ASSERT_EQ(err, nullptr) << err;
+  ASSERT_GT(nanoarrow_BytesAllocated(), bytes_allocated_start);
+
+  ASSERT_EQ(
+      WriteFileString(
+          R"({"schema": {)"
+          R"("fields": [{"name": "col1", "nullable": true, "type": {"name": 
"utf8"}, "children": []}]}, )"
+          R"("batches": [{"count": 0, "columns": [{"name": "col1", "count": 0, 
"VALIDITY": [], "OFFSET": [0], "DATA": []}]}]})",
+          temp.name()),
+      NANOARROW_OK);
+  err =
+      nanoarrow_CDataIntegration_ImportBatchAndCompareToJson(temp.name(), 0, 
array.get());
+  ASSERT_NE(err, nullptr);
+  ASSERT_EQ(std::string(err).substr(0, 19), "Found 2 differences") << err;
+  ASSERT_EQ(array->release, nullptr);
+
+  ASSERT_EQ(nanoarrow_BytesAllocated(), bytes_allocated_start);
+}
diff --git a/src/nanoarrow/nanoarrow_testing.hpp 
b/src/nanoarrow/nanoarrow_testing.hpp
index c2502a1..5cd40e4 100644
--- a/src/nanoarrow/nanoarrow_testing.hpp
+++ b/src/nanoarrow/nanoarrow_testing.hpp
@@ -733,11 +733,18 @@ class TestingJSONReader {
   using json = nlohmann::json;
 
  public:
+  TestingJSONReader(ArrowBufferAllocator allocator) : allocator_(allocator) {}
+  TestingJSONReader() : TestingJSONReader(ArrowBufferAllocatorDefault()) {}
+
+  static const int kNumBatchOnlySchema = -2;
+  static const int kNumBatchReadAll = -1;
+
   /// \brief Read JSON representing a data file object
   ///
   /// Read a JSON object in the form `{"schema": {...}, "batches": [...], 
...}`,
   /// propagating `out` on success.
   ArrowErrorCode ReadDataFile(const std::string& data_file_json, 
ArrowArrayStream* out,
+                              int num_batch = kNumBatchReadAll,
                               ArrowError* error = nullptr) {
     try {
       auto obj = json::parse(data_file_json);
@@ -760,18 +767,34 @@ class TestingJSONReader {
       NANOARROW_RETURN_NOT_OK(
           ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), error));
 
+      // Get a vector of batch ids to parse
+      std::vector<size_t> batch_ids;
+      if (num_batch == kNumBatchOnlySchema) {
+        batch_ids.resize(0);
+      } else if (num_batch == kNumBatchReadAll) {
+        batch_ids.resize(batches.size());
+        std::iota(batch_ids.begin(), batch_ids.end(), 0);
+      } else if (num_batch >= 0 && num_batch < batches.size()) {
+        batch_ids.push_back(num_batch);
+      } else {
+        ArrowErrorSet(error, "Expected num_batch between 0 and %d but got %d",
+                      static_cast<int>(batches.size() - 1), num_batch);
+        return EINVAL;
+      }
+
       // Initialize ArrayStream with required capacity
       nanoarrow::UniqueArrayStream stream;
       NANOARROW_RETURN_NOT_OK_WITH_ERROR(
-          ArrowBasicArrayStreamInit(stream.get(), schema.get(), 
batches.size()), error);
+          ArrowBasicArrayStreamInit(stream.get(), schema.get(), 
batch_ids.size()), error);
 
       // Populate ArrayStream batches
-      for (size_t i = 0; i < batches.size(); i++) {
+      for (size_t i = 0; i < batch_ids.size(); i++) {
         nanoarrow::UniqueArray array;
         NANOARROW_RETURN_NOT_OK(
             ArrowArrayInitFromArrayView(array.get(), array_view.get(), error));
+        SetArrayAllocatorRecursive(array.get());
         NANOARROW_RETURN_NOT_OK(
-            SetArrayBatch(batches[i], array_view.get(), array.get(), error));
+            SetArrayBatch(batches[batch_ids[i]], array_view.get(), 
array.get(), error));
         ArrowBasicArrayStreamSetArray(stream.get(), i, array.get());
       }
 
@@ -839,6 +862,7 @@ class TestingJSONReader {
       // ArrowArray to hold memory
       nanoarrow::UniqueArray array;
       NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromSchema(array.get(), schema, 
error));
+      SetArrayAllocatorRecursive(array.get());
 
       NANOARROW_RETURN_NOT_OK(SetArrayBatch(obj, array_view.get(), 
array.get(), error));
       ArrowArrayMove(array.get(), out);
@@ -867,6 +891,7 @@ class TestingJSONReader {
       // ArrowArray to hold memory
       nanoarrow::UniqueArray array;
       NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromSchema(array.get(), schema, 
error));
+      SetArrayAllocatorRecursive(array.get());
 
       // Parse the JSON into the array
       NANOARROW_RETURN_NOT_OK(SetArrayColumn(obj, array_view.get(), 
array.get(), error));
@@ -881,6 +906,8 @@ class TestingJSONReader {
   }
 
  private:
+  ArrowBufferAllocator allocator_;
+
   ArrowErrorCode SetSchema(ArrowSchema* schema, const json& value, ArrowError* 
error) {
     NANOARROW_RETURN_NOT_OK(
         Check(value.is_object(), error, "Expected Schema to be a JSON 
object"));
@@ -1713,6 +1740,20 @@ class TestingJSONReader {
     return NANOARROW_OK;
   }
 
+  void SetArrayAllocatorRecursive(ArrowArray* array) {
+    for (int i = 0; i < array->n_buffers; i++) {
+      ArrowArrayBuffer(array, i)->allocator = allocator_;
+    }
+
+    for (int64_t i = 0; i < array->n_children; i++) {
+      SetArrayAllocatorRecursive(array->children[i]);
+    }
+
+    if (array->dictionary != nullptr) {
+      SetArrayAllocatorRecursive(array->dictionary);
+    }
+  }
+
   ArrowErrorCode PrefixError(ArrowErrorCode value, ArrowError* error,
                              const std::string& prefix) {
     if (value != NANOARROW_OK && error != nullptr) {
diff --git a/src/nanoarrow/nanoarrow_testing_test.cc 
b/src/nanoarrow/nanoarrow_testing_test.cc
index d47159f..217bf9d 100644
--- a/src/nanoarrow/nanoarrow_testing_test.cc
+++ b/src/nanoarrow/nanoarrow_testing_test.cc
@@ -824,7 +824,9 @@ TEST(NanoarrowTestingTest, 
NanoarrowTestingTestRoundtripDataFile) {
       R"(]})";
 
   TestingJSONReader reader;
-  ASSERT_EQ(reader.ReadDataFile(data_file_json, stream.get(), &error), 
NANOARROW_OK)
+  ASSERT_EQ(reader.ReadDataFile(data_file_json, stream.get(),
+                                TestingJSONReader::kNumBatchReadAll, &error),
+            NANOARROW_OK)
       << error.message;
 
   TestingJSONWriter writer;
@@ -837,7 +839,9 @@ TEST(NanoarrowTestingTest, 
NanoarrowTestingTestRoundtripDataFile) {
 
   // Check with zero batches
   std::string data_file_json_empty = R"({"schema": {"fields": []}, "batches": 
[]})";
-  ASSERT_EQ(reader.ReadDataFile(data_file_json_empty, stream.get(), &error), 
NANOARROW_OK)
+  ASSERT_EQ(reader.ReadDataFile(data_file_json_empty, stream.get(),
+                                TestingJSONReader::kNumBatchReadAll, &error),
+            NANOARROW_OK)
       << error.message;
   ASSERT_EQ(writer.WriteDataFile(data_file_json_roundtrip, stream.get()), 
NANOARROW_OK);
   EXPECT_EQ(data_file_json_roundtrip.str(), data_file_json_empty);

(arrow-nanoarrow) branch main updated: feat: Add C Data integration test shared library (#337)

Reply via email to