(arrow-nanoarrow) branch main updated: chore: Improve tidiness of nanoarrow_testing.hpp (#667)

paleolimbot Tue, 29 Oct 2024 13:38:06 -0700

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 475503ef chore: Improve tidiness of nanoarrow_testing.hpp (#667)
475503ef is described below

commit 475503ef551e7e173c5a950f697c8f2ac25dab72
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Oct 29 20:37:51 2024 +0000

    chore: Improve tidiness of nanoarrow_testing.hpp (#667)
    
    There were still a number of implementation details left in
    nanoarrow_testing.hpp. As part of the effort to improve the testing
    utilities and remove the Arrow C++ requirement, a few things might need
    to get added here and this seemed like a good place to start.
---
 src/nanoarrow/integration/ipc_integration.cc |   1 +
 src/nanoarrow/nanoarrow.hpp                  |   6 +-
 src/nanoarrow/nanoarrow_device.hpp           |   6 +-
 src/nanoarrow/nanoarrow_testing.hpp          | 457 ++----------------------
 src/nanoarrow/testing/testing.cc             | 507 ++++++++++++++++++++++++++-
 5 files changed, 524 insertions(+), 453 deletions(-)

diff --git a/src/nanoarrow/integration/ipc_integration.cc b/src/nanoarrow/integration/ipc_integration.cc
index f0f29673..47ec9392 100644
--- a/src/nanoarrow/integration/ipc_integration.cc
+++ b/src/nanoarrow/integration/ipc_integration.cc
@@ -16,6 +16,7 @@
 // under the License.
 
 #include <cstdlib>
+#include <sstream>
 
 #include <nanoarrow/nanoarrow_ipc.hpp>
 #include <nanoarrow/nanoarrow_testing.hpp>
diff --git a/src/nanoarrow/nanoarrow.hpp b/src/nanoarrow/nanoarrow.hpp
index 138a4ac1..84125ada 100644
--- a/src/nanoarrow/nanoarrow.hpp
+++ b/src/nanoarrow/nanoarrow.hpp
@@ -15,6 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#ifndef NANOARROW_HPP_INCLUDED
+#define NANOARROW_HPP_INCLUDED
+
 #include <cstring>
 #include <exception>
 #include <string>
@@ -22,9 +25,6 @@
 
 #include "nanoarrow/nanoarrow.h"
 
-#ifndef NANOARROW_HPP_INCLUDED
-#define NANOARROW_HPP_INCLUDED
-
 /// \defgroup nanoarrow_hpp Nanoarrow C++ Helpers
 ///
 /// The utilities provided in this file are intended to support C++ users
diff --git a/src/nanoarrow/nanoarrow_device.hpp b/src/nanoarrow/nanoarrow_device.hpp
index c83eaf7f..9b9ff789 100644
--- a/src/nanoarrow/nanoarrow_device.hpp
+++ b/src/nanoarrow/nanoarrow_device.hpp
@@ -15,12 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "nanoarrow/nanoarrow.hpp"
-#include "nanoarrow/nanoarrow_device.h"
-
 #ifndef NANOARROW_DEVICE_HPP_INCLUDED
 #define NANOARROW_DEVICE_HPP_INCLUDED
 
+#include "nanoarrow/nanoarrow.hpp"
+#include "nanoarrow/nanoarrow_device.h"
+
 namespace nanoarrow {
 
 namespace internal {
diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp
index 981e78e2..b26e857d 100644
--- a/src/nanoarrow/nanoarrow_testing.hpp
+++ b/src/nanoarrow/nanoarrow_testing.hpp
@@ -15,18 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <algorithm>
+#ifndef NANOARROW_TESTING_HPP_INCLUDED
+#define NANOARROW_TESTING_HPP_INCLUDED
+
 #include <iostream>
-#include <limits>
-#include <sstream>
+#include <memory>
 #include <string>
-#include <unordered_map>
 
 #include "nanoarrow/nanoarrow.hpp"
 
-#ifndef NANOARROW_TESTING_HPP_INCLUDED
-#define NANOARROW_TESTING_HPP_INCLUDED
-
 /// \defgroup nanoarrow_testing Nanoarrow Testing Helpers
 ///
 /// Utilities for testing nanoarrow structures and functions.
@@ -35,95 +32,10 @@ namespace nanoarrow {
 
 namespace testing {
 
+// Forward-declaration of internal types
 namespace internal {
-
-// Internal representation of the various structures needed to import and/or export
-// a dictionary array. We use a serialized version of the dictionary value because
-// nanoarrow doesn't currently have the ability to copy or reference count an Array.
-struct Dictionary {
-  nanoarrow::UniqueSchema schema;
-  int64_t column_length;
-  std::string column_json;
-};
-
-class DictionaryContext {
- public:
-  DictionaryContext() : next_id_(0) {}
-
-  ArrowErrorCode RecordSchema(int32_t dictionary_id, const ArrowSchema* values_schema) {
-    if (!HasDictionaryForId(dictionary_id)) {
-      dictionaries_[dictionary_id] = internal::Dictionary();
-      NANOARROW_RETURN_NOT_OK(
-          ArrowSchemaDeepCopy(values_schema, dictionaries_[dictionary_id].schema.get()));
-    }
-
-    dictionary_ids_[values_schema] = dictionary_id;
-    return NANOARROW_OK;
-  }
-
-  ArrowErrorCode RecordSchema(const ArrowSchema* values_schema, int32_t* dictionary_id) {
-    while (HasDictionaryForId(next_id_)) {
-      next_id_++;
-    }
-
-    NANOARROW_RETURN_NOT_OK(RecordSchema(next_id_, values_schema));
-    *dictionary_id = next_id_++;
-    return NANOARROW_OK;
-  }
-
-  void RecordArray(int32_t dictionary_id, int64_t length, std::string column_json) {
-    dictionaries_[dictionary_id].column_length = length;
-    dictionaries_[dictionary_id].column_json = std::move(column_json);
-  }
-
-  void RecordArray(const ArrowSchema* values_schema, int64_t length,
-                   std::string column_json) {
-    auto ids_it = dictionary_ids_.find(values_schema);
-    RecordArray(ids_it->second, length, column_json);
-  }
-
-  bool empty() { return dictionaries_.empty(); }
-
-  void clear() {
-    dictionaries_.clear();
-    dictionary_ids_.clear();
-    next_id_ = 0;
-  }
-
-  bool HasDictionaryForSchema(const ArrowSchema* values_schema) const {
-    return dictionary_ids_.find(values_schema) != dictionary_ids_.end();
-  }
-
-  bool HasDictionaryForId(int32_t dictionary_id) const {
-    return dictionaries_.find(dictionary_id) != dictionaries_.end();
-  }
-
-  const Dictionary& Get(int32_t dictionary_id) const {
-    auto dict_it = dictionaries_.find(dictionary_id);
-    return dict_it->second;
-  }
-
-  const Dictionary& Get(const ArrowSchema* values_schema) const {
-    auto ids_it = dictionary_ids_.find(values_schema);
-    return Get(ids_it->second);
-  }
-
-  const std::vector<int32_t> GetAllIds() const {
-    std::vector<int32_t> out;
-    out.reserve(dictionaries_.size());
-    for (const auto& value : dictionaries_) {
-      out.push_back(value.first);
-    }
-    return out;
-  }
-
- private:
-  int32_t next_id_;
-  std::unordered_map<int32_t, Dictionary> dictionaries_;
-  std::unordered_map<const ArrowSchema*, int32_t> dictionary_ids_;
-};
-
-}  // namespace internal
+class DictionaryContext;
+}
 
 /// \defgroup nanoarrow_testing-json Integration test helpers
 ///
@@ -136,7 +48,8 @@ class DictionaryContext {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
-  TestingJSONWriter() : float_precision_(-1), include_metadata_(true) {}
+  TestingJSONWriter();
+  ~TestingJSONWriter();
 
   /// \brief Set the floating point precision of the writer
   ///
@@ -151,7 +64,7 @@ class TestingJSONWriter {
   /// Use false to skip writing schema/field metadata in the output.
   void set_include_metadata(bool value) { include_metadata_ = value; }
 
-  void ResetDictionaries() { dictionaries_.clear(); }
+  void ResetDictionaries();
 
   /// \brief Write an ArrowArrayStream as a data file JSON object to out
   ///
@@ -195,7 +108,7 @@ class TestingJSONWriter {
  private:
   int float_precision_;
   bool include_metadata_;
-  internal::DictionaryContext dictionaries_;
+  std::unique_ptr<internal::DictionaryContext> dictionaries_;
 
   bool ShouldWriteMetadata(const char* metadata) {
     return metadata != nullptr && include_metadata_;
@@ -216,8 +129,9 @@ class TestingJSONWriter {
 /// \brief Reader for the Arrow integration testing JSON format
 class TestingJSONReader {
  public:
-  TestingJSONReader(ArrowBufferAllocator allocator) : allocator_(allocator) {}
-  TestingJSONReader() : TestingJSONReader(ArrowBufferAllocatorDefault()) {}
+  TestingJSONReader(ArrowBufferAllocator allocator);
+  TestingJSONReader();
+  ~TestingJSONReader();
 
   static const int kNumBatchOnlySchema = -2;
   static const int kNumBatchReadAll = -1;
@@ -261,7 +175,7 @@ class TestingJSONReader {
 
  private:
   ArrowBufferAllocator allocator_;
-  internal::DictionaryContext dictionaries_;
+  std::unique_ptr<internal::DictionaryContext> dictionaries_;
 
   void SetArrayAllocatorRecursive(ArrowArray* array);
 };
@@ -319,14 +233,7 @@ class TestingJSONComparison {
   int64_t num_differences() const { return differences_.size(); }
 
   /// \brief Dump a human-readable summary of differences to out
-  void WriteDifferences(std::ostream& out) {
-    for (const auto& difference : differences_) {
-      out << "Path: " << difference.path << "\n";
-      out << "- " << difference.actual << "\n";
-      out << "+ " << difference.expected << "\n";
-      out << "\n";
-    }
-  }
+  void WriteDifferences(std::ostream& out);
 
   /// \brief Clear any existing differences
   void ClearDifferences() { differences_.clear(); }
@@ -342,175 +249,24 @@ class TestingJSONComparison {
   /// Returns NANOARROW_OK if the comparison ran without error. Callers must
   /// query num_differences() to obtain the result of the comparison on success.
   ArrowErrorCode CompareArrayStream(ArrowArrayStream* actual, ArrowArrayStream* expected,
-                                    ArrowError* error = nullptr) {
-    // Read both schemas
-    nanoarrow::UniqueSchema actual_schema;
-    nanoarrow::UniqueSchema expected_schema;
-    NANOARROW_RETURN_NOT_OK(
-        ArrowArrayStreamGetSchema(actual, actual_schema.get(), error));
-    NANOARROW_RETURN_NOT_OK(
-        ArrowArrayStreamGetSchema(expected, expected_schema.get(), error));
-
-    // Compare them and return if they are not equal
-    NANOARROW_RETURN_NOT_OK(
-        CompareSchema(expected_schema.get(), actual_schema.get(), error, "Schema"));
-    if (num_differences() > 0) {
-      return NANOARROW_OK;
-    }
-
-    // Keep a record of the schema to compare batches
-    NANOARROW_RETURN_NOT_OK(SetSchema(expected_schema.get(), error));
-
-    int64_t n_batches = -1;
-    nanoarrow::UniqueArray actual_array;
-    nanoarrow::UniqueArray expected_array;
-    do {
-      n_batches++;
-      std::string batch_label = std::string("Batch ") + std::to_string(n_batches);
-
-      // Read a batch from each stream
-      actual_array.reset();
-      expected_array.reset();
-      NANOARROW_RETURN_NOT_OK(ArrowArrayStreamGetNext(actual, actual_array.get(), error));
-      NANOARROW_RETURN_NOT_OK(
-          ArrowArrayStreamGetNext(expected, expected_array.get(), error));
-
-      // Check the finished/unfinished status of both streams
-      if (actual_array->release == nullptr && expected_array->release != nullptr) {
-        differences_.push_back({batch_label, "finished stream", "unfinished stream"});
-        return NANOARROW_OK;
-      }
-
-      if (actual_array->release != nullptr && expected_array->release == nullptr) {
-        differences_.push_back({batch_label, "unfinished stream", "finished stream"});
-        return NANOARROW_OK;
-      }
-
-      // If both streams are done, break
-      if (actual_array->release == nullptr) {
-        break;
-      }
-
-      // Compare this batch
-      NANOARROW_RETURN_NOT_OK(
-          CompareBatch(actual_array.get(), expected_array.get(), error, batch_label));
-    } while (true);
-
-    return NANOARROW_OK;
-  }
+                                    ArrowError* error = nullptr);
 
   /// \brief Compare a top-level ArrowSchema struct
   ///
   /// Returns NANOARROW_OK if the comparison ran without error. Callers must
   /// query num_differences() to obtain the result of the comparison on success.
   ArrowErrorCode CompareSchema(const ArrowSchema* actual, const ArrowSchema* expected,
-                               ArrowError* error = nullptr,
-                               const std::string& path = "") {
-    writer_actual_.ResetDictionaries();
-    writer_expected_.ResetDictionaries();
-
-    // Compare the top-level schema "manually" because (1) map type needs special-cased
-    // comparison and (2) it's easier to read the output if differences are separated
-    // by field.
-    ArrowSchemaView actual_view;
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaViewInit(&actual_view, actual, nullptr),
-                                       error);
-
-    ArrowSchemaView expected_view;
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(
-        ArrowSchemaViewInit(&expected_view, expected, nullptr), error);
-
-    if (actual_view.type != NANOARROW_TYPE_STRUCT ||
-        expected_view.type != NANOARROW_TYPE_STRUCT) {
-      ArrowErrorSet(error, "Top-level schema must be struct");
-      return EINVAL;
-    }
-
-    // (Purposefully ignore the name field at the top level)
-
-    // Compare flags
-    if (compare_batch_flags_ && actual->flags != expected->flags) {
-      differences_.push_back({path,
-                              std::string(".flags: ") + std::to_string(actual->flags),
-                              std::string(".flags: ") + std::to_string(expected->flags)});
-    }
-
-    // Compare children
-    if (actual->n_children != expected->n_children) {
-      differences_.push_back(
-          {path, std::string(".n_children: ") + std::to_string(actual->n_children),
-           std::string(".n_children: ") + std::to_string(expected->n_children)});
-    } else {
-      for (int64_t i = 0; i < expected->n_children; i++) {
-        NANOARROW_RETURN_NOT_OK(CompareField(
-            actual->children[i], expected->children[i], error,
-            path + std::string(".children[") + std::to_string(i) + std::string("]")));
-      }
-    }
-
-    // Compare metadata
-    NANOARROW_RETURN_NOT_OK(CompareMetadata(actual->metadata, expected->metadata, error,
-                                            path + std::string(".metadata")));
-
-    return NANOARROW_OK;
-  }
+                               ArrowError* error = nullptr, const std::string& path = "");
 
   /// \brief Set the ArrowSchema to be used to for future calls to CompareBatch().
-  ArrowErrorCode SetSchema(const ArrowSchema* schema, ArrowError* error = nullptr) {
-    schema_.reset();
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(schema, schema_.get()), error);
-    actual_.reset();
-    expected_.reset();
-
-    NANOARROW_RETURN_NOT_OK(
-        ArrowArrayViewInitFromSchema(actual_.get(), schema_.get(), error));
-    NANOARROW_RETURN_NOT_OK(
-        ArrowArrayViewInitFromSchema(expected_.get(), schema_.get(), error));
-
-    if (actual_->storage_type != NANOARROW_TYPE_STRUCT) {
-      ArrowErrorSet(error, "Can't SetSchema() with non-struct");
-      return EINVAL;
-    }
-
-    // "Write" the schema using both writers to ensure dictionary ids can be resolved
-    // using the ArrowSchema* pointers from schema_
-    std::stringstream ss;
-    writer_actual_.ResetDictionaries();
-    writer_expected_.ResetDictionaries();
-    writer_actual_.WriteSchema(ss, schema_.get());
-    writer_expected_.WriteSchema(ss, schema_.get());
-
-    return NANOARROW_OK;
-  }
+  ArrowErrorCode SetSchema(const ArrowSchema* schema, ArrowError* error = nullptr);
 
   /// \brief Compare a top-level ArrowArray struct
   ///
   /// Returns NANOARROW_OK if the comparison ran without error. Callers must
   /// query num_differences() to obtain the result of the comparison on success.
   ArrowErrorCode CompareBatch(const ArrowArray* actual, const ArrowArray* expected,
-                              ArrowError* error = nullptr, const std::string& path = "") {
-    NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(expected_.get(), expected, error));
-    NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(actual_.get(), actual, error));
-
-    if (actual->offset != expected->offset) {
-      differences_.push_back({path, ".offset: " + std::to_string(actual->offset),
-                              ".offset: " + std::to_string(expected->offset)});
-    }
-
-    if (actual->length != expected->length) {
-      differences_.push_back({path, ".length: " + std::to_string(actual->length),
-                              ".length: " + std::to_string(expected->length)});
-    }
-
-    // ArrowArrayViewSetArray() ensured that number of children of both match schema
-    for (int64_t i = 0; i < expected_->n_children; i++) {
-      NANOARROW_RETURN_NOT_OK(CompareColumn(
-          schema_->children[i], actual_->children[i], expected_->children[i], error,
-          path + std::string(".children[") + std::to_string(i) + "]"));
-    }
-
-    return NANOARROW_OK;
-  }
+                              ArrowError* error = nullptr, const std::string& path = "");
 
  private:
   TestingJSONWriter writer_actual_;
@@ -525,183 +281,20 @@ class TestingJSONComparison {
   bool compare_metadata_order_;
 
   ArrowErrorCode CompareField(ArrowSchema* actual, ArrowSchema* expected,
-                              ArrowError* error, const std::string& path = "") {
-    // Preprocess both fields such that map types have canonical names
-    nanoarrow::UniqueSchema actual_copy;
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(actual, actual_copy.get()),
-                                       error);
-    nanoarrow::UniqueSchema expected_copy;
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(expected, expected_copy.get()),
-                                       error);
-
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ForceMapNamesCanonical(actual_copy.get()), error);
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ForceMapNamesCanonical(expected_copy.get()),
-                                       error);
-    return CompareFieldBase(actual_copy.get(), expected_copy.get(), error, path);
-  }
+                              ArrowError* error, const std::string& path = "");
 
   ArrowErrorCode CompareFieldBase(ArrowSchema* actual, ArrowSchema* expected,
-                                  ArrowError* error, const std::string& path = "") {
-    std::stringstream ss;
-
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteField(ss, expected), error);
-    std::string expected_json = ss.str();
-
-    ss.str("");
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteField(ss, actual), error);
-    std::string actual_json = ss.str();
-
-    if (actual_json != expected_json) {
-      differences_.push_back({path, actual_json, expected_json});
-    }
-
-    NANOARROW_RETURN_NOT_OK(CompareMetadata(actual->metadata, expected->metadata, error,
-                                            path + std::string(".metadata")));
-    return NANOARROW_OK;
-  }
+                                  ArrowError* error, const std::string& path = "");
 
   ArrowErrorCode CompareMetadata(const char* actual, const char* expected,
-                                 ArrowError* error, const std::string& path = "") {
-    std::stringstream ss;
-
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteMetadata(ss, actual), error);
-    std::string actual_json = ss.str();
-
-    ss.str("");
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteMetadata(ss, expected),
-                                       error);
-    std::string expected_json = ss.str();
-
-    bool metadata_equal = actual_json == expected_json;
-
-    // If there is a difference in the rendered JSON but we aren't being strict about
-    // order, check again using the KeyValue comparison.
-    if (!metadata_equal && !compare_metadata_order_) {
-      NANOARROW_RETURN_NOT_OK(
-          MetadataEqualKeyValue(actual, expected, &metadata_equal, error));
-    }
-
-    // If we still have an inequality, add a difference.
-    if (!metadata_equal) {
-      differences_.push_back({path, actual_json, expected_json});
-    }
-
-    return NANOARROW_OK;
-  }
+                                 ArrowError* error, const std::string& path = "");
 
   ArrowErrorCode MetadataEqualKeyValue(const char* actual, const char* expected,
-                                       bool* out, ArrowError* error) {
-    std::unordered_map<std::string, std::string> actual_map, expected_map;
-    NANOARROW_RETURN_NOT_OK(MetadataToMap(actual, &actual_map, error));
-    NANOARROW_RETURN_NOT_OK(MetadataToMap(expected, &expected_map, error));
-
-    if (actual_map.size() != expected_map.size()) {
-      *out = false;
-      return NANOARROW_OK;
-    }
-
-    for (const auto& item : expected_map) {
-      const auto& actual_item = actual_map.find(item.first);
-      if (actual_item == actual_map.end()) {
-        *out = false;
-        return NANOARROW_OK;
-      }
-
-      if (actual_item->second != item.second) {
-        *out = false;
-        return NANOARROW_OK;
-      }
-    }
-
-    *out = true;
-    return NANOARROW_OK;
-  }
-
-  ArrowErrorCode MetadataToMap(const char* metadata,
-                               std::unordered_map<std::string, std::string>* out,
-                               ArrowError* error) {
-    ArrowMetadataReader reader;
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderInit(&reader, metadata), error);
-
-    ArrowStringView key, value;
-    size_t metadata_num_keys = 0;
-    while (reader.remaining_keys > 0) {
-      NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderRead(&reader, &key, &value),
-                                         error);
-      out->insert({std::string(key.data, key.size_bytes),
-                   std::string(value.data, value.size_bytes)});
-      metadata_num_keys++;
-    }
-
-    if (metadata_num_keys != out->size()) {
-      ArrowErrorSet(error,
-                    "Comparison of metadata containing duplicate keys without "
-                    "considering order is not implemented");
-      return ENOTSUP;
-    }
-
-    return NANOARROW_OK;
-  }
+                                       bool* out, ArrowError* error);
 
   ArrowErrorCode CompareColumn(ArrowSchema* schema, ArrowArrayView* actual,
                                ArrowArrayView* expected, ArrowError* error,
-                               const std::string& path = "") {
-    // Compare children and dictionaries first, then higher-level structures after.
-    // This is a redundant because the higher-level serialized JSON will also report
-    // a difference if deeply nested children have differences; however, it will not
-    // contain dictionaries and this output is slightly better (more targeted differences
-    // that are slightly easier to read appear first).
-    for (int64_t i = 0; i < schema->n_children; i++) {
-      NANOARROW_RETURN_NOT_OK(
-          CompareColumn(schema->children[i], actual->children[i], expected->children[i],
-                        error, path + ".children[" + std::to_string(i) + "]"));
-    }
-
-    if (schema->dictionary != nullptr) {
-      NANOARROW_RETURN_NOT_OK(CompareColumn(schema->dictionary, actual->dictionary,
-                                            expected->dictionary, error,
-                                            path + ".dictionary"));
-    }
-
-    std::stringstream ss;
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteColumn(ss, schema, expected),
-                                       error);
-    std::string expected_json = ss.str();
-
-    ss.str("");
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteColumn(ss, schema, actual),
-                                       error);
-    std::string actual_json = ss.str();
-
-    if (actual_json != expected_json) {
-      differences_.push_back({path, actual_json, expected_json});
-    }
-
-    return NANOARROW_OK;
-  }
-
-  ArrowErrorCode ForceMapNamesCanonical(ArrowSchema* schema) {
-    ArrowSchemaView view;
-    NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
-
-    if (view.type == NANOARROW_TYPE_MAP) {
-      NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "entries"));
-      NANOARROW_RETURN_NOT_OK(
-          ArrowSchemaSetName(schema->children[0]->children[0], "key"));
-      NANOARROW_RETURN_NOT_OK(
-          ArrowSchemaSetName(schema->children[0]->children[1], "value"));
-    }
-
-    for (int64_t i = 0; i < schema->n_children; i++) {
-      NANOARROW_RETURN_NOT_OK(ForceMapNamesCanonical(schema->children[i]));
-    }
-
-    if (schema->dictionary != nullptr) {
-      NANOARROW_RETURN_NOT_OK(ForceMapNamesCanonical(schema->dictionary));
-    }
-
-    return NANOARROW_OK;
-  }
+                               const std::string& path = "");
 };
 
 /// @}
diff --git a/src/nanoarrow/testing/testing.cc b/src/nanoarrow/testing/testing.cc
index 59f3617c..aba4b11f 100644
--- a/src/nanoarrow/testing/testing.cc
+++ b/src/nanoarrow/testing/testing.cc
@@ -15,6 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <algorithm>
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <unordered_map>
+#include <vector>
+
 #include "nlohmann/json.hpp"
 
 #include "nanoarrow/nanoarrow_testing.hpp"
@@ -23,6 +30,94 @@ namespace nanoarrow {
 
 namespace testing {
 
+namespace internal {
+// Internal representation of the various structures needed to import and/or export
+// a dictionary array. We use a serialized version of the dictionary value because
+// nanoarrow doesn't currently have the ability to copy or reference count an Array.
+struct Dictionary {
+  nanoarrow::UniqueSchema schema;
+  int64_t column_length;
+  std::string column_json;
+};
+
+class DictionaryContext {
+ public:
+  DictionaryContext() : next_id_(0) {}
+
+  ArrowErrorCode RecordSchema(int32_t dictionary_id, const ArrowSchema* values_schema) {
+    if (!HasDictionaryForId(dictionary_id)) {
+      dictionaries_[dictionary_id] = Dictionary();
+      NANOARROW_RETURN_NOT_OK(
+          ArrowSchemaDeepCopy(values_schema, dictionaries_[dictionary_id].schema.get()));
+    }
+
+    dictionary_ids_[values_schema] = dictionary_id;
+    return NANOARROW_OK;
+  }
+
+  ArrowErrorCode RecordSchema(const ArrowSchema* values_schema, int32_t* dictionary_id) {
+    while (HasDictionaryForId(next_id_)) {
+      next_id_++;
+    }
+
+    NANOARROW_RETURN_NOT_OK(RecordSchema(next_id_, values_schema));
+    *dictionary_id = next_id_++;
+    return NANOARROW_OK;
+  }
+
+  void RecordArray(int32_t dictionary_id, int64_t length, std::string column_json) {
+    dictionaries_[dictionary_id].column_length = length;
+    dictionaries_[dictionary_id].column_json = std::move(column_json);
+  }
+
+  void RecordArray(const ArrowSchema* values_schema, int64_t length,
+                   std::string column_json) {
+    auto ids_it = dictionary_ids_.find(values_schema);
+    RecordArray(ids_it->second, length, column_json);
+  }
+
+  bool empty() { return dictionaries_.empty(); }
+
+  void clear() {
+    dictionaries_.clear();
+    dictionary_ids_.clear();
+    next_id_ = 0;
+  }
+
+  bool HasDictionaryForSchema(const ArrowSchema* values_schema) const {
+    return dictionary_ids_.find(values_schema) != dictionary_ids_.end();
+  }
+
+  bool HasDictionaryForId(int32_t dictionary_id) const {
+    return dictionaries_.find(dictionary_id) != dictionaries_.end();
+  }
+
+  const Dictionary& Get(int32_t dictionary_id) const {
+    auto dict_it = dictionaries_.find(dictionary_id);
+    return dict_it->second;
+  }
+
+  const Dictionary& Get(const ArrowSchema* values_schema) const {
+    auto ids_it = dictionary_ids_.find(values_schema);
+    return Get(ids_it->second);
+  }
+
+  const std::vector<int32_t> GetAllIds() const {
+    std::vector<int32_t> out;
+    out.reserve(dictionaries_.size());
+    for (const auto& value : dictionaries_) {
+      out.push_back(value.first);
+    }
+    return out;
+  }
+
+ private:
+  int32_t next_id_;
+  std::unordered_map<int32_t, Dictionary> dictionaries_;
+  std::unordered_map<const ArrowSchema*, int32_t> dictionary_ids_;
+};
+}  // namespace internal
+
 namespace writer_internal {
 
 namespace {
@@ -511,6 +606,15 @@ ArrowErrorCode WriteMetadataItem(std::ostream& out, ArrowMetadataReader* reader)
 
 }  // namespace writer_internal
 
+TestingJSONWriter::TestingJSONWriter()
+    : float_precision_(-1),
+      include_metadata_(true),
+      dictionaries_(new internal::DictionaryContext()) {}
+
+TestingJSONWriter::~TestingJSONWriter() = default;
+
+void TestingJSONWriter::ResetDictionaries() { dictionaries_->clear(); }
+
 ArrowErrorCode TestingJSONWriter::WriteDataFile(std::ostream& out,
                                                 ArrowArrayStream* stream) {
   if (stream == nullptr || stream->release == nullptr) {
@@ -550,7 +654,7 @@ ArrowErrorCode TestingJSONWriter::WriteDataFile(std::ostream& out,
 
   out << "]";
 
-  if (!dictionaries_.empty()) {
+  if (!dictionaries_->empty()) {
     out << R"(, "dictionaries": )";
     NANOARROW_RETURN_NOT_OK(WriteDictionaryBatches(out));
   }
@@ -627,7 +731,7 @@ ArrowErrorCode TestingJSONWriter::WriteField(std::ostream& out,
 
     int32_t dictionary_id;
     NANOARROW_RETURN_NOT_OK(
-        dictionaries_.RecordSchema(field->dictionary, &dictionary_id));
+        dictionaries_->RecordSchema(field->dictionary, &dictionary_id));
 
     out << R"(, "dictionary": )";
     view.type = view.storage_type;
@@ -804,22 +908,22 @@ ArrowErrorCode TestingJSONWriter::WriteColumn(std::ostream& out, const ArrowSche
 
   // Write the dictionary values to the DictionaryContext for later if applicable
   if (field->dictionary != nullptr) {
-    if (!dictionaries_.HasDictionaryForSchema(field->dictionary)) {
+    if (!dictionaries_->HasDictionaryForSchema(field->dictionary)) {
       return EINVAL;
     }
 
     std::stringstream dictionary_output;
     NANOARROW_RETURN_NOT_OK(
         WriteColumn(dictionary_output, field->dictionary, value->dictionary));
-    dictionaries_.RecordArray(field->dictionary, value->dictionary->length,
-                              dictionary_output.str());
+    dictionaries_->RecordArray(field->dictionary, value->dictionary->length,
+                               dictionary_output.str());
   }
 
   return NANOARROW_OK;
 }
 
 ArrowErrorCode TestingJSONWriter::WriteDictionaryBatches(std::ostream& out) {
-  std::vector<int32_t> ids = dictionaries_.GetAllIds();
+  std::vector<int32_t> ids = dictionaries_->GetAllIds();
   if (ids.empty()) {
     out << "[]";
     return NANOARROW_OK;
@@ -839,7 +943,7 @@ ArrowErrorCode TestingJSONWriter::WriteDictionaryBatches(std::ostream& out) {
 
 ArrowErrorCode TestingJSONWriter::WriteDictionaryBatch(std::ostream& out,
                                                        int32_t dictionary_id) {
-  const internal::Dictionary& dict = dictionaries_.Get(dictionary_id);
+  const internal::Dictionary& dict = dictionaries_->Get(dictionary_id);
   out << R"({"id": )" << dictionary_id << R"(, "data": {"count": )" << dict.column_length
       << R"(, "columns": [)" << dict.column_json << "]}}";
   return NANOARROW_OK;
@@ -2136,10 +2240,18 @@ ArrowErrorCode RecordDictionaryBatches(const json& value,
 
 }  // namespace reader_internal
 
+TestingJSONReader::TestingJSONReader(ArrowBufferAllocator allocator)
+    : allocator_(allocator), dictionaries_(new internal::DictionaryContext()) {}
+
+TestingJSONReader::TestingJSONReader()
+    : TestingJSONReader(ArrowBufferAllocatorDefault()) {}
+
+TestingJSONReader::~TestingJSONReader() = default;
+
 ArrowErrorCode TestingJSONReader::ReadDataFile(const std::string& data_file_json,
                                                ArrowArrayStream* out, int num_batch,
                                                ArrowError* error) {
-  dictionaries_.clear();
+  dictionaries_->clear();
 
   try {
     auto obj = nlohmann::json::parse(data_file_json);
@@ -2151,7 +2263,7 @@ ArrowErrorCode TestingJSONReader::ReadDataFile(const 
std::string& data_file_json
     // Read Schema
     nanoarrow::UniqueSchema schema;
     NANOARROW_RETURN_NOT_OK(
-        reader_internal::SetSchema(schema.get(), obj["schema"], dictionaries_, 
error));
+        reader_internal::SetSchema(schema.get(), obj["schema"], 
*dictionaries_, error));
 
     NANOARROW_RETURN_NOT_OK(reader_internal::Check(obj.contains("batches"), 
error,
                                                    "data file missing key 
'batches'"));
@@ -2167,7 +2279,7 @@ ArrowErrorCode TestingJSONReader::ReadDataFile(const 
std::string& data_file_json
     // Record any dictionaries that might be present
     if (obj.contains("dictionaries")) {
       NANOARROW_RETURN_NOT_OK(reader_internal::RecordDictionaryBatches(
-          obj["dictionaries"], dictionaries_, error));
+          obj["dictionaries"], *dictionaries_, error));
     }
 
     // Get a vector of batch ids to parse
@@ -2198,7 +2310,7 @@ ArrowErrorCode TestingJSONReader::ReadDataFile(const 
std::string& data_file_json
       SetArrayAllocatorRecursive(array.get());
       NANOARROW_RETURN_NOT_OK(reader_internal::SetArrayBatch(
           batches[batch_ids[i]], schema.get(), array_view.get(), array.get(),
-          dictionaries_, error));
+          *dictionaries_, error));
       ArrowBasicArrayStreamSetArray(stream.get(), i, array.get());
     }
 
@@ -2209,6 +2321,7 @@ ArrowErrorCode TestingJSONReader::ReadDataFile(const 
std::string& data_file_json
     return EINVAL;
   }
 }
+
 ArrowErrorCode TestingJSONReader::ReadSchema(const std::string& schema_json,
                                              ArrowSchema* out, ArrowError* 
error) {
   try {
@@ -2216,7 +2329,7 @@ ArrowErrorCode TestingJSONReader::ReadSchema(const 
std::string& schema_json,
     nanoarrow::UniqueSchema schema;
 
     NANOARROW_RETURN_NOT_OK(
-        reader_internal::SetSchema(schema.get(), obj, dictionaries_, error));
+        reader_internal::SetSchema(schema.get(), obj, *dictionaries_, error));
     ArrowSchemaMove(schema.get(), out);
     return NANOARROW_OK;
   } catch (nlohmann::json::exception& e) {
@@ -2224,6 +2337,7 @@ ArrowErrorCode TestingJSONReader::ReadSchema(const 
std::string& schema_json,
     return EINVAL;
   }
 }
+
 ArrowErrorCode TestingJSONReader::ReadField(const std::string& field_json,
                                             ArrowSchema* out, ArrowError* 
error) {
   try {
@@ -2231,7 +2345,7 @@ ArrowErrorCode TestingJSONReader::ReadField(const 
std::string& field_json,
     nanoarrow::UniqueSchema schema;
 
     NANOARROW_RETURN_NOT_OK(
-        reader_internal::SetField(schema.get(), obj, dictionaries_, error));
+        reader_internal::SetField(schema.get(), obj, *dictionaries_, error));
     ArrowSchemaMove(schema.get(), out);
     return NANOARROW_OK;
   } catch (nlohmann::json::exception& e) {
@@ -2257,7 +2371,7 @@ ArrowErrorCode TestingJSONReader::ReadBatch(const 
std::string& batch_json,
     SetArrayAllocatorRecursive(array.get());
 
     NANOARROW_RETURN_NOT_OK(reader_internal::SetArrayBatch(
-        obj, schema, array_view.get(), array.get(), dictionaries_, error));
+        obj, schema, array_view.get(), array.get(), *dictionaries_, error));
     ArrowArrayMove(array.get(), out);
     return NANOARROW_OK;
   } catch (nlohmann::json::exception& e) {
@@ -2265,6 +2379,7 @@ ArrowErrorCode TestingJSONReader::ReadBatch(const 
std::string& batch_json,
     return EINVAL;
   }
 }
+
 ArrowErrorCode TestingJSONReader::ReadColumn(const std::string& column_json,
                                              const ArrowSchema* schema, 
ArrowArray* out,
                                              ArrowError* error) {
@@ -2283,7 +2398,7 @@ ArrowErrorCode TestingJSONReader::ReadColumn(const 
std::string& column_json,
 
     // Parse the JSON into the array
     NANOARROW_RETURN_NOT_OK(reader_internal::SetArrayColumn(
-        obj, schema, array_view.get(), array.get(), dictionaries_, error));
+        obj, schema, array_view.get(), array.get(), *dictionaries_, error));
 
     // Return the result
     ArrowArrayMove(array.get(), out);
@@ -2308,5 +2423,367 @@ void 
TestingJSONReader::SetArrayAllocatorRecursive(ArrowArray* array) {
   }
 }
 
+namespace {
+
+ArrowErrorCode MetadataToMap(const char* metadata,
+                             std::unordered_map<std::string, std::string>* out,
+                             ArrowError* error) {
+  ArrowMetadataReader reader;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderInit(&reader, 
metadata), error);
+
+  ArrowStringView key, value;
+  size_t metadata_num_keys = 0;
+  while (reader.remaining_keys > 0) {
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderRead(&reader, &key, 
&value),
+                                       error);
+    out->insert({std::string(key.data, key.size_bytes),
+                 std::string(value.data, value.size_bytes)});
+    metadata_num_keys++;
+  }
+
+  if (metadata_num_keys != out->size()) {
+    ArrowErrorSet(error,
+                  "Comparison of metadata containing duplicate keys without "
+                  "considering order is not implemented");
+    return ENOTSUP;
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ForceMapNamesCanonical(ArrowSchema* schema) {
+  ArrowSchemaView view;
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
+
+  if (view.type == NANOARROW_TYPE_MAP) {
+    NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], 
"entries"));
+    
NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0]->children[0], 
"key"));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowSchemaSetName(schema->children[0]->children[1], "value"));
+  }
+
+  for (int64_t i = 0; i < schema->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(ForceMapNamesCanonical(schema->children[i]));
+  }
+
+  if (schema->dictionary != nullptr) {
+    NANOARROW_RETURN_NOT_OK(ForceMapNamesCanonical(schema->dictionary));
+  }
+
+  return NANOARROW_OK;
+}
+
+}  // namespace
+
+void TestingJSONComparison::WriteDifferences(std::ostream& out) {
+  for (const auto& difference : differences_) {
+    out << "Path: " << difference.path << "\n";
+    out << "- " << difference.actual << "\n";
+    out << "+ " << difference.expected << "\n";
+    out << "\n";
+  }
+}
+
+ArrowErrorCode TestingJSONComparison::CompareArrayStream(ArrowArrayStream* 
actual,
+                                                         ArrowArrayStream* 
expected,
+                                                         ArrowError* error) {
+  // Read both schemas
+  nanoarrow::UniqueSchema actual_schema;
+  nanoarrow::UniqueSchema expected_schema;
+  NANOARROW_RETURN_NOT_OK(ArrowArrayStreamGetSchema(actual, 
actual_schema.get(), error));
+  NANOARROW_RETURN_NOT_OK(
+      ArrowArrayStreamGetSchema(expected, expected_schema.get(), error));
+
+  // Compare them and return if they are not equal
+  NANOARROW_RETURN_NOT_OK(
+      CompareSchema(expected_schema.get(), actual_schema.get(), error, 
"Schema"));
+  if (num_differences() > 0) {
+    return NANOARROW_OK;
+  }
+
+  // Keep a record of the schema to compare batches
+  NANOARROW_RETURN_NOT_OK(SetSchema(expected_schema.get(), error));
+
+  int64_t n_batches = -1;
+  nanoarrow::UniqueArray actual_array;
+  nanoarrow::UniqueArray expected_array;
+  do {
+    n_batches++;
+    std::string batch_label = std::string("Batch ") + 
std::to_string(n_batches);
+
+    // Read a batch from each stream
+    actual_array.reset();
+    expected_array.reset();
+    NANOARROW_RETURN_NOT_OK(ArrowArrayStreamGetNext(actual, 
actual_array.get(), error));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayStreamGetNext(expected, expected_array.get(), error));
+
+    // Check the finished/unfinished status of both streams
+    if (actual_array->release == nullptr && expected_array->release != 
nullptr) {
+      differences_.push_back({batch_label, "finished stream", "unfinished 
stream"});
+      return NANOARROW_OK;
+    }
+
+    if (actual_array->release != nullptr && expected_array->release == 
nullptr) {
+      differences_.push_back({batch_label, "unfinished stream", "finished 
stream"});
+      return NANOARROW_OK;
+    }
+
+    // If both streams are done, break
+    if (actual_array->release == nullptr) {
+      break;
+    }
+
+    // Compare this batch
+    NANOARROW_RETURN_NOT_OK(
+        CompareBatch(actual_array.get(), expected_array.get(), error, 
batch_label));
+  } while (true);
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode TestingJSONComparison::CompareSchema(const ArrowSchema* actual,
+                                                    const ArrowSchema* 
expected,
+                                                    ArrowError* error,
+                                                    const std::string& path) {
+  writer_actual_.ResetDictionaries();
+  writer_expected_.ResetDictionaries();
+
+  // Compare the top-level schema "manually" because (1) map type needs 
special-cased
+  // comparison and (2) it's easier to read the output if differences are 
separated
+  // by field.
+  ArrowSchemaView actual_view;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaViewInit(&actual_view, actual, 
nullptr),
+                                     error);
+
+  ArrowSchemaView expected_view;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowSchemaViewInit(&expected_view, expected, nullptr), error);
+
+  if (actual_view.type != NANOARROW_TYPE_STRUCT ||
+      expected_view.type != NANOARROW_TYPE_STRUCT) {
+    ArrowErrorSet(error, "Top-level schema must be struct");
+    return EINVAL;
+  }
+
+  // (Purposefully ignore the name field at the top level)
+
+  // Compare flags
+  if (compare_batch_flags_ && actual->flags != expected->flags) {
+    differences_.push_back({path, std::string(".flags: ") + 
std::to_string(actual->flags),
+                            std::string(".flags: ") + 
std::to_string(expected->flags)});
+  }
+
+  // Compare children
+  if (actual->n_children != expected->n_children) {
+    differences_.push_back(
+        {path, std::string(".n_children: ") + 
std::to_string(actual->n_children),
+         std::string(".n_children: ") + std::to_string(expected->n_children)});
+  } else {
+    for (int64_t i = 0; i < expected->n_children; i++) {
+      NANOARROW_RETURN_NOT_OK(CompareField(
+          actual->children[i], expected->children[i], error,
+          path + std::string(".children[") + std::to_string(i) + 
std::string("]")));
+    }
+  }
+
+  // Compare metadata
+  NANOARROW_RETURN_NOT_OK(CompareMetadata(actual->metadata, 
expected->metadata, error,
+                                          path + std::string(".metadata")));
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode TestingJSONComparison::SetSchema(const ArrowSchema* schema,
+                                                ArrowError* error) {
+  schema_.reset();
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(schema, 
schema_.get()), error);
+  actual_.reset();
+  expected_.reset();
+
+  NANOARROW_RETURN_NOT_OK(
+      ArrowArrayViewInitFromSchema(actual_.get(), schema_.get(), error));
+  NANOARROW_RETURN_NOT_OK(
+      ArrowArrayViewInitFromSchema(expected_.get(), schema_.get(), error));
+
+  if (actual_->storage_type != NANOARROW_TYPE_STRUCT) {
+    ArrowErrorSet(error, "Can't SetSchema() with non-struct");
+    return EINVAL;
+  }
+
+  // "Write" the schema using both writers to ensure dictionary ids can be 
resolved
+  // using the ArrowSchema* pointers from schema_
+  std::stringstream ss;
+  writer_actual_.ResetDictionaries();
+  writer_expected_.ResetDictionaries();
+  writer_actual_.WriteSchema(ss, schema_.get());
+  writer_expected_.WriteSchema(ss, schema_.get());
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode TestingJSONComparison::CompareBatch(const ArrowArray* actual,
+                                                   const ArrowArray* expected,
+                                                   ArrowError* error,
+                                                   const std::string& path) {
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(expected_.get(), expected, 
error));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(actual_.get(), actual, 
error));
+
+  if (actual->offset != expected->offset) {
+    differences_.push_back({path, ".offset: " + std::to_string(actual->offset),
+                            ".offset: " + std::to_string(expected->offset)});
+  }
+
+  if (actual->length != expected->length) {
+    differences_.push_back({path, ".length: " + std::to_string(actual->length),
+                            ".length: " + std::to_string(expected->length)});
+  }
+
+  // ArrowArrayViewSetArray() ensured that number of children of both match 
schema
+  for (int64_t i = 0; i < expected_->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        CompareColumn(schema_->children[i], actual_->children[i], 
expected_->children[i],
+                      error, path + std::string(".children[") + 
std::to_string(i) + "]"));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode TestingJSONComparison::CompareField(ArrowSchema* actual,
+                                                   ArrowSchema* expected,
+                                                   ArrowError* error,
+                                                   const std::string& path) {
+  // Preprocess both fields such that map types have canonical names
+  nanoarrow::UniqueSchema actual_copy;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(actual, 
actual_copy.get()),
+                                     error);
+  nanoarrow::UniqueSchema expected_copy;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(expected, 
expected_copy.get()),
+                                     error);
+
+  
NANOARROW_RETURN_NOT_OK_WITH_ERROR(ForceMapNamesCanonical(actual_copy.get()), 
error);
+  
NANOARROW_RETURN_NOT_OK_WITH_ERROR(ForceMapNamesCanonical(expected_copy.get()), 
error);
+  return CompareFieldBase(actual_copy.get(), expected_copy.get(), error, path);
+}
+
+ArrowErrorCode TestingJSONComparison::CompareFieldBase(ArrowSchema* actual,
+                                                       ArrowSchema* expected,
+                                                       ArrowError* error,
+                                                       const std::string& 
path) {
+  std::stringstream ss;
+
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteField(ss, 
expected), error);
+  std::string expected_json = ss.str();
+
+  ss.str("");
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteField(ss, actual), 
error);
+  std::string actual_json = ss.str();
+
+  if (actual_json != expected_json) {
+    differences_.push_back({path, actual_json, expected_json});
+  }
+
+  NANOARROW_RETURN_NOT_OK(CompareMetadata(actual->metadata, 
expected->metadata, error,
+                                          path + std::string(".metadata")));
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode TestingJSONComparison::CompareMetadata(const char* actual,
+                                                      const char* expected,
+                                                      ArrowError* error,
+                                                      const std::string& path) 
{
+  std::stringstream ss;
+
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteMetadata(ss, actual), 
error);
+  std::string actual_json = ss.str();
+
+  ss.str("");
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteMetadata(ss, 
expected), error);
+  std::string expected_json = ss.str();
+
+  bool metadata_equal = actual_json == expected_json;
+
+  // If there is a difference in the rendered JSON but we aren't being strict 
about
+  // order, check again using the KeyValue comparison.
+  if (!metadata_equal && !compare_metadata_order_) {
+    NANOARROW_RETURN_NOT_OK(
+        MetadataEqualKeyValue(actual, expected, &metadata_equal, error));
+  }
+
+  // If we still have an inequality, add a difference.
+  if (!metadata_equal) {
+    differences_.push_back({path, actual_json, expected_json});
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode TestingJSONComparison::CompareColumn(ArrowSchema* schema,
+                                                    ArrowArrayView* actual,
+                                                    ArrowArrayView* expected,
+                                                    ArrowError* error,
+                                                    const std::string& path) {
+  // Compare children and dictionaries first, then higher-level structures 
after.
+  // This is redundant because the higher-level serialized JSON will also 
report
+  // a difference if deeply nested children have differences; however, it will 
not
+  // contain dictionaries and this output is slightly better (more targeted 
differences
+  // that are slightly easier to read appear first).
+  for (int64_t i = 0; i < schema->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(CompareColumn(schema->children[i], 
actual->children[i],
+                                          expected->children[i], error,
+                                          path + ".children[" + 
std::to_string(i) + "]"));
+  }
+
+  if (schema->dictionary != nullptr) {
+    NANOARROW_RETURN_NOT_OK(CompareColumn(schema->dictionary, 
actual->dictionary,
+                                          expected->dictionary, error,
+                                          path + ".dictionary"));
+  }
+
+  std::stringstream ss;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteColumn(ss, schema, 
expected),
+                                     error);
+  std::string expected_json = ss.str();
+
+  ss.str("");
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteColumn(ss, schema, 
actual),
+                                     error);
+  std::string actual_json = ss.str();
+
+  if (actual_json != expected_json) {
+    differences_.push_back({path, actual_json, expected_json});
+  }
+
+  return NANOARROW_OK;
+}
+ArrowErrorCode TestingJSONComparison::MetadataEqualKeyValue(const char* actual,
+                                                            const char* 
expected,
+                                                            bool* out,
+                                                            ArrowError* error) 
{
+  std::unordered_map<std::string, std::string> actual_map, expected_map;
+  NANOARROW_RETURN_NOT_OK(MetadataToMap(actual, &actual_map, error));
+  NANOARROW_RETURN_NOT_OK(MetadataToMap(expected, &expected_map, error));
+
+  if (actual_map.size() != expected_map.size()) {
+    *out = false;
+    return NANOARROW_OK;
+  }
+
+  for (const auto& item : expected_map) {
+    const auto& actual_item = actual_map.find(item.first);
+    if (actual_item == actual_map.end()) {
+      *out = false;
+      return NANOARROW_OK;
+    }
+
+    if (actual_item->second != item.second) {
+      *out = false;
+      return NANOARROW_OK;
+    }
+  }
+
+  *out = true;
+  return NANOARROW_OK;
+}
 }  // namespace testing
 }  // namespace nanoarrow

(arrow-nanoarrow) branch main updated: chore: Improve tidiness of nanoarrow_testing.hpp (#667)

Reply via email to