This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 475503ef chore: Improve tidiness of nanoarrow_testing.hpp (#667)
475503ef is described below
commit 475503ef551e7e173c5a950f697c8f2ac25dab72
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Oct 29 20:37:51 2024 +0000
chore: Improve tidiness of nanoarrow_testing.hpp (#667)
There were still a number of implementation details left in
nanoarrow_testing.hpp. As part of the effort to improve the testing
utilities and remove the Arrow C++ requirement, a few things may need
to be added here, and tidying this header seemed like a good place to start.
---
src/nanoarrow/integration/ipc_integration.cc | 1 +
src/nanoarrow/nanoarrow.hpp | 6 +-
src/nanoarrow/nanoarrow_device.hpp | 6 +-
src/nanoarrow/nanoarrow_testing.hpp | 457 ++----------------------
src/nanoarrow/testing/testing.cc | 507 ++++++++++++++++++++++++++-
5 files changed, 524 insertions(+), 453 deletions(-)
diff --git a/src/nanoarrow/integration/ipc_integration.cc
b/src/nanoarrow/integration/ipc_integration.cc
index f0f29673..47ec9392 100644
--- a/src/nanoarrow/integration/ipc_integration.cc
+++ b/src/nanoarrow/integration/ipc_integration.cc
@@ -16,6 +16,7 @@
// under the License.
#include <cstdlib>
+#include <sstream>
#include <nanoarrow/nanoarrow_ipc.hpp>
#include <nanoarrow/nanoarrow_testing.hpp>
diff --git a/src/nanoarrow/nanoarrow.hpp b/src/nanoarrow/nanoarrow.hpp
index 138a4ac1..84125ada 100644
--- a/src/nanoarrow/nanoarrow.hpp
+++ b/src/nanoarrow/nanoarrow.hpp
@@ -15,6 +15,9 @@
// specific language governing permissions and limitations
// under the License.
+#ifndef NANOARROW_HPP_INCLUDED
+#define NANOARROW_HPP_INCLUDED
+
#include <cstring>
#include <exception>
#include <string>
@@ -22,9 +25,6 @@
#include "nanoarrow/nanoarrow.h"
-#ifndef NANOARROW_HPP_INCLUDED
-#define NANOARROW_HPP_INCLUDED
-
/// \defgroup nanoarrow_hpp Nanoarrow C++ Helpers
///
/// The utilities provided in this file are intended to support C++ users
diff --git a/src/nanoarrow/nanoarrow_device.hpp
b/src/nanoarrow/nanoarrow_device.hpp
index c83eaf7f..9b9ff789 100644
--- a/src/nanoarrow/nanoarrow_device.hpp
+++ b/src/nanoarrow/nanoarrow_device.hpp
@@ -15,12 +15,12 @@
// specific language governing permissions and limitations
// under the License.
-#include "nanoarrow/nanoarrow.hpp"
-#include "nanoarrow/nanoarrow_device.h"
-
#ifndef NANOARROW_DEVICE_HPP_INCLUDED
#define NANOARROW_DEVICE_HPP_INCLUDED
+#include "nanoarrow/nanoarrow.hpp"
+#include "nanoarrow/nanoarrow_device.h"
+
namespace nanoarrow {
namespace internal {
diff --git a/src/nanoarrow/nanoarrow_testing.hpp
b/src/nanoarrow/nanoarrow_testing.hpp
index 981e78e2..b26e857d 100644
--- a/src/nanoarrow/nanoarrow_testing.hpp
+++ b/src/nanoarrow/nanoarrow_testing.hpp
@@ -15,18 +15,15 @@
// specific language governing permissions and limitations
// under the License.
-#include <algorithm>
+#ifndef NANOARROW_TESTING_HPP_INCLUDED
+#define NANOARROW_TESTING_HPP_INCLUDED
+
#include <iostream>
-#include <limits>
-#include <sstream>
+#include <memory>
#include <string>
-#include <unordered_map>
#include "nanoarrow/nanoarrow.hpp"
-#ifndef NANOARROW_TESTING_HPP_INCLUDED
-#define NANOARROW_TESTING_HPP_INCLUDED
-
/// \defgroup nanoarrow_testing Nanoarrow Testing Helpers
///
/// Utilities for testing nanoarrow structures and functions.
@@ -35,95 +32,10 @@ namespace nanoarrow {
namespace testing {
+// Forward-declaration of internal types
namespace internal {
-
-// Internal representation of the various structures needed to import and/or
export
-// a dictionary array. We use a serialized version of the dictionary value
because
-// nanoarrow doesn't currently have the ability to copy or reference count an
Array.
-struct Dictionary {
- nanoarrow::UniqueSchema schema;
- int64_t column_length;
- std::string column_json;
-};
-
-class DictionaryContext {
- public:
- DictionaryContext() : next_id_(0) {}
-
- ArrowErrorCode RecordSchema(int32_t dictionary_id, const ArrowSchema*
values_schema) {
- if (!HasDictionaryForId(dictionary_id)) {
- dictionaries_[dictionary_id] = internal::Dictionary();
- NANOARROW_RETURN_NOT_OK(
- ArrowSchemaDeepCopy(values_schema,
dictionaries_[dictionary_id].schema.get()));
- }
-
- dictionary_ids_[values_schema] = dictionary_id;
- return NANOARROW_OK;
- }
-
- ArrowErrorCode RecordSchema(const ArrowSchema* values_schema, int32_t*
dictionary_id) {
- while (HasDictionaryForId(next_id_)) {
- next_id_++;
- }
-
- NANOARROW_RETURN_NOT_OK(RecordSchema(next_id_, values_schema));
- *dictionary_id = next_id_++;
- return NANOARROW_OK;
- }
-
- void RecordArray(int32_t dictionary_id, int64_t length, std::string
column_json) {
- dictionaries_[dictionary_id].column_length = length;
- dictionaries_[dictionary_id].column_json = std::move(column_json);
- }
-
- void RecordArray(const ArrowSchema* values_schema, int64_t length,
- std::string column_json) {
- auto ids_it = dictionary_ids_.find(values_schema);
- RecordArray(ids_it->second, length, column_json);
- }
-
- bool empty() { return dictionaries_.empty(); }
-
- void clear() {
- dictionaries_.clear();
- dictionary_ids_.clear();
- next_id_ = 0;
- }
-
- bool HasDictionaryForSchema(const ArrowSchema* values_schema) const {
- return dictionary_ids_.find(values_schema) != dictionary_ids_.end();
- }
-
- bool HasDictionaryForId(int32_t dictionary_id) const {
- return dictionaries_.find(dictionary_id) != dictionaries_.end();
- }
-
- const Dictionary& Get(int32_t dictionary_id) const {
- auto dict_it = dictionaries_.find(dictionary_id);
- return dict_it->second;
- }
-
- const Dictionary& Get(const ArrowSchema* values_schema) const {
- auto ids_it = dictionary_ids_.find(values_schema);
- return Get(ids_it->second);
- }
-
- const std::vector<int32_t> GetAllIds() const {
- std::vector<int32_t> out;
- out.reserve(dictionaries_.size());
- for (const auto& value : dictionaries_) {
- out.push_back(value.first);
- }
- return out;
- }
-
- private:
- int32_t next_id_;
- std::unordered_map<int32_t, Dictionary> dictionaries_;
- std::unordered_map<const ArrowSchema*, int32_t> dictionary_ids_;
-};
-
-} // namespace internal
+class DictionaryContext;
+}
/// \defgroup nanoarrow_testing-json Integration test helpers
///
@@ -136,7 +48,8 @@ class DictionaryContext {
/// \brief Writer for the Arrow integration testing JSON format
class TestingJSONWriter {
public:
- TestingJSONWriter() : float_precision_(-1), include_metadata_(true) {}
+ TestingJSONWriter();
+ ~TestingJSONWriter();
/// \brief Set the floating point precision of the writer
///
@@ -151,7 +64,7 @@ class TestingJSONWriter {
/// Use false to skip writing schema/field metadata in the output.
void set_include_metadata(bool value) { include_metadata_ = value; }
- void ResetDictionaries() { dictionaries_.clear(); }
+ void ResetDictionaries();
/// \brief Write an ArrowArrayStream as a data file JSON object to out
///
@@ -195,7 +108,7 @@ class TestingJSONWriter {
private:
int float_precision_;
bool include_metadata_;
- internal::DictionaryContext dictionaries_;
+ std::unique_ptr<internal::DictionaryContext> dictionaries_;
bool ShouldWriteMetadata(const char* metadata) {
return metadata != nullptr && include_metadata_;
@@ -216,8 +129,9 @@ class TestingJSONWriter {
/// \brief Reader for the Arrow integration testing JSON format
class TestingJSONReader {
public:
- TestingJSONReader(ArrowBufferAllocator allocator) : allocator_(allocator) {}
- TestingJSONReader() : TestingJSONReader(ArrowBufferAllocatorDefault()) {}
+ TestingJSONReader(ArrowBufferAllocator allocator);
+ TestingJSONReader();
+ ~TestingJSONReader();
static const int kNumBatchOnlySchema = -2;
static const int kNumBatchReadAll = -1;
@@ -261,7 +175,7 @@ class TestingJSONReader {
private:
ArrowBufferAllocator allocator_;
- internal::DictionaryContext dictionaries_;
+ std::unique_ptr<internal::DictionaryContext> dictionaries_;
void SetArrayAllocatorRecursive(ArrowArray* array);
};
@@ -319,14 +233,7 @@ class TestingJSONComparison {
int64_t num_differences() const { return differences_.size(); }
/// \brief Dump a human-readable summary of differences to out
- void WriteDifferences(std::ostream& out) {
- for (const auto& difference : differences_) {
- out << "Path: " << difference.path << "\n";
- out << "- " << difference.actual << "\n";
- out << "+ " << difference.expected << "\n";
- out << "\n";
- }
- }
+ void WriteDifferences(std::ostream& out);
/// \brief Clear any existing differences
void ClearDifferences() { differences_.clear(); }
@@ -342,175 +249,24 @@ class TestingJSONComparison {
/// Returns NANOARROW_OK if the comparison ran without error. Callers must
/// query num_differences() to obtain the result of the comparison on
success.
ArrowErrorCode CompareArrayStream(ArrowArrayStream* actual,
ArrowArrayStream* expected,
- ArrowError* error = nullptr) {
- // Read both schemas
- nanoarrow::UniqueSchema actual_schema;
- nanoarrow::UniqueSchema expected_schema;
- NANOARROW_RETURN_NOT_OK(
- ArrowArrayStreamGetSchema(actual, actual_schema.get(), error));
- NANOARROW_RETURN_NOT_OK(
- ArrowArrayStreamGetSchema(expected, expected_schema.get(), error));
-
- // Compare them and return if they are not equal
- NANOARROW_RETURN_NOT_OK(
- CompareSchema(expected_schema.get(), actual_schema.get(), error,
"Schema"));
- if (num_differences() > 0) {
- return NANOARROW_OK;
- }
-
- // Keep a record of the schema to compare batches
- NANOARROW_RETURN_NOT_OK(SetSchema(expected_schema.get(), error));
-
- int64_t n_batches = -1;
- nanoarrow::UniqueArray actual_array;
- nanoarrow::UniqueArray expected_array;
- do {
- n_batches++;
- std::string batch_label = std::string("Batch ") +
std::to_string(n_batches);
-
- // Read a batch from each stream
- actual_array.reset();
- expected_array.reset();
- NANOARROW_RETURN_NOT_OK(ArrowArrayStreamGetNext(actual,
actual_array.get(), error));
- NANOARROW_RETURN_NOT_OK(
- ArrowArrayStreamGetNext(expected, expected_array.get(), error));
-
- // Check the finished/unfinished status of both streams
- if (actual_array->release == nullptr && expected_array->release !=
nullptr) {
- differences_.push_back({batch_label, "finished stream", "unfinished
stream"});
- return NANOARROW_OK;
- }
-
- if (actual_array->release != nullptr && expected_array->release ==
nullptr) {
- differences_.push_back({batch_label, "unfinished stream", "finished
stream"});
- return NANOARROW_OK;
- }
-
- // If both streams are done, break
- if (actual_array->release == nullptr) {
- break;
- }
-
- // Compare this batch
- NANOARROW_RETURN_NOT_OK(
- CompareBatch(actual_array.get(), expected_array.get(), error,
batch_label));
- } while (true);
-
- return NANOARROW_OK;
- }
+ ArrowError* error = nullptr);
/// \brief Compare a top-level ArrowSchema struct
///
/// Returns NANOARROW_OK if the comparison ran without error. Callers must
/// query num_differences() to obtain the result of the comparison on
success.
ArrowErrorCode CompareSchema(const ArrowSchema* actual, const ArrowSchema*
expected,
- ArrowError* error = nullptr,
- const std::string& path = "") {
- writer_actual_.ResetDictionaries();
- writer_expected_.ResetDictionaries();
-
- // Compare the top-level schema "manually" because (1) map type needs
special-cased
- // comparison and (2) it's easier to read the output if differences are
separated
- // by field.
- ArrowSchemaView actual_view;
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaViewInit(&actual_view,
actual, nullptr),
- error);
-
- ArrowSchemaView expected_view;
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(
- ArrowSchemaViewInit(&expected_view, expected, nullptr), error);
-
- if (actual_view.type != NANOARROW_TYPE_STRUCT ||
- expected_view.type != NANOARROW_TYPE_STRUCT) {
- ArrowErrorSet(error, "Top-level schema must be struct");
- return EINVAL;
- }
-
- // (Purposefully ignore the name field at the top level)
-
- // Compare flags
- if (compare_batch_flags_ && actual->flags != expected->flags) {
- differences_.push_back({path,
- std::string(".flags: ") +
std::to_string(actual->flags),
- std::string(".flags: ") +
std::to_string(expected->flags)});
- }
-
- // Compare children
- if (actual->n_children != expected->n_children) {
- differences_.push_back(
- {path, std::string(".n_children: ") +
std::to_string(actual->n_children),
- std::string(".n_children: ") +
std::to_string(expected->n_children)});
- } else {
- for (int64_t i = 0; i < expected->n_children; i++) {
- NANOARROW_RETURN_NOT_OK(CompareField(
- actual->children[i], expected->children[i], error,
- path + std::string(".children[") + std::to_string(i) +
std::string("]")));
- }
- }
-
- // Compare metadata
- NANOARROW_RETURN_NOT_OK(CompareMetadata(actual->metadata,
expected->metadata, error,
- path + std::string(".metadata")));
-
- return NANOARROW_OK;
- }
+ ArrowError* error = nullptr, const std::string&
path = "");
/// \brief Set the ArrowSchema to be used for future calls to
CompareBatch().
- ArrowErrorCode SetSchema(const ArrowSchema* schema, ArrowError* error =
nullptr) {
- schema_.reset();
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(schema,
schema_.get()), error);
- actual_.reset();
- expected_.reset();
-
- NANOARROW_RETURN_NOT_OK(
- ArrowArrayViewInitFromSchema(actual_.get(), schema_.get(), error));
- NANOARROW_RETURN_NOT_OK(
- ArrowArrayViewInitFromSchema(expected_.get(), schema_.get(), error));
-
- if (actual_->storage_type != NANOARROW_TYPE_STRUCT) {
- ArrowErrorSet(error, "Can't SetSchema() with non-struct");
- return EINVAL;
- }
-
- // "Write" the schema using both writers to ensure dictionary ids can be
resolved
- // using the ArrowSchema* pointers from schema_
- std::stringstream ss;
- writer_actual_.ResetDictionaries();
- writer_expected_.ResetDictionaries();
- writer_actual_.WriteSchema(ss, schema_.get());
- writer_expected_.WriteSchema(ss, schema_.get());
-
- return NANOARROW_OK;
- }
+ ArrowErrorCode SetSchema(const ArrowSchema* schema, ArrowError* error =
nullptr);
/// \brief Compare a top-level ArrowArray struct
///
/// Returns NANOARROW_OK if the comparison ran without error. Callers must
/// query num_differences() to obtain the result of the comparison on
success.
ArrowErrorCode CompareBatch(const ArrowArray* actual, const ArrowArray*
expected,
- ArrowError* error = nullptr, const std::string&
path = "") {
- NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(expected_.get(), expected,
error));
- NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(actual_.get(), actual,
error));
-
- if (actual->offset != expected->offset) {
- differences_.push_back({path, ".offset: " +
std::to_string(actual->offset),
- ".offset: " + std::to_string(expected->offset)});
- }
-
- if (actual->length != expected->length) {
- differences_.push_back({path, ".length: " +
std::to_string(actual->length),
- ".length: " + std::to_string(expected->length)});
- }
-
- // ArrowArrayViewSetArray() ensured that number of children of both match
schema
- for (int64_t i = 0; i < expected_->n_children; i++) {
- NANOARROW_RETURN_NOT_OK(CompareColumn(
- schema_->children[i], actual_->children[i], expected_->children[i],
error,
- path + std::string(".children[") + std::to_string(i) + "]"));
- }
-
- return NANOARROW_OK;
- }
+ ArrowError* error = nullptr, const std::string&
path = "");
private:
TestingJSONWriter writer_actual_;
@@ -525,183 +281,20 @@ class TestingJSONComparison {
bool compare_metadata_order_;
ArrowErrorCode CompareField(ArrowSchema* actual, ArrowSchema* expected,
- ArrowError* error, const std::string& path = "")
{
- // Preprocess both fields such that map types have canonical names
- nanoarrow::UniqueSchema actual_copy;
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(actual,
actual_copy.get()),
- error);
- nanoarrow::UniqueSchema expected_copy;
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(expected,
expected_copy.get()),
- error);
-
-
NANOARROW_RETURN_NOT_OK_WITH_ERROR(ForceMapNamesCanonical(actual_copy.get()),
error);
-
NANOARROW_RETURN_NOT_OK_WITH_ERROR(ForceMapNamesCanonical(expected_copy.get()),
- error);
- return CompareFieldBase(actual_copy.get(), expected_copy.get(), error,
path);
- }
+ ArrowError* error, const std::string& path = "");
ArrowErrorCode CompareFieldBase(ArrowSchema* actual, ArrowSchema* expected,
- ArrowError* error, const std::string& path =
"") {
- std::stringstream ss;
-
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteField(ss,
expected), error);
- std::string expected_json = ss.str();
-
- ss.str("");
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteField(ss, actual),
error);
- std::string actual_json = ss.str();
-
- if (actual_json != expected_json) {
- differences_.push_back({path, actual_json, expected_json});
- }
-
- NANOARROW_RETURN_NOT_OK(CompareMetadata(actual->metadata,
expected->metadata, error,
- path + std::string(".metadata")));
- return NANOARROW_OK;
- }
+ ArrowError* error, const std::string& path =
"");
ArrowErrorCode CompareMetadata(const char* actual, const char* expected,
- ArrowError* error, const std::string& path =
"") {
- std::stringstream ss;
-
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteMetadata(ss,
actual), error);
- std::string actual_json = ss.str();
-
- ss.str("");
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteMetadata(ss,
expected),
- error);
- std::string expected_json = ss.str();
-
- bool metadata_equal = actual_json == expected_json;
-
- // If there is a difference in the rendered JSON but we aren't being
strict about
- // order, check again using the KeyValue comparison.
- if (!metadata_equal && !compare_metadata_order_) {
- NANOARROW_RETURN_NOT_OK(
- MetadataEqualKeyValue(actual, expected, &metadata_equal, error));
- }
-
- // If we still have an inequality, add a difference.
- if (!metadata_equal) {
- differences_.push_back({path, actual_json, expected_json});
- }
-
- return NANOARROW_OK;
- }
+ ArrowError* error, const std::string& path =
"");
ArrowErrorCode MetadataEqualKeyValue(const char* actual, const char*
expected,
- bool* out, ArrowError* error) {
- std::unordered_map<std::string, std::string> actual_map, expected_map;
- NANOARROW_RETURN_NOT_OK(MetadataToMap(actual, &actual_map, error));
- NANOARROW_RETURN_NOT_OK(MetadataToMap(expected, &expected_map, error));
-
- if (actual_map.size() != expected_map.size()) {
- *out = false;
- return NANOARROW_OK;
- }
-
- for (const auto& item : expected_map) {
- const auto& actual_item = actual_map.find(item.first);
- if (actual_item == actual_map.end()) {
- *out = false;
- return NANOARROW_OK;
- }
-
- if (actual_item->second != item.second) {
- *out = false;
- return NANOARROW_OK;
- }
- }
-
- *out = true;
- return NANOARROW_OK;
- }
-
- ArrowErrorCode MetadataToMap(const char* metadata,
- std::unordered_map<std::string, std::string>*
out,
- ArrowError* error) {
- ArrowMetadataReader reader;
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderInit(&reader,
metadata), error);
-
- ArrowStringView key, value;
- size_t metadata_num_keys = 0;
- while (reader.remaining_keys > 0) {
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderRead(&reader,
&key, &value),
- error);
- out->insert({std::string(key.data, key.size_bytes),
- std::string(value.data, value.size_bytes)});
- metadata_num_keys++;
- }
-
- if (metadata_num_keys != out->size()) {
- ArrowErrorSet(error,
- "Comparison of metadata containing duplicate keys without "
- "considering order is not implemented");
- return ENOTSUP;
- }
-
- return NANOARROW_OK;
- }
+ bool* out, ArrowError* error);
ArrowErrorCode CompareColumn(ArrowSchema* schema, ArrowArrayView* actual,
ArrowArrayView* expected, ArrowError* error,
- const std::string& path = "") {
- // Compare children and dictionaries first, then higher-level structures
after.
-    // This is redundant because the higher-level serialized JSON will also
report
- // a difference if deeply nested children have differences; however, it
will not
- // contain dictionaries and this output is slightly better (more targeted
differences
- // that are slightly easier to read appear first).
- for (int64_t i = 0; i < schema->n_children; i++) {
- NANOARROW_RETURN_NOT_OK(
- CompareColumn(schema->children[i], actual->children[i],
expected->children[i],
- error, path + ".children[" + std::to_string(i) + "]"));
- }
-
- if (schema->dictionary != nullptr) {
- NANOARROW_RETURN_NOT_OK(CompareColumn(schema->dictionary,
actual->dictionary,
- expected->dictionary, error,
- path + ".dictionary"));
- }
-
- std::stringstream ss;
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteColumn(ss,
schema, expected),
- error);
- std::string expected_json = ss.str();
-
- ss.str("");
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteColumn(ss, schema,
actual),
- error);
- std::string actual_json = ss.str();
-
- if (actual_json != expected_json) {
- differences_.push_back({path, actual_json, expected_json});
- }
-
- return NANOARROW_OK;
- }
-
- ArrowErrorCode ForceMapNamesCanonical(ArrowSchema* schema) {
- ArrowSchemaView view;
- NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
-
- if (view.type == NANOARROW_TYPE_MAP) {
- NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0],
"entries"));
- NANOARROW_RETURN_NOT_OK(
- ArrowSchemaSetName(schema->children[0]->children[0], "key"));
- NANOARROW_RETURN_NOT_OK(
- ArrowSchemaSetName(schema->children[0]->children[1], "value"));
- }
-
- for (int64_t i = 0; i < schema->n_children; i++) {
- NANOARROW_RETURN_NOT_OK(ForceMapNamesCanonical(schema->children[i]));
- }
-
- if (schema->dictionary != nullptr) {
- NANOARROW_RETURN_NOT_OK(ForceMapNamesCanonical(schema->dictionary));
- }
-
- return NANOARROW_OK;
- }
+ const std::string& path = "");
};
/// @}
diff --git a/src/nanoarrow/testing/testing.cc b/src/nanoarrow/testing/testing.cc
index 59f3617c..aba4b11f 100644
--- a/src/nanoarrow/testing/testing.cc
+++ b/src/nanoarrow/testing/testing.cc
@@ -15,6 +15,13 @@
// specific language governing permissions and limitations
// under the License.
+#include <algorithm>
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <unordered_map>
+#include <vector>
+
#include "nlohmann/json.hpp"
#include "nanoarrow/nanoarrow_testing.hpp"
@@ -23,6 +30,94 @@ namespace nanoarrow {
namespace testing {
+namespace internal {
+// Internal representation of the various structures needed to import and/or
export
+// a dictionary array. We use a serialized version of the dictionary value
because
+// nanoarrow doesn't currently have the ability to copy or reference count an
Array.
+struct Dictionary {
+ nanoarrow::UniqueSchema schema;
+ int64_t column_length;
+ std::string column_json;
+};
+
+class DictionaryContext {
+ public:
+ DictionaryContext() : next_id_(0) {}
+
+ ArrowErrorCode RecordSchema(int32_t dictionary_id, const ArrowSchema*
values_schema) {
+ if (!HasDictionaryForId(dictionary_id)) {
+ dictionaries_[dictionary_id] = Dictionary();
+ NANOARROW_RETURN_NOT_OK(
+ ArrowSchemaDeepCopy(values_schema,
dictionaries_[dictionary_id].schema.get()));
+ }
+
+ dictionary_ids_[values_schema] = dictionary_id;
+ return NANOARROW_OK;
+ }
+
+ ArrowErrorCode RecordSchema(const ArrowSchema* values_schema, int32_t*
dictionary_id) {
+ while (HasDictionaryForId(next_id_)) {
+ next_id_++;
+ }
+
+ NANOARROW_RETURN_NOT_OK(RecordSchema(next_id_, values_schema));
+ *dictionary_id = next_id_++;
+ return NANOARROW_OK;
+ }
+
+ void RecordArray(int32_t dictionary_id, int64_t length, std::string
column_json) {
+ dictionaries_[dictionary_id].column_length = length;
+ dictionaries_[dictionary_id].column_json = std::move(column_json);
+ }
+
+ void RecordArray(const ArrowSchema* values_schema, int64_t length,
+ std::string column_json) {
+ auto ids_it = dictionary_ids_.find(values_schema);
+ RecordArray(ids_it->second, length, column_json);
+ }
+
+ bool empty() { return dictionaries_.empty(); }
+
+ void clear() {
+ dictionaries_.clear();
+ dictionary_ids_.clear();
+ next_id_ = 0;
+ }
+
+ bool HasDictionaryForSchema(const ArrowSchema* values_schema) const {
+ return dictionary_ids_.find(values_schema) != dictionary_ids_.end();
+ }
+
+ bool HasDictionaryForId(int32_t dictionary_id) const {
+ return dictionaries_.find(dictionary_id) != dictionaries_.end();
+ }
+
+ const Dictionary& Get(int32_t dictionary_id) const {
+ auto dict_it = dictionaries_.find(dictionary_id);
+ return dict_it->second;
+ }
+
+ const Dictionary& Get(const ArrowSchema* values_schema) const {
+ auto ids_it = dictionary_ids_.find(values_schema);
+ return Get(ids_it->second);
+ }
+
+ const std::vector<int32_t> GetAllIds() const {
+ std::vector<int32_t> out;
+ out.reserve(dictionaries_.size());
+ for (const auto& value : dictionaries_) {
+ out.push_back(value.first);
+ }
+ return out;
+ }
+
+ private:
+ int32_t next_id_;
+ std::unordered_map<int32_t, Dictionary> dictionaries_;
+ std::unordered_map<const ArrowSchema*, int32_t> dictionary_ids_;
+};
+} // namespace internal
+
namespace writer_internal {
namespace {
@@ -511,6 +606,15 @@ ArrowErrorCode WriteMetadataItem(std::ostream& out,
ArrowMetadataReader* reader)
} // namespace writer_internal
+TestingJSONWriter::TestingJSONWriter()
+ : float_precision_(-1),
+ include_metadata_(true),
+ dictionaries_(new internal::DictionaryContext()) {}
+
+TestingJSONWriter::~TestingJSONWriter() = default;
+
+void TestingJSONWriter::ResetDictionaries() { dictionaries_->clear(); }
+
ArrowErrorCode TestingJSONWriter::WriteDataFile(std::ostream& out,
ArrowArrayStream* stream) {
if (stream == nullptr || stream->release == nullptr) {
@@ -550,7 +654,7 @@ ArrowErrorCode
TestingJSONWriter::WriteDataFile(std::ostream& out,
out << "]";
- if (!dictionaries_.empty()) {
+ if (!dictionaries_->empty()) {
out << R"(, "dictionaries": )";
NANOARROW_RETURN_NOT_OK(WriteDictionaryBatches(out));
}
@@ -627,7 +731,7 @@ ArrowErrorCode TestingJSONWriter::WriteField(std::ostream&
out,
int32_t dictionary_id;
NANOARROW_RETURN_NOT_OK(
- dictionaries_.RecordSchema(field->dictionary, &dictionary_id));
+ dictionaries_->RecordSchema(field->dictionary, &dictionary_id));
out << R"(, "dictionary": )";
view.type = view.storage_type;
@@ -804,22 +908,22 @@ ArrowErrorCode
TestingJSONWriter::WriteColumn(std::ostream& out, const ArrowSche
// Write the dictionary values to the DictionaryContext for later if
applicable
if (field->dictionary != nullptr) {
- if (!dictionaries_.HasDictionaryForSchema(field->dictionary)) {
+ if (!dictionaries_->HasDictionaryForSchema(field->dictionary)) {
return EINVAL;
}
std::stringstream dictionary_output;
NANOARROW_RETURN_NOT_OK(
WriteColumn(dictionary_output, field->dictionary, value->dictionary));
- dictionaries_.RecordArray(field->dictionary, value->dictionary->length,
- dictionary_output.str());
+ dictionaries_->RecordArray(field->dictionary, value->dictionary->length,
+ dictionary_output.str());
}
return NANOARROW_OK;
}
ArrowErrorCode TestingJSONWriter::WriteDictionaryBatches(std::ostream& out) {
- std::vector<int32_t> ids = dictionaries_.GetAllIds();
+ std::vector<int32_t> ids = dictionaries_->GetAllIds();
if (ids.empty()) {
out << "[]";
return NANOARROW_OK;
@@ -839,7 +943,7 @@ ArrowErrorCode
TestingJSONWriter::WriteDictionaryBatches(std::ostream& out) {
ArrowErrorCode TestingJSONWriter::WriteDictionaryBatch(std::ostream& out,
int32_t dictionary_id) {
- const internal::Dictionary& dict = dictionaries_.Get(dictionary_id);
+ const internal::Dictionary& dict = dictionaries_->Get(dictionary_id);
out << R"({"id": )" << dictionary_id << R"(, "data": {"count": )" <<
dict.column_length
<< R"(, "columns": [)" << dict.column_json << "]}}";
return NANOARROW_OK;
@@ -2136,10 +2240,18 @@ ArrowErrorCode RecordDictionaryBatches(const json&
value,
} // namespace reader_internal
+TestingJSONReader::TestingJSONReader(ArrowBufferAllocator allocator)
+ : allocator_(allocator), dictionaries_(new internal::DictionaryContext())
{}
+
+TestingJSONReader::TestingJSONReader()
+ : TestingJSONReader(ArrowBufferAllocatorDefault()) {}
+
+TestingJSONReader::~TestingJSONReader() = default;
+
ArrowErrorCode TestingJSONReader::ReadDataFile(const std::string&
data_file_json,
ArrowArrayStream* out, int
num_batch,
ArrowError* error) {
- dictionaries_.clear();
+ dictionaries_->clear();
try {
auto obj = nlohmann::json::parse(data_file_json);
@@ -2151,7 +2263,7 @@ ArrowErrorCode TestingJSONReader::ReadDataFile(const
std::string& data_file_json
// Read Schema
nanoarrow::UniqueSchema schema;
NANOARROW_RETURN_NOT_OK(
- reader_internal::SetSchema(schema.get(), obj["schema"], dictionaries_,
error));
+ reader_internal::SetSchema(schema.get(), obj["schema"],
*dictionaries_, error));
NANOARROW_RETURN_NOT_OK(reader_internal::Check(obj.contains("batches"),
error,
"data file missing key
'batches'"));
@@ -2167,7 +2279,7 @@ ArrowErrorCode TestingJSONReader::ReadDataFile(const
std::string& data_file_json
// Record any dictionaries that might be present
if (obj.contains("dictionaries")) {
NANOARROW_RETURN_NOT_OK(reader_internal::RecordDictionaryBatches(
- obj["dictionaries"], dictionaries_, error));
+ obj["dictionaries"], *dictionaries_, error));
}
// Get a vector of batch ids to parse
@@ -2198,7 +2310,7 @@ ArrowErrorCode TestingJSONReader::ReadDataFile(const
std::string& data_file_json
SetArrayAllocatorRecursive(array.get());
NANOARROW_RETURN_NOT_OK(reader_internal::SetArrayBatch(
batches[batch_ids[i]], schema.get(), array_view.get(), array.get(),
- dictionaries_, error));
+ *dictionaries_, error));
ArrowBasicArrayStreamSetArray(stream.get(), i, array.get());
}
@@ -2209,6 +2321,7 @@ ArrowErrorCode TestingJSONReader::ReadDataFile(const
std::string& data_file_json
return EINVAL;
}
}
+
ArrowErrorCode TestingJSONReader::ReadSchema(const std::string& schema_json,
ArrowSchema* out, ArrowError*
error) {
try {
@@ -2216,7 +2329,7 @@ ArrowErrorCode TestingJSONReader::ReadSchema(const
std::string& schema_json,
nanoarrow::UniqueSchema schema;
NANOARROW_RETURN_NOT_OK(
- reader_internal::SetSchema(schema.get(), obj, dictionaries_, error));
+ reader_internal::SetSchema(schema.get(), obj, *dictionaries_, error));
ArrowSchemaMove(schema.get(), out);
return NANOARROW_OK;
} catch (nlohmann::json::exception& e) {
@@ -2224,6 +2337,7 @@ ArrowErrorCode TestingJSONReader::ReadSchema(const
std::string& schema_json,
return EINVAL;
}
}
+
ArrowErrorCode TestingJSONReader::ReadField(const std::string& field_json,
ArrowSchema* out, ArrowError*
error) {
try {
@@ -2231,7 +2345,7 @@ ArrowErrorCode TestingJSONReader::ReadField(const
std::string& field_json,
nanoarrow::UniqueSchema schema;
NANOARROW_RETURN_NOT_OK(
- reader_internal::SetField(schema.get(), obj, dictionaries_, error));
+ reader_internal::SetField(schema.get(), obj, *dictionaries_, error));
ArrowSchemaMove(schema.get(), out);
return NANOARROW_OK;
} catch (nlohmann::json::exception& e) {
@@ -2257,7 +2371,7 @@ ArrowErrorCode TestingJSONReader::ReadBatch(const
std::string& batch_json,
SetArrayAllocatorRecursive(array.get());
NANOARROW_RETURN_NOT_OK(reader_internal::SetArrayBatch(
- obj, schema, array_view.get(), array.get(), dictionaries_, error));
+ obj, schema, array_view.get(), array.get(), *dictionaries_, error));
ArrowArrayMove(array.get(), out);
return NANOARROW_OK;
} catch (nlohmann::json::exception& e) {
@@ -2265,6 +2379,7 @@ ArrowErrorCode TestingJSONReader::ReadBatch(const
std::string& batch_json,
return EINVAL;
}
}
+
ArrowErrorCode TestingJSONReader::ReadColumn(const std::string& column_json,
const ArrowSchema* schema,
ArrowArray* out,
ArrowError* error) {
@@ -2283,7 +2398,7 @@ ArrowErrorCode TestingJSONReader::ReadColumn(const
std::string& column_json,
// Parse the JSON into the array
NANOARROW_RETURN_NOT_OK(reader_internal::SetArrayColumn(
- obj, schema, array_view.get(), array.get(), dictionaries_, error));
+ obj, schema, array_view.get(), array.get(), *dictionaries_, error));
// Return the result
ArrowArrayMove(array.get(), out);
@@ -2308,5 +2423,367 @@ void
TestingJSONReader::SetArrayAllocatorRecursive(ArrowArray* array) {
}
}
+namespace {
+
+// Collects the key/value pairs encoded in `metadata` into `*out`.
+//
+// Each pair produced by the metadata reader is inserted into the map; a key that
+// is already present keeps the value it had. If the number of pairs read does not
+// equal the final size of the map (as happens when a key repeats), an error
+// message is set and ENOTSUP is returned. Reader failures are returned as-is.
+ArrowErrorCode MetadataToMap(const char* metadata,
+                             std::unordered_map<std::string, std::string>* out,
+                             ArrowError* error) {
+  ArrowMetadataReader reader;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderInit(&reader, metadata),
+                                     error);
+
+  size_t n_pairs_read = 0;
+  for (; reader.remaining_keys > 0; n_pairs_read++) {
+    ArrowStringView k, v;
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderRead(&reader, &k, &v),
+                                       error);
+    out->emplace(std::string(k.data, k.size_bytes),
+                 std::string(v.data, v.size_bytes));
+  }
+
+  // Fewer map entries than pairs read means at least one insertion was dropped
+  if (out->size() != n_pairs_read) {
+    ArrowErrorSet(error,
+                  "Comparison of metadata containing duplicate keys without "
+                  "considering order is not implemented");
+    return ENOTSUP;
+  }
+
+  return NANOARROW_OK;
+}
+
+// Renames the children of every map-typed schema reachable from `schema` to
+// "entries", "key" and "value", recursing through children and dictionaries.
+// Returns the first non-OK code produced by any of the calls it makes.
+ArrowErrorCode ForceMapNamesCanonical(ArrowSchema* schema) {
+  ArrowSchemaView schema_view;
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr));
+
+  if (schema_view.type == NANOARROW_TYPE_MAP) {
+    ArrowSchema* entries = schema->children[0];
+    NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(entries, "entries"));
+    NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(entries->children[0], "key"));
+    NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(entries->children[1], "value"));
+  }
+
+  for (int64_t child = 0; child < schema->n_children; child++) {
+    NANOARROW_RETURN_NOT_OK(ForceMapNamesCanonical(schema->children[child]));
+  }
+
+  if (schema->dictionary == nullptr) {
+    return NANOARROW_OK;
+  }
+
+  return ForceMapNamesCanonical(schema->dictionary);
+}
+
+} // namespace
+
+// Writes every recorded difference to `out`: a "Path: " line, the actual value
+// prefixed with "- ", the expected value prefixed with "+ ", then an empty line.
+void TestingJSONComparison::WriteDifferences(std::ostream& out) {
+  for (const auto& diff : differences_) {
+    out << "Path: " << diff.path << "\n"
+        << "- " << diff.actual << "\n"
+        << "+ " << diff.expected << "\n"
+        << "\n";
+  }
+}
+
+// Compares two array streams: first their schemas, then their batches pairwise.
+//
+// Differences are appended to differences_; NANOARROW_OK is returned whenever
+// the comparison could be carried out, so callers inspect num_differences() for
+// the outcome. If any difference is on record after the schema comparison, or
+// if one stream ends before the other, no further batches are read. A failure
+// to read from either stream, or a failing comparison step, is returned as an
+// error code.
+ArrowErrorCode TestingJSONComparison::CompareArrayStream(ArrowArrayStream* actual,
+                                                         ArrowArrayStream* expected,
+                                                         ArrowError* error) {
+  // Read both schemas
+  nanoarrow::UniqueSchema actual_schema;
+  nanoarrow::UniqueSchema expected_schema;
+  NANOARROW_RETURN_NOT_OK(
+      ArrowArrayStreamGetSchema(actual, actual_schema.get(), error));
+  NANOARROW_RETURN_NOT_OK(
+      ArrowArrayStreamGetSchema(expected, expected_schema.get(), error));
+
+  // Compare them and return if they are not equal. CompareSchema() takes its
+  // arguments as (actual, expected); keeping that order here ensures each side
+  // of a reported difference is labelled correctly.
+  NANOARROW_RETURN_NOT_OK(
+      CompareSchema(actual_schema.get(), expected_schema.get(), error, "Schema"));
+  if (num_differences() > 0) {
+    return NANOARROW_OK;
+  }
+
+  // Keep a record of the schema to compare batches
+  NANOARROW_RETURN_NOT_OK(SetSchema(expected_schema.get(), error));
+
+  nanoarrow::UniqueArray actual_array;
+  nanoarrow::UniqueArray expected_array;
+  for (int64_t n_batches = 0;; n_batches++) {
+    const std::string batch_label = std::string("Batch ") + std::to_string(n_batches);
+
+    // Read a batch from each stream
+    actual_array.reset();
+    expected_array.reset();
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayStreamGetNext(actual, actual_array.get(), error));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayStreamGetNext(expected, expected_array.get(), error));
+
+    // A null release callback after GetNext() marks the end of that stream
+    const bool actual_done = actual_array->release == nullptr;
+    const bool expected_done = expected_array->release == nullptr;
+
+    // Check the finished/unfinished status of both streams
+    if (actual_done && !expected_done) {
+      differences_.push_back({batch_label, "finished stream", "unfinished stream"});
+      return NANOARROW_OK;
+    }
+
+    if (!actual_done && expected_done) {
+      differences_.push_back({batch_label, "unfinished stream", "finished stream"});
+      return NANOARROW_OK;
+    }
+
+    // If both streams are done, stop
+    if (actual_done) {
+      break;
+    }
+
+    // Compare this batch
+    NANOARROW_RETURN_NOT_OK(
+        CompareBatch(actual_array.get(), expected_array.get(), error, batch_label));
+  }
+
+  return NANOARROW_OK;
+}
+
+// Compares two top-level schemas and records what differs in differences_.
+//
+// Both writers' dictionaries are reset first. Flags are compared only when
+// compare_batch_flags_ is set; child fields are compared one by one only when
+// both schemas have the same number of children; top-level metadata is compared
+// last. The top-level name is deliberately not compared. Returns EINVAL when
+// either schema is not a struct.
+ArrowErrorCode TestingJSONComparison::CompareSchema(const ArrowSchema* actual,
+                                                    const ArrowSchema* expected,
+                                                    ArrowError* error,
+                                                    const std::string& path) {
+  writer_actual_.ResetDictionaries();
+  writer_expected_.ResetDictionaries();
+
+  // The top level is walked by hand because (1) map types need special-cased
+  // comparison and (2) differences separated by field are easier to read.
+  ArrowSchemaView view_of_actual;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowSchemaViewInit(&view_of_actual, actual, nullptr), error);
+
+  ArrowSchemaView view_of_expected;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowSchemaViewInit(&view_of_expected, expected, nullptr), error);
+
+  const bool both_struct = view_of_actual.type == NANOARROW_TYPE_STRUCT &&
+                           view_of_expected.type == NANOARROW_TYPE_STRUCT;
+  if (!both_struct) {
+    ArrowErrorSet(error, "Top-level schema must be struct");
+    return EINVAL;
+  }
+
+  // Flags
+  if (compare_batch_flags_ && actual->flags != expected->flags) {
+    const std::string actual_flags = ".flags: " + std::to_string(actual->flags);
+    const std::string expected_flags = ".flags: " + std::to_string(expected->flags);
+    differences_.push_back({path, actual_flags, expected_flags});
+  }
+
+  // Children
+  if (actual->n_children == expected->n_children) {
+    for (int64_t child = 0; child < expected->n_children; child++) {
+      const std::string child_path =
+          path + ".children[" + std::to_string(child) + "]";
+      NANOARROW_RETURN_NOT_OK(CompareField(actual->children[child],
+                                           expected->children[child], error,
+                                           child_path));
+    }
+  } else {
+    const std::string actual_count =
+        ".n_children: " + std::to_string(actual->n_children);
+    const std::string expected_count =
+        ".n_children: " + std::to_string(expected->n_children);
+    differences_.push_back({path, actual_count, expected_count});
+  }
+
+  // Metadata
+  return CompareMetadata(actual->metadata, expected->metadata, error,
+                         path + ".metadata");
+}
+
+// Stores a deep copy of `schema` in schema_ and rebuilds the actual_ and
+// expected_ array views from that copy. Returns EINVAL if the resulting storage
+// type is not a struct.
+ArrowErrorCode TestingJSONComparison::SetSchema(const ArrowSchema* schema,
+                                                ArrowError* error) {
+  schema_.reset();
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(schema, schema_.get()),
+                                     error);
+
+  actual_.reset();
+  expected_.reset();
+  NANOARROW_RETURN_NOT_OK(
+      ArrowArrayViewInitFromSchema(actual_.get(), schema_.get(), error));
+  NANOARROW_RETURN_NOT_OK(
+      ArrowArrayViewInitFromSchema(expected_.get(), schema_.get(), error));
+
+  if (actual_->storage_type != NANOARROW_TYPE_STRUCT) {
+    ArrowErrorSet(error, "Can't SetSchema() with non-struct");
+    return EINVAL;
+  }
+
+  // Run the stored schema through both writers so that dictionary ids can later
+  // be resolved using the ArrowSchema* pointers owned by schema_. The text that
+  // is produced is thrown away.
+  // NOTE(review): the results of WriteSchema() are not inspected here; confirm
+  // whether a failure at this point should be reported to the caller.
+  std::stringstream discarded_output;
+  writer_actual_.ResetDictionaries();
+  writer_expected_.ResetDictionaries();
+  writer_actual_.WriteSchema(discarded_output, schema_.get());
+  writer_expected_.WriteSchema(discarded_output, schema_.get());
+
+  return NANOARROW_OK;
+}
+
+// Compares one batch from each side against the schema stored in schema_.
+//
+// Both arrays are attached to the expected_/actual_ array views, the top-level
+// offset and length are compared, and then every child column is compared via
+// CompareColumn(). Differences are appended to differences_.
+ArrowErrorCode TestingJSONComparison::CompareBatch(const ArrowArray* actual,
+                                                   const ArrowArray* expected,
+                                                   ArrowError* error,
+                                                   const std::string& path) {
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(expected_.get(), expected, error));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(actual_.get(), actual, error));
+
+  if (actual->offset != expected->offset) {
+    const std::string actual_offset = ".offset: " + std::to_string(actual->offset);
+    const std::string expected_offset =
+        ".offset: " + std::to_string(expected->offset);
+    differences_.push_back({path, actual_offset, expected_offset});
+  }
+
+  if (actual->length != expected->length) {
+    const std::string actual_length = ".length: " + std::to_string(actual->length);
+    const std::string expected_length =
+        ".length: " + std::to_string(expected->length);
+    differences_.push_back({path, actual_length, expected_length});
+  }
+
+  // ArrowArrayViewSetArray() ensured that number of children of both match schema
+  const int64_t n_columns = expected_->n_children;
+  for (int64_t col = 0; col < n_columns; col++) {
+    const std::string column_path = path + ".children[" + std::to_string(col) + "]";
+    NANOARROW_RETURN_NOT_OK(CompareColumn(schema_->children[col],
+                                          actual_->children[col],
+                                          expected_->children[col], error,
+                                          column_path));
+  }
+
+  return NANOARROW_OK;
+}
+
+// Compares two fields after giving map types canonical child names.
+//
+// Works on deep copies so that neither input schema is modified; the copies are
+// handed to CompareFieldBase() once ForceMapNamesCanonical() has run on both.
+ArrowErrorCode TestingJSONComparison::CompareField(ArrowSchema* actual,
+                                                   ArrowSchema* expected,
+                                                   ArrowError* error,
+                                                   const std::string& path) {
+  nanoarrow::UniqueSchema canonical_actual;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowSchemaDeepCopy(actual, canonical_actual.get()), error);
+
+  nanoarrow::UniqueSchema canonical_expected;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowSchemaDeepCopy(expected, canonical_expected.get()), error);
+
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ForceMapNamesCanonical(canonical_actual.get()), error);
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ForceMapNamesCanonical(canonical_expected.get()), error);
+
+  return CompareFieldBase(canonical_actual.get(), canonical_expected.get(), error,
+                          path);
+}
+
+// Renders both fields with their respective writers and records a difference
+// when the rendered text differs, then compares the fields' metadata separately
+// under `path` + ".metadata".
+ArrowErrorCode TestingJSONComparison::CompareFieldBase(ArrowSchema* actual,
+                                                       ArrowSchema* expected,
+                                                       ArrowError* error,
+                                                       const std::string& path) {
+  std::stringstream rendered;
+
+  // The expected side is rendered first; the same stream is emptied and reused
+  // for the actual side.
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteField(rendered, expected),
+                                     error);
+  const std::string expected_json = rendered.str();
+
+  rendered.str("");
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteField(rendered, actual),
+                                     error);
+  const std::string actual_json = rendered.str();
+
+  if (actual_json != expected_json) {
+    differences_.push_back({path, actual_json, expected_json});
+  }
+
+  return CompareMetadata(actual->metadata, expected->metadata, error,
+                         path + ".metadata");
+}
+
+// Compares two metadata blobs and records a difference when they are unequal.
+//
+// Equality is decided on the rendered JSON first. When the JSON differs and
+// compare_metadata_order_ is false, the decision is taken again with
+// MetadataEqualKeyValue(). The recorded difference always carries the rendered
+// JSON of both sides.
+ArrowErrorCode TestingJSONComparison::CompareMetadata(const char* actual,
+                                                      const char* expected,
+                                                      ArrowError* error,
+                                                      const std::string& path) {
+  std::stringstream rendered;
+
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteMetadata(rendered, actual),
+                                     error);
+  const std::string actual_json = rendered.str();
+
+  rendered.str("");
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      writer_expected_.WriteMetadata(rendered, expected), error);
+  const std::string expected_json = rendered.str();
+
+  // Identical rendering: nothing to report
+  if (actual_json == expected_json) {
+    return NANOARROW_OK;
+  }
+
+  // The rendering differs; unless order matters, give the key/value comparison
+  // a chance to declare the two sides equal.
+  bool equal_as_key_value = false;
+  if (!compare_metadata_order_) {
+    NANOARROW_RETURN_NOT_OK(
+        MetadataEqualKeyValue(actual, expected, &equal_as_key_value, error));
+  }
+
+  if (!equal_as_key_value) {
+    differences_.push_back({path, actual_json, expected_json});
+  }
+
+  return NANOARROW_OK;
+}
+
+// Compares one column of `actual` against `expected`, both described by
+// `schema`, and records a difference when their rendered JSON differs.
+//
+// Children and dictionaries are visited before the column itself. This is
+// redundant for children, because the rendered JSON of the parent also shows a
+// difference when a nested child differs; however, that JSON does not contain
+// dictionaries, and reporting the most deeply nested differences first gives
+// output that is more targeted and slightly easier to read.
+ArrowErrorCode TestingJSONComparison::CompareColumn(ArrowSchema* schema,
+                                                    ArrowArrayView* actual,
+                                                    ArrowArrayView* expected,
+                                                    ArrowError* error,
+                                                    const std::string& path) {
+  for (int64_t child = 0; child < schema->n_children; child++) {
+    const std::string child_path = path + ".children[" + std::to_string(child) + "]";
+    NANOARROW_RETURN_NOT_OK(CompareColumn(schema->children[child],
+                                          actual->children[child],
+                                          expected->children[child], error,
+                                          child_path));
+  }
+
+  if (schema->dictionary != nullptr) {
+    NANOARROW_RETURN_NOT_OK(CompareColumn(schema->dictionary, actual->dictionary,
+                                          expected->dictionary, error,
+                                          path + ".dictionary"));
+  }
+
+  // The expected side is rendered first; the same stream is emptied and reused
+  // for the actual side.
+  std::stringstream rendered;
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      writer_expected_.WriteColumn(rendered, schema, expected), error);
+  const std::string expected_json = rendered.str();
+
+  rendered.str("");
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      writer_actual_.WriteColumn(rendered, schema, actual), error);
+  const std::string actual_json = rendered.str();
+
+  if (actual_json != expected_json) {
+    differences_.push_back({path, actual_json, expected_json});
+  }
+
+  return NANOARROW_OK;
+}
+
+// Decides whether two metadata blobs hold the same key/value pairs regardless
+// of the order in which the pairs are stored, writing the verdict to `*out`.
+// An error from MetadataToMap() for either side is returned unchanged, in which
+// case `*out` is left untouched.
+ArrowErrorCode TestingJSONComparison::MetadataEqualKeyValue(const char* actual,
+                                                            const char* expected,
+                                                            bool* out,
+                                                            ArrowError* error) {
+  std::unordered_map<std::string, std::string> actual_pairs;
+  NANOARROW_RETURN_NOT_OK(MetadataToMap(actual, &actual_pairs, error));
+
+  std::unordered_map<std::string, std::string> expected_pairs;
+  NANOARROW_RETURN_NOT_OK(MetadataToMap(expected, &expected_pairs, error));
+
+  // Map equality holds exactly when both maps have the same size and every key
+  // of one is present in the other with an equal value.
+  *out = actual_pairs == expected_pairs;
+  return NANOARROW_OK;
+}
} // namespace testing
} // namespace nanoarrow