pitrou commented on code in PR #330:
URL: https://github.com/apache/arrow-nanoarrow/pull/330#discussion_r1417072517
##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -1661,6 +1714,305 @@ class TestingJSONReader {
}
};
+/// \brief Integration testing comparison utility
+///
+/// Utility to compare ArrowSchema, ArrowArray, and ArrowArrayStream instances.
+/// This should only be used in the context of integration testing as the
+/// comparison logic is specific to the integration testing JSON files and
+/// specification. Notably:
+///
+/// - Map types are considered equal regardless of the child names "entries",
+/// "key", and "value".
+/// - Float32 and Float64 values are only compared to 3 decimal places.
+class TestingJSONComparison {
+ private:
+ // One recorded mismatch: where it was found plus a text rendering of each side
+ struct Difference {
+ std::string path;  // location label, e.g. "Schema" or "Batch 0.children[1]"
+ std::string actual;  // text describing the actual side
+ std::string expected;  // text describing the expected side
+ };
+
+ public:
+ /// \brief Returns the number of differences currently recorded (comparisons append; ClearDifferences() empties)
+ size_t num_differences() const { return differences_.size(); }
+
+ /// \brief Write every recorded difference to out in a diff-like layout
+ ///
+ /// Each entry is emitted as a "Path: " line, the actual text prefixed with
+ /// "- ", the expected text prefixed with "+ ", and a trailing blank line.
+ void WriteDifferences(std::ostream& out) {
+   for (size_t i = 0; i < differences_.size(); i++) {
+     const Difference& item = differences_[i];
+     out << "Path: " << item.path << "\n"
+         << "- " << item.actual << "\n"
+         << "+ " << item.expected << "\n"
+         << "\n";
+   }
+ }
+
+ /// \brief Discard all recorded differences so that num_differences() returns 0
+ void ClearDifferences() { differences_.clear(); }
+
+ /// \brief Compare a stream of record batches
+ ///
+ /// Compares actual against expected using the following strategy:
+ ///
+ /// - Compares schemas for equality, returning if that comparison recorded
+ ///   any difference.
+ /// - Compares pairs of record batches, returning if one stream finished
+ ///   before the other.
+ ///
+ /// Differences that were already recorded before this call (i.e., not yet
+ /// removed with ClearDifferences()) do not prevent the batches from being
+ /// compared.
+ ///
+ /// \param actual The stream under test; its schema and batches are read here
+ /// \param expected The reference stream; its schema and batches are read here
+ /// \param error Optional error object passed on to the calls made here
+ ///
+ /// Returns NANOARROW_OK if the comparison ran without error. Callers must
+ /// query num_differences() to obtain the result of the comparison on success.
+ ArrowErrorCode CompareArrayStream(ArrowArrayStream* actual, ArrowArrayStream* expected,
+                                   ArrowError* error = nullptr) {
+   // Read both schemas
+   nanoarrow::UniqueSchema actual_schema;
+   nanoarrow::UniqueSchema expected_schema;
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(actual->get_schema(actual, actual_schema.get()),
+                                      error);
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+       expected->get_schema(expected, expected_schema.get()), error);
+
+   // Only differences recorded by the schema comparison below should end the
+   // comparison early, so remember how many were recorded beforehand.
+   const size_t n_differences_before = num_differences();
+
+   // CompareSchema() takes (actual, expected) in that order; passing them the
+   // other way around would report each side under the wrong label.
+   NANOARROW_RETURN_NOT_OK(
+       CompareSchema(actual_schema.get(), expected_schema.get(), error, "Schema"));
+   if (num_differences() > n_differences_before) {
+     return NANOARROW_OK;
+   }
+
+   // Keep a record of the schema to compare batches
+   NANOARROW_RETURN_NOT_OK(SetSchema(expected_schema.get(), error));
+
+   nanoarrow::UniqueArray actual_array;
+   nanoarrow::UniqueArray expected_array;
+   for (int64_t n_batches = 0;; n_batches++) {
+     const std::string batch_label = std::string("Batch ") + std::to_string(n_batches);
+
+     // Read a batch from each stream
+     actual_array.reset();
+     expected_array.reset();
+     NANOARROW_RETURN_NOT_OK_WITH_ERROR(actual->get_next(actual, actual_array.get()),
+                                        error);
+     NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+         expected->get_next(expected, expected_array.get()), error);
+
+     // An array whose release callback is null means that stream has finished
+     const bool actual_finished = actual_array->release == nullptr;
+     const bool expected_finished = expected_array->release == nullptr;
+
+     // One stream ending before the other is itself a difference
+     if (actual_finished != expected_finished) {
+       if (actual_finished) {
+         differences_.push_back({batch_label, "finished stream", "unfinished stream"});
+       } else {
+         differences_.push_back({batch_label, "unfinished stream", "finished stream"});
+       }
+       return NANOARROW_OK;
+     }
+
+     // If both streams are done, stop
+     if (actual_finished) {
+       break;
+     }
+
+     // Compare this batch
+     NANOARROW_RETURN_NOT_OK(
+         CompareBatch(actual_array.get(), expected_array.get(), error, batch_label));
+   }
+
+   return NANOARROW_OK;
+ }
+
+ /// \brief Compare a top-level ArrowSchema struct
+ ///
+ /// Both schemas must be of struct type, otherwise EINVAL is returned. The
+ /// top-level name is not compared; flags, children, and metadata are, and
+ /// each mismatch is recorded under path.
+ ///
+ /// Returns NANOARROW_OK if the comparison ran without error. Callers must
+ /// query num_differences() to obtain the result of the comparison on success.
+ ArrowErrorCode CompareSchema(const ArrowSchema* actual, const ArrowSchema* expected,
+                              ArrowError* error = nullptr,
+                              const std::string& path = "") {
+   // The top level is compared piece by piece because (1) map type needs
+   // special-cased comparison and (2) the output is easier to read when
+   // differences are separated by field.
+   ArrowSchemaView actual_view;
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaViewInit(&actual_view, actual, nullptr),
+                                      error);
+
+   ArrowSchemaView expected_view;
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+       ArrowSchemaViewInit(&expected_view, expected, nullptr), error);
+
+   const bool both_are_structs = actual_view.type == NANOARROW_TYPE_STRUCT &&
+                                 expected_view.type == NANOARROW_TYPE_STRUCT;
+   if (!both_are_structs) {
+     ArrowErrorSet(error, "Top-level schema must be struct");
+     return EINVAL;
+   }
+
+   // (The name field is deliberately not compared at the top level)
+
+   // Flags
+   if (actual->flags != expected->flags) {
+     const std::string flags_label(".flags: ");
+     differences_.push_back({path, flags_label + std::to_string(actual->flags),
+                             flags_label + std::to_string(expected->flags)});
+   }
+
+   // Children: fields are compared pairwise only when the counts agree;
+   // otherwise the count mismatch is the single reported difference.
+   if (actual->n_children == expected->n_children) {
+     for (int64_t i = 0; i < expected->n_children; i++) {
+       std::string child_path = path;
+       child_path += ".children[";
+       child_path += std::to_string(i);
+       child_path += "]";
+       NANOARROW_RETURN_NOT_OK(
+           CompareField(actual->children[i], expected->children[i], error, child_path));
+     }
+   } else {
+     const std::string count_label(".n_children: ");
+     differences_.push_back({path, count_label + std::to_string(actual->n_children),
+                             count_label + std::to_string(expected->n_children)});
+   }
+
+   // Metadata: both sides are rendered to text and the renderings compared
+   std::stringstream ss;
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteMetadata(ss, actual->metadata),
+                                      error);
+   const std::string actual_text = ss.str();
+
+   // Empty the buffer before reusing the stream for the expected side
+   ss.str("");
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteMetadata(ss, expected->metadata),
+                                      error);
+   const std::string expected_text = ss.str();
+
+   if (actual_text != expected_text) {
+     differences_.push_back(
+         {path, ".metadata: " + actual_text, ".metadata: " + expected_text});
+   }
+
+   return NANOARROW_OK;
+ }
+
+ /// \brief Set the ArrowSchema to be used for future calls to CompareBatch().
+ ///
+ /// Stores a deep copy of schema and re-initializes the actual and expected
+ /// array views from that copy. Returns EINVAL if the resulting storage type
+ /// is not struct.
+ ArrowErrorCode SetSchema(const ArrowSchema* schema, ArrowError* error = nullptr) {
+   // Replace any previously stored schema with a copy of the new one
+   schema_.reset();
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(schema, schema_.get()), error);
+
+   // Rebuild both views against the stored copy
+   actual_.reset();
+   expected_.reset();
+   NANOARROW_RETURN_NOT_OK(
+       ArrowArrayViewInitFromSchema(actual_.get(), schema_.get(), error));
+   NANOARROW_RETURN_NOT_OK(
+       ArrowArrayViewInitFromSchema(expected_.get(), schema_.get(), error));
+
+   const bool is_struct = actual_->storage_type == NANOARROW_TYPE_STRUCT;
+   if (is_struct) {
+     return NANOARROW_OK;
+   }
+
+   ArrowErrorSet(error, "Can't SetSchema() with non-struct");
+   return EINVAL;
+ }
+
+ /// \brief Compare a top-level ArrowArray struct
+ ///
+ /// Binds both arrays to the array views and schema prepared by SetSchema(),
+ /// then compares offset, length, and each child column, recording every
+ /// mismatch under path.
+ ///
+ /// Returns NANOARROW_OK if the comparison ran without error. Callers must
+ /// query num_differences() to obtain the result of the comparison on success.
+ ArrowErrorCode CompareBatch(const ArrowArray* actual, const ArrowArray* expected,
+                             ArrowError* error = nullptr, const std::string& path = "") {
+   NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(expected_.get(), expected, error));
+   NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(actual_.get(), actual, error));
+
+   if (actual->offset != expected->offset) {
+     const std::string offset_label(".offset: ");
+     differences_.push_back({path, offset_label + std::to_string(actual->offset),
+                             offset_label + std::to_string(expected->offset)});
+   }
+
+   if (actual->length != expected->length) {
+     const std::string length_label(".length: ");
+     differences_.push_back({path, length_label + std::to_string(actual->length),
+                             length_label + std::to_string(expected->length)});
+   }
+
+   // ArrowArrayViewSetArray() ensured that number of children of both match schema
+   for (int64_t col = 0; col < expected_->n_children; col++) {
+     std::string column_path = path;
+     column_path += ".children[";
+     column_path += std::to_string(col);
+     column_path += "]";
+     NANOARROW_RETURN_NOT_OK(CompareColumn(schema_->children[col], actual_->children[col],
+                                           expected_->children[col], error, column_path));
+   }
+
+   return NANOARROW_OK;
+ }
+
+ private:
+ TestingJSONWriter writer_;  // renders metadata, fields, and columns to text for comparison
+ std::vector<Difference> differences_;  // mismatches recorded so far
+ nanoarrow::UniqueSchema schema_;  // deep copy of the schema given to SetSchema()
+ nanoarrow::UniqueArrayView actual_;  // initialized from schema_; bound to the actual batch in CompareBatch()
+ nanoarrow::UniqueArrayView expected_;  // initialized from schema_; bound to the expected batch in CompareBatch()
+
+ // Compare one child field of the top-level schema.
+ //
+ // Both sides are deep-copied and ForceMapNamesCanonical() is applied to the
+ // copies, so that map child names do not affect the result; the copies are
+ // then compared by CompareFieldBase(), which records any mismatch under path.
+ ArrowErrorCode CompareField(ArrowSchema* actual, ArrowSchema* expected,
+                             ArrowError* error, const std::string& path = "") {
+   // Work on copies so the names can be rewritten
+   nanoarrow::UniqueSchema actual_copy;
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(actual, actual_copy.get()),
+                                      error);
+   nanoarrow::UniqueSchema expected_copy;
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(expected, expected_copy.get()),
+                                      error);
+
+   // Give map types canonical child names on both copies
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(ForceMapNamesCanonical(actual_copy.get()), error);
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(ForceMapNamesCanonical(expected_copy.get()),
+                                      error);
+
+   return CompareFieldBase(actual_copy.get(), expected_copy.get(), error, path);
+ }
+
+ // Render both fields with writer_.WriteField() and record a difference under
+ // path when the two renderings are not the same string. Returns NANOARROW_OK
+ // unless WriteField() reports an error.
+ ArrowErrorCode CompareFieldBase(ArrowSchema* actual, ArrowSchema* expected,
+                                 ArrowError* error, const std::string& path = "") {
+   std::stringstream ss;
+
+   // The expected side is rendered first
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteField(ss, expected), error);
+   const std::string rendered_expected = ss.str();
+
+   // Empty the buffer before reusing the stream for the actual side
+   ss.str("");
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteField(ss, actual), error);
+   const std::string rendered_actual = ss.str();
+
+   if (rendered_actual == rendered_expected) {
+     return NANOARROW_OK;
+   }
+
+   differences_.push_back({path, rendered_actual, rendered_expected});
+   return NANOARROW_OK;
+ }
+
+ // Render both array views with writer_.WriteColumn(), using schema for each,
+ // and record a difference under path when the two renderings are not the
+ // same string. Returns NANOARROW_OK unless WriteColumn() reports an error.
+ ArrowErrorCode CompareColumn(ArrowSchema* schema, ArrowArrayView* actual,
+                              ArrowArrayView* expected, ArrowError* error,
+                              const std::string& path = "") {
+   std::stringstream ss;
+
+   // The expected side is rendered first
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteColumn(ss, schema, expected), error);
+   const std::string rendered_expected = ss.str();
+
+   // Empty the buffer before reusing the stream for the actual side
+   ss.str("");
+   NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteColumn(ss, schema, actual), error);
+   const std::string rendered_actual = ss.str();
+
+   if (rendered_actual == rendered_expected) {
+     return NANOARROW_OK;
+   }
+
+   differences_.push_back({path, rendered_actual, rendered_expected});
+   return NANOARROW_OK;
+ }
+
+ ArrowErrorCode ForceMapNamesCanonical(ArrowSchema* schema) {
+ ArrowSchemaView view;
+ NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
+
+ if (view.type == NANOARROW_TYPE_MAP) {
+ NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0],
"entries"));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowSchemaSetName(schema->children[0]->children[0], "key"));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowSchemaSetName(schema->children[0]->children[1], "value"));
+ }
+
+ for (int64_t i = 0; i < schema->n_children; i++) {
+ NANOARROW_RETURN_NOT_OK(ForceMapNamesCanonical(schema->children[i]));
+ }
Review Comment:
Recurse into dictionary as well?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]