This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new cfae94b4 feat: Add `ArrowArrayViewCompare()` to check for array 
equality (#578)
cfae94b4 is described below

commit cfae94b4aa45742cfe4611808d9711d4df6eb6a9
Author: Dewey Dunnington <[email protected]>
AuthorDate: Fri Aug 9 22:37:38 2024 -0300

    feat: Add `ArrowArrayViewCompare()` to check for array equality (#578)
    
    This PR is one possible component to address #577. While in some cases
    we want a more relaxed comparison that allows (for example) arrays with
    the same content to be considered equal even if they have different
    content in null slots, in some cases we really do want an exact match.
    This PR adds `ArrowArrayViewCompare()` in such a way that the same
    signature could be used to apply the equality check at a more relaxed
    validation level when this is implemented in a future PR, but only
    implements the "identical" level since this is the easiest/most pressing
    (applies to IPC validation).
    
    The messages given by the implementation give the location of the
    difference but not what the difference actually was. Knowing where the
    error was is usually sufficient for a higher level runtime (e.g., R,
    Python, C++) to give a fancier message if they want or need to.
---
 src/nanoarrow/common/array.c        | 131 ++++++++++++++++++++++++++
 src/nanoarrow/common/array_test.cc  | 183 ++++++++++++++++++++++++++++++++++++
 src/nanoarrow/common/inline_types.h |  11 +++
 src/nanoarrow/nanoarrow.h           |  14 +++
 4 files changed, 339 insertions(+)

diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c
index 1e774dd9..be99cab8 100644
--- a/src/nanoarrow/common/array.c
+++ b/src/nanoarrow/common/array.c
@@ -17,6 +17,7 @@
 
 #include <errno.h>
 #include <inttypes.h>
+#include <stdarg.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -1335,3 +1336,133 @@ ArrowErrorCode ArrowArrayViewValidate(struct 
ArrowArrayView* array_view,
   ArrowErrorSet(error, "validation_level not recognized");
   return EINVAL;
 }
+
+struct ArrowComparisonInternalState {  // State threaded through the recursive comparison helpers
+  enum ArrowCompareLevel level;  // Level requested by the caller of ArrowArrayViewCompare()
+  int is_equal;  // Initialized to 1; set to 0 as soon as a difference is found
+  struct ArrowError* reason;  // Receives the location/description of the difference; may be NULL
+};
+
+// Prepend a printf()-style formatted path component to the message in out.
+//
+// Does nothing if out is NULL or if the component formats to an empty string
+// (or fails to format). The component is limited to sizeof(prefix) - 1
+// characters, and the existing message is truncated from the end as needed so
+// that the result, including its null terminator, never exceeds the space
+// available in out.
+NANOARROW_CHECK_PRINTF_ATTRIBUTE static void ArrowComparePrependPath(
+    struct ArrowError* out, const char* fmt, ...) {
+  if (out == NULL) {
+    return;
+  }
+
+  char prefix[128];
+  prefix[0] = '\0';
+  va_list args;
+  va_start(args, fmt);
+  int prefix_len = vsnprintf(prefix, sizeof(prefix), fmt, args);
+  va_end(args);
+
+  if (prefix_len <= 0) {
+    return;
+  }
+
+  // vsnprintf() returns the length the output would have had without
+  // truncation, so clamp to what was actually written to prefix
+  size_t prefix_size = (size_t)prefix_len;
+  if (prefix_size > sizeof(prefix) - 1) {
+    prefix_size = sizeof(prefix) - 1;
+  }
+
+  // Number of characters that fit, excluding the null terminator
+  const size_t max_len = sizeof(struct ArrowError) - 1;
+  if (prefix_size > max_len) {
+    prefix_size = max_len;
+  }
+
+  // Keep as much of the existing message as fits after the prefix
+  size_t out_len = strlen(out->message);
+  size_t out_len_to_move = max_len - prefix_size;
+  if (out_len_to_move > out_len) {
+    out_len_to_move = out_len;
+  }
+
+  memmove(out->message + prefix_size, out->message, out_len_to_move);
+  memcpy(out->message, prefix, prefix_size);
+
+  // Terminate after what was actually kept (not after the original length,
+  // which may no longer fit)
+  out->message[prefix_size + out_len_to_move] = '\0';
+}
+
+// If cond_ is true, record reason_ (prefixed with ": " so that path
+// components can be prepended in front of it) in state_->reason, mark the
+// comparison as not equal, and return from the calling void function.
+// Arguments are parenthesized in the expansion so that expressions such as
+// &state can be passed safely.
+#define SET_NOT_EQUAL_AND_RETURN_IF_IMPL(cond_, state_, reason_) \
+  do {                                                           \
+    if (cond_) {                                                 \
+      ArrowErrorSet((state_)->reason, ": %s", (reason_));        \
+      (state_)->is_equal = 0;                                    \
+      return;                                                    \
+    }                                                            \
+  } while (0)
+
+// Same as above, using the source text of condition_ as the reason. The
+// text of the condition is therefore part of the message seen by callers.
+#define SET_NOT_EQUAL_AND_RETURN_IF(condition_, state_) \
+  SET_NOT_EQUAL_AND_RETURN_IF_IMPL(condition_, state_, #condition_)
+
+static void ArrowArrayViewCompareBuffer(const struct ArrowArrayView* actual,
+                                        const struct ArrowArrayView* expected, 
int i,
+                                        struct ArrowComparisonInternalState* 
state) {  // Compares buffer view i of both array views: size first, then raw bytes
+  SET_NOT_EQUAL_AND_RETURN_IF(
+      actual->buffer_views[i].size_bytes != 
expected->buffer_views[i].size_bytes, state);  // The condition text becomes the reason message, so rewording it changes the output
+
+  int64_t buffer_size = actual->buffer_views[i].size_bytes;  // Sizes are equal at this point, so either view's size can be used
+  if (buffer_size > 0) {  // memcmp() is only called for non-empty buffers
+    SET_NOT_EQUAL_AND_RETURN_IF(
+        memcmp(actual->buffer_views[i].data.data, 
expected->buffer_views[i].data.data,
+               buffer_size) != 0,
+        state);
+  }
+}
+
+static void ArrowArrayViewCompareIdentical(const struct ArrowArrayView* actual,
+                                           const struct ArrowArrayView* 
expected,
+                                           struct 
ArrowComparisonInternalState* state) {  // Recursively checks that two views are identical; returns at the first difference found
+  SET_NOT_EQUAL_AND_RETURN_IF(actual->storage_type != expected->storage_type, 
state);  // Structure first: storage type, child count, dictionary presence
+  SET_NOT_EQUAL_AND_RETURN_IF(actual->n_children != expected->n_children, 
state);
+  SET_NOT_EQUAL_AND_RETURN_IF(actual->dictionary == NULL && 
expected->dictionary != NULL,
+                              state);
+  SET_NOT_EQUAL_AND_RETURN_IF(actual->dictionary != NULL && 
expected->dictionary == NULL,
+                              state);
+
+  SET_NOT_EQUAL_AND_RETURN_IF(actual->length != expected->length, state);  // length, offset and null_count must match exactly
+  SET_NOT_EQUAL_AND_RETURN_IF(actual->offset != expected->offset, state);
+  SET_NOT_EQUAL_AND_RETURN_IF(actual->null_count != expected->null_count, 
state);
+
+  for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {  // Compare every fixed buffer view slot
+    ArrowArrayViewCompareBuffer(actual, expected, i, state);
+    if (!state->is_equal) {
+      ArrowComparePrependPath(state->reason, ".buffers[%d]", i);  // Prefix the reason with the buffer that differed
+      return;
+    }
+  }
+
+  for (int64_t i = 0; i < actual->n_children; i++) {  // n_children was checked to be equal above, so expected->children[i] exists
+    ArrowArrayViewCompareIdentical(actual->children[i], expected->children[i], 
state);
+    if (!state->is_equal) {
+      ArrowComparePrependPath(state->reason, ".children[%" PRId64 "]", i);  // Path components accumulate from the innermost difference outwards
+      return;
+    }
+  }
+
+  if (actual->dictionary != NULL) {  // expected->dictionary is also non-NULL here because presence was checked above
+    ArrowArrayViewCompareIdentical(actual->dictionary, expected->dictionary, 
state);
+    if (!state->is_equal) {
+      ArrowComparePrependPath(state->reason, ".dictionary");
+      return;
+    }
+  }
+}
+
+// Top-level entry point to take care of creating, cleaning up, and
+// propagating the ArrowComparisonInternalState to the caller.
+//
+// Places 1 (equal) or 0 (not equal) in *out and returns NANOARROW_OK when the
+// comparison completed. If the views differ, the message in reason is
+// prefixed with the path to the difference starting at "root" (e.g.,
+// "root.children[0]: <condition that failed>"). Returns EINVAL, sets a
+// message in reason, and leaves *out untouched if level is not recognized.
+ArrowErrorCode ArrowArrayViewCompare(const struct ArrowArrayView* actual,
+                                     const struct ArrowArrayView* expected,
+                                     enum ArrowCompareLevel level, int* out,
+                                     struct ArrowError* reason) {
+  struct ArrowComparisonInternalState state;
+  state.level = level;
+  state.is_equal = 1;
+  state.reason = reason;
+
+  switch (level) {
+    case NANOARROW_COMPARE_IDENTICAL:
+      ArrowArrayViewCompareIdentical(actual, expected, &state);
+      break;
+    default:
+      // Report the failure the same way ArrowArrayViewValidate() does for an
+      // unrecognized validation level
+      ArrowErrorSet(reason, "level not recognized");
+      return EINVAL;
+  }
+
+  *out = state.is_equal;
+  if (!state.is_equal) {
+    // Helpers only prepend their own path component; add the root here
+    ArrowComparePrependPath(state.reason, "root");
+  }
+
+  return NANOARROW_OK;
+}
+
+#undef SET_NOT_EQUAL_AND_RETURN_IF
+#undef SET_NOT_EQUAL_AND_RETURN_IF_IMPL
diff --git a/src/nanoarrow/common/array_test.cc 
b/src/nanoarrow/common/array_test.cc
index 030c5699..abb49a69 100644
--- a/src/nanoarrow/common/array_test.cc
+++ b/src/nanoarrow/common/array_test.cc
@@ -1816,6 +1816,189 @@ TEST(ArrayTest, ArrayViewTestBasic) {
   ArrowArrayViewReset(&array_view);
 }
 
+TEST(ArrayTest, ArrayViewCompareTestStructure) {  // Structural mismatches: storage type, number of children, dictionary presence
+  struct ArrowError error;
+  struct ArrowArrayView actual;
+  struct ArrowArrayView expected;
+  int is_equal = -1;  // Reset to -1 before each call so that a 0/1 result proves the call wrote it
+
+  ArrowArrayViewInitFromType(&actual, NANOARROW_TYPE_INT32);
+  ASSERT_EQ(ArrowArrayViewCompare(&actual, &actual, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 1);  // A view compared with itself is equal
+
+  // Check non-equal storage type (INT32 vs. STRING)
+  is_equal = -1;
+  ArrowArrayViewInitFromType(&expected, NANOARROW_TYPE_STRING);
+  ASSERT_EQ(ArrowArrayViewCompare(&actual, &expected, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message, "root: actual->storage_type != 
expected->storage_type");
+
+  // Check non-equal numbers of children (struct with 0 vs. 1 child)
+  is_equal = -1;
+  ArrowArrayViewReset(&actual);
+  ArrowArrayViewReset(&expected);
+  ArrowArrayViewInitFromType(&actual, NANOARROW_TYPE_STRUCT);
+  ArrowArrayViewInitFromType(&expected, NANOARROW_TYPE_STRUCT);
+  ASSERT_EQ(ArrowArrayViewAllocateChildren(&expected, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewCompare(&actual, &expected, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message, "root: actual->n_children != 
expected->n_children");
+
+  // Check difference in children (STRING vs. BINARY child); the message path names the child
+  is_equal = -1;
+  ASSERT_EQ(ArrowArrayViewAllocateChildren(&actual, 1), NANOARROW_OK);
+  ArrowArrayViewInitFromType(actual.children[0], NANOARROW_TYPE_STRING);
+  ArrowArrayViewInitFromType(expected.children[0], NANOARROW_TYPE_BINARY);
+  ASSERT_EQ(ArrowArrayViewCompare(&actual, &expected, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message,
+               "root.children[0]: actual->storage_type != 
expected->storage_type");
+
+  // Check presence/absence of dictionary (only expected has one)
+  is_equal = -1;
+  ArrowArrayViewReset(&actual);
+  ArrowArrayViewReset(&expected);
+  ArrowArrayViewInitFromType(&actual, NANOARROW_TYPE_INT32);
+  ArrowArrayViewInitFromType(&expected, NANOARROW_TYPE_INT32);
+  ASSERT_EQ(ArrowArrayViewAllocateDictionary(&expected), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewCompare(&actual, &expected, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message,
+               "root: actual->dictionary == NULL && expected->dictionary != 
NULL");
+
+  is_equal = -1;  // Same views with the arguments swapped to hit the opposite dictionary check
+  ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message,
+               "root: actual->dictionary != NULL && expected->dictionary == 
NULL");
+
+  // Check a difference in a dictionary (STRING vs. BINARY dictionary storage type)
+  is_equal = -1;
+  ASSERT_EQ(ArrowArrayViewAllocateDictionary(&actual), NANOARROW_OK);
+  ArrowArrayViewInitFromType(actual.dictionary, NANOARROW_TYPE_STRING);
+  ArrowArrayViewInitFromType(expected.dictionary, NANOARROW_TYPE_BINARY);
+  ASSERT_EQ(ArrowArrayViewCompare(&actual, &expected, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message,
+               "root.dictionary: actual->storage_type != 
expected->storage_type");
+
+  ArrowArrayViewReset(&actual);
+  ArrowArrayViewReset(&expected);
+}
+
+TEST(ArrayTest, ArrayViewCompareTestIdentical) {  // Content mismatches: length, offset, null count, buffer size/bytes, children, dictionary
+  struct ArrowError error;
+  struct ArrowArrayView actual;
+  struct ArrowArrayView expected;
+  int is_equal = -1;  // Reset to -1 before each call so that a 0/1 result proves the call wrote it
+
+  // Check non-equal length/offset/null count, one field at a time
+  ArrowArrayViewInitFromType(&actual, NANOARROW_TYPE_INT32);
+  ArrowArrayViewInitFromType(&expected, NANOARROW_TYPE_INT32);
+  expected.length = 1;
+  ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);  // NOTE: this test passes (&expected, &actual) throughout; the checked conditions are symmetric, so the messages are unaffected
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message, "root: actual->length != expected->length");
+
+  is_equal = -1;
+  expected.length = actual.length;  // Undo the previous difference before introducing the next one
+  expected.offset = 1;
+  ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message, "root: actual->offset != expected->offset");
+
+  is_equal = -1;
+  expected.offset = actual.offset;
+  expected.null_count = 1;
+  ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message, "root: actual->null_count != 
expected->null_count");
+
+  // Check non-equal buffer size (only expected's buffer 1 is given a size here)
+  is_equal = -1;
+  expected.null_count = actual.null_count;
+  expected.buffer_views[1].size_bytes = 5;
+  ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message,
+               "root.buffers[1]: actual->buffer_views[i].size_bytes != "
+               "expected->buffer_views[i].size_bytes");
+
+  is_equal = -1;  // Next: equal buffer sizes but different bytes, which reaches the memcmp() check
+  const char* actual_content = "abcde";
+  const char* expected_content = "bcdef";
+  actual.buffer_views[1].size_bytes = 5;
+  actual.buffer_views[1].data.as_char = actual_content;
+  expected.buffer_views[1].data.as_char = expected_content;
+
+  ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message,
+               "root.buffers[1]: memcmp(actual->buffer_views[i].data.data, "
+               "expected->buffer_views[i].data.data, buffer_size) != 0");
+
+  // Check difference in a child (child lengths differ); the message path names the child
+  is_equal = -1;
+  ArrowArrayViewReset(&actual);
+  ArrowArrayViewReset(&expected);
+  ArrowArrayViewInitFromType(&actual, NANOARROW_TYPE_STRUCT);
+  ArrowArrayViewInitFromType(&expected, NANOARROW_TYPE_STRUCT);
+  ASSERT_EQ(ArrowArrayViewAllocateChildren(&actual, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewAllocateChildren(&expected, 1), NANOARROW_OK);
+  ArrowArrayViewInitFromType(actual.children[0], NANOARROW_TYPE_INT32);
+  ArrowArrayViewInitFromType(expected.children[0], NANOARROW_TYPE_INT32);
+  actual.children[0]->length = 1;
+
+  ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message, "root.children[0]: actual->length != 
expected->length");
+
+  // Check difference in a dictionary (dictionary lengths differ)
+  is_equal = -1;
+  ArrowArrayViewReset(&actual);
+  ArrowArrayViewReset(&expected);
+  ArrowArrayViewInitFromType(&actual, NANOARROW_TYPE_INT32);
+  ArrowArrayViewInitFromType(&expected, NANOARROW_TYPE_INT32);
+  ASSERT_EQ(ArrowArrayViewAllocateDictionary(&actual), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewAllocateDictionary(&expected), NANOARROW_OK);
+  actual.dictionary->length = 1;
+
+  ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, 
NANOARROW_COMPARE_IDENTICAL,
+                                  &is_equal, &error),
+            NANOARROW_OK);
+  EXPECT_EQ(is_equal, 0);
+  EXPECT_STREQ(error.message, "root.dictionary: actual->length != 
expected->length");
+
+  ArrowArrayViewReset(&actual);
+  ArrowArrayViewReset(&expected);
+}
+
 TEST(ArrayTest, ArrayViewTestComputeNullCount) {
   struct ArrowError error;
 
diff --git a/src/nanoarrow/common/inline_types.h 
b/src/nanoarrow/common/inline_types.h
index 015cc541..ac513279 100644
--- a/src/nanoarrow/common/inline_types.h
+++ b/src/nanoarrow/common/inline_types.h
@@ -576,6 +576,17 @@ enum ArrowValidationLevel {
   NANOARROW_VALIDATION_LEVEL_FULL = 3
 };
 
+/// \brief Comparison level enumerator (passed as level to ArrowArrayViewCompare())
+/// \ingroup nanoarrow-utils
+enum ArrowCompareLevel {
+  /// \brief Consider arrays equal if buffers contain identical content
+  /// and have identical offset, null count, and length. Note that this is
+  /// a much stricter check than logical equality, which would take into
+  /// account potentially different content of null slots, arrays with a
+  /// non-zero offset, and other considerations.
+  NANOARROW_COMPARE_IDENTICAL,  // Currently the only level; any other value makes ArrowArrayViewCompare() return EINVAL
+};
+
 /// \brief Get a string value of an enum ArrowTimeUnit value
 /// \ingroup nanoarrow-utils
 ///
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index d19c8f9b..f65d053a 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -128,6 +128,7 @@
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal)
 #define ArrowArrayViewValidate \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate)
+#define ArrowArrayViewCompare NANOARROW_SYMBOL(NANOARROW_NAMESPACE, 
ArrowArrayViewCompare)
 #define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, 
ArrowArrayViewReset)
 #define ArrowBasicArrayStreamInit \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit)
@@ -1064,6 +1065,19 @@ ArrowErrorCode ArrowArrayViewValidate(struct 
ArrowArrayView* array_view,
                                       enum ArrowValidationLevel 
validation_level,
                                       struct ArrowError* error);
 
+/// \brief Compare two ArrowArrayView objects for equality
+///
+/// Given two ArrowArrayView instances, place either 0 (not equal) or
+/// 1 (equal) at the address pointed to by out. If the comparison determines
+/// that actual and expected are not equal, the location of the difference
+/// will be communicated via reason if reason is non-NULL.
+///
+/// Returns NANOARROW_OK if the comparison completed successfully or EINVAL if level is not recognized.
+ArrowErrorCode ArrowArrayViewCompare(const struct ArrowArrayView* actual,
+                                     const struct ArrowArrayView* expected,
+                                     enum ArrowCompareLevel level, int* out,
+                                     struct ArrowError* reason);
+
 /// \brief Reset the contents of an ArrowArrayView and frees resources
 void ArrowArrayViewReset(struct ArrowArrayView* array_view);
 

Reply via email to