This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 607be64e81 GH-46939: [C++] Add support for shared memory comparison in 
arrow::RecordBatch  (#47149)
607be64e81 is described below

commit 607be64e81a1788d1d7a679b957481f8734e3ee5
Author: Arash Andishgar <42874930+andish...@users.noreply.github.com>
AuthorDate: Tue Jul 22 15:12:16 2025 +0330

    GH-46939: [C++] Add support for shared memory comparison in 
arrow::RecordBatch  (#47149)
    
    
    ### Rationale for this change
    
    Create a fast path for comparing `arrow::RecordBatch` instances that share 
the same memory.
    
    ### What changes are included in this PR?
    
    Enable fast comparison for `arrow::RecordBatch` objects backed by the same 
memory.
    
    ### Are these changes tested?
    
    Yes, I ran the relevant unit tests.
    
    ### Are there any user-facing changes?
    
    No.
    
    * GitHub Issue: #46939
    
    Authored-by: Arash Andishgar <arashandishg...@gmail.com>
    Signed-off-by: Sutou Kouhei <k...@clear-code.com>
---
 cpp/src/arrow/record_batch.cc      | 69 ++++++++++++++++++++++++------
 cpp/src/arrow/record_batch_test.cc | 88 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 144 insertions(+), 13 deletions(-)

diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index 04d6890d39..941fd3f002 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -310,19 +310,58 @@ const std::string& RecordBatch::column_name(int i) const {
   return schema_->field(i)->name();
 }
 
-bool RecordBatch::Equals(const RecordBatch& other, bool check_metadata,
-                         const EqualOptions& opts) const {
-  if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) {
-    return false;
+namespace {
+
+bool ContainFloatType(const std::shared_ptr<DataType>& type) {
+  if (is_floating(type->id())) {
+    return true;
   }
 
-  if (!schema_->Equals(*other.schema(), check_metadata)) {
-    return false;
+  for (const auto& field : type->fields()) {
+    if (ContainFloatType(field->type())) {
+      return true;
+    }
   }
 
-  if (device_type() != other.device_type()) {
+  return false;
+}
+
+bool ContainFloatType(const Schema& schema) {
+  for (auto& field : schema.fields()) {
+    if (ContainFloatType(field->type())) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool CanIgnoreNaNInEquality(const RecordBatch& batch, const EqualOptions& 
opts) {
+  if (opts.nans_equal()) {
+    return true;
+  } else if (!ContainFloatType(*batch.schema())) {
+    return true;
+  } else {
     return false;
   }
+}
+
+}  // namespace
+
+bool RecordBatch::Equals(const RecordBatch& other, bool check_metadata,
+                         const EqualOptions& opts) const {
+  if (this == &other) {
+    if (CanIgnoreNaNInEquality(*this, opts)) {
+      return true;
+    }
+  } else {
+    if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) 
{
+      return false;
+    } else if (!schema_->Equals(*other.schema(), check_metadata)) {
+      return false;
+    } else if (device_type() != other.device_type()) {
+      return false;
+    }
+  }
 
   for (int i = 0; i < num_columns(); ++i) {
     if (!column(i)->Equals(other.column(i), opts)) {
@@ -334,12 +373,16 @@ bool RecordBatch::Equals(const RecordBatch& other, bool 
check_metadata,
 }
 
 bool RecordBatch::ApproxEquals(const RecordBatch& other, const EqualOptions& 
opts) const {
-  if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) {
-    return false;
-  }
-
-  if (device_type() != other.device_type()) {
-    return false;
+  if (this == &other) {
+    if (CanIgnoreNaNInEquality(*this, opts)) {
+      return true;
+    }
+  } else {
+    if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) 
{
+      return false;
+    } else if (device_type() != other.device_type()) {
+      return false;
+    }
   }
 
   for (int i = 0; i < num_columns(); ++i) {
diff --git a/cpp/src/arrow/record_batch_test.cc 
b/cpp/src/arrow/record_batch_test.cc
index fab8137171..156d083828 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -142,6 +142,94 @@ TEST_F(TestRecordBatch, ApproxEqualOptions) {
   EXPECT_TRUE(b1->ApproxEquals(*b2, options));
 }
 
+class TestRecordBatchEqualsSameAddress : public TestRecordBatch {};
+
+TEST_F(TestRecordBatchEqualsSameAddress, NonFloatType) {
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", int64());
+
+  auto schema = ::arrow::schema({f0, f1});
+
+  auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]");
+  auto a1 = ArrayFromJSON(f1->type(), "[0, 1, 2]");
+
+  auto b0 = RecordBatch::Make(schema, 3, {a0, a1});
+  auto b1 = b0;
+
+  auto options = EqualOptions::Defaults();
+
+  ASSERT_TRUE(b0->Equals(*b1, true, options));
+  ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true)));
+
+  ASSERT_TRUE(b0->ApproxEquals(*b1, options));
+  ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true)));
+}
+
+TEST_F(TestRecordBatchEqualsSameAddress, NestedTypesWithoutFloatType) {
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", struct_({{"f2", int64()}, {"f3", int8()}}));
+
+  auto schema = ::arrow::schema({f0, f1});
+
+  auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]");
+  auto a1 = ArrayFromJSON(
+      f1->type(), R"([{"f2": 1, "f3": 4}, {"f2": 2, "f3": 5}, {"f2":3, "f3": 
6}])");
+
+  auto b0 = RecordBatch::Make(schema, 3, {a0, a1});
+  auto b1 = b0;
+
+  auto options = EqualOptions::Defaults();
+
+  ASSERT_TRUE(b0->Equals(*b1, true, options));
+  ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true)));
+
+  ASSERT_TRUE(b0->ApproxEquals(*b1, options));
+  ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true)));
+}
+
+TEST_F(TestRecordBatchEqualsSameAddress, FloatType) {
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", float64());
+
+  auto schema = ::arrow::schema({f0, f1});
+  // Each column holds exactly 3 values to match the row count given to Make().
+  auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]");
+  auto a1 = ArrayFromJSON(f1->type(), "[0.0, 1.0, NaN]");
+
+  auto b0 = RecordBatch::Make(schema, 3, {a0, a1});
+  auto b1 = b0;
+
+  auto options = EqualOptions::Defaults();
+  // b1 refers to the same batch as b0; the NaN in f1 must defeat the fast path.
+  ASSERT_FALSE(b0->Equals(*b1, true, options));
+  ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true)));
+
+  ASSERT_FALSE(b0->ApproxEquals(*b1, options));
+  ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true)));
+}
+
+TEST_F(TestRecordBatchEqualsSameAddress, NestedTypesWithFloatType) {
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", struct_({{"f2", int64()}, {"f3", float32()}}));
+
+  auto schema = ::arrow::schema({f0, f1});
+
+  auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]");
+  auto a1 = ArrayFromJSON(
+      f1->type(), R"([{"f2": 1, "f3": 4.0}, {"f2": 2, "f3": 4.0}, {"f2":3, 
"f3": NaN}])");
+
+  auto b0 = RecordBatch::Make(schema, 3, {a0, a1});
+  auto b1 = b0;
+
+  auto options = EqualOptions::Defaults();
+
+  ASSERT_FALSE(b0->Equals(*b1, true, options));
+  ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true)));
+
+  ASSERT_FALSE(b0->ApproxEquals(*b1, options));
+  ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true)));
+}
+
 TEST_F(TestRecordBatch, Validate) {
   const int length = 10;
 

Reply via email to