This is an automated email from the ASF dual-hosted git repository. kou pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push: new 607be64e81 GH-46939: [C++] Add support for shared memory comparison in arrow::RecordBatch (#47149) 607be64e81 is described below commit 607be64e81a1788d1d7a679b957481f8734e3ee5 Author: Arash Andishgar <42874930+andish...@users.noreply.github.com> AuthorDate: Tue Jul 22 15:12:16 2025 +0330 GH-46939: [C++] Add support for shared memory comparison in arrow::RecordBatch (#47149) ### Rationale for this change Create a fast path for comparing `arrow::RecordBatch` instances that share the same memory. ### What changes are included in this PR? Enable fast comparison for `arrow::RecordBatch` objects backed by the same memory. ### Are these changes tested? Yes, I ran the relevant unit tests. ### Are there any user-facing changes? No. * GitHub Issue: #46939 Authored-by: Arash Andishgar <arashandishg...@gmail.com> Signed-off-by: Sutou Kouhei <k...@clear-code.com> --- cpp/src/arrow/record_batch.cc | 69 ++++++++++++++++++++++++------ cpp/src/arrow/record_batch_test.cc | 88 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 04d6890d39..941fd3f002 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -310,19 +310,58 @@ const std::string& RecordBatch::column_name(int i) const { return schema_->field(i)->name(); } -bool RecordBatch::Equals(const RecordBatch& other, bool check_metadata, - const EqualOptions& opts) const { - if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { - return false; +namespace { + +bool ContainFloatType(const std::shared_ptr<DataType>& type) { + if (is_floating(type->id())) { + return true; } - if (!schema_->Equals(*other.schema(), check_metadata)) { - return false; + for (const auto& field : type->fields()) { + if (ContainFloatType(field->type())) { + return true; + } } - if (device_type() != other.device_type()) { + 
return false; +} + +bool ContainFloatType(const Schema& schema) { + for (auto& field : schema.fields()) { + if (ContainFloatType(field->type())) { + return true; + } + } + return false; +} + +bool CanIgnoreNaNInEquality(const RecordBatch& batch, const EqualOptions& opts) { + if (opts.nans_equal()) { + return true; + } else if (!ContainFloatType(*batch.schema())) { + return true; + } else { return false; } +} + +} // namespace + +bool RecordBatch::Equals(const RecordBatch& other, bool check_metadata, + const EqualOptions& opts) const { + if (this == &other) { + if (CanIgnoreNaNInEquality(*this, opts)) { + return true; + } + } else { + if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { + return false; + } else if (!schema_->Equals(*other.schema(), check_metadata)) { + return false; + } else if (device_type() != other.device_type()) { + return false; + } + } for (int i = 0; i < num_columns(); ++i) { if (!column(i)->Equals(other.column(i), opts)) { @@ -334,12 +373,16 @@ bool RecordBatch::Equals(const RecordBatch& other, bool check_metadata, } bool RecordBatch::ApproxEquals(const RecordBatch& other, const EqualOptions& opts) const { - if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { - return false; - } - - if (device_type() != other.device_type()) { - return false; + if (this == &other) { + if (CanIgnoreNaNInEquality(*this, opts)) { + return true; + } + } else { + if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { + return false; + } else if (device_type() != other.device_type()) { + return false; + } } for (int i = 0; i < num_columns(); ++i) { diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index fab8137171..156d083828 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -142,6 +142,94 @@ TEST_F(TestRecordBatch, ApproxEqualOptions) { EXPECT_TRUE(b1->ApproxEquals(*b2, options)); } +class 
TestRecordBatchEqualsSameAddress : public TestRecordBatch {}; + +TEST_F(TestRecordBatchEqualsSameAddress, NonFloatType) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", int64()); + + auto schema = ::arrow::schema({f0, f1}); + + auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]"); + auto a1 = ArrayFromJSON(f1->type(), "[0, 1, 2]"); + + auto b0 = RecordBatch::Make(schema, 3, {a0, a1}); + auto b1 = b0; + + auto options = EqualOptions::Defaults(); + + ASSERT_TRUE(b0->Equals(*b1, true, options)); + ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true))); + + ASSERT_TRUE(b0->ApproxEquals(*b1, options)); + ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true))); +} + +TEST_F(TestRecordBatchEqualsSameAddress, NestedTypesWithoutFloatType) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", struct_({{"f2", int64()}, {"f3", int8()}})); + + auto schema = ::arrow::schema({f0, f1}); + + auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]"); + auto a1 = ArrayFromJSON( + f1->type(), R"([{"f2": 1, "f3": 4}, {"f2": 2, "f3": 5}, {"f2":3, "f3": 6}])"); + + auto b0 = RecordBatch::Make(schema, 3, {a0, a1}); + auto b1 = b0; + + auto options = EqualOptions::Defaults(); + + ASSERT_TRUE(b0->Equals(*b1, true, options)); + ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true))); + + ASSERT_TRUE(b0->ApproxEquals(*b1, options)); + ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true))); +} + +TEST_F(TestRecordBatchEqualsSameAddress, FloatType) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", float64()); + + auto schema = ::arrow::schema({f0, f1}); + + auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]"); + auto a1 = ArrayFromJSON(f1->type(), "[0.0, 1.0, 2.0, NaN]"); + + auto b0 = RecordBatch::Make(schema, 3, {a0, a1}); + auto b1 = b0; + + auto options = EqualOptions::Defaults(); + + ASSERT_FALSE(b0->Equals(*b1, true, options)); + ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true))); + + ASSERT_FALSE(b0->ApproxEquals(*b1, options)); + 
ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true))); +} + +TEST_F(TestRecordBatchEqualsSameAddress, NestedTypesWithFloatType) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", struct_({{"f2", int64()}, {"f3", float32()}})); + + auto schema = ::arrow::schema({f0, f1}); + + auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]"); + auto a1 = ArrayFromJSON( + f1->type(), R"([{"f2": 1, "f3": 4.0}, {"f2": 2, "f3": 4.0}, {"f2":3, "f3": NaN}])"); + + auto b0 = RecordBatch::Make(schema, 3, {a0, a1}); + auto b1 = b0; + + auto options = EqualOptions::Defaults(); + + ASSERT_FALSE(b0->Equals(*b1, true, options)); + ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true))); + + ASSERT_FALSE(b0->ApproxEquals(*b1, options)); + ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true))); +} + TEST_F(TestRecordBatch, Validate) { const int length = 10;