This is an automated email from the ASF dual-hosted git repository. kou pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push: new 084a36540d GH-46835: [C++] Add more configuration options to arrow::EqualOptions (#47204) 084a36540d is described below commit 084a36540d599235c3352c7b5418c469db3eb2be Author: Arash Andishgar <42874930+andish...@users.noreply.github.com> AuthorDate: Tue Sep 2 12:03:10 2025 +0330 GH-46835: [C++] Add more configuration options to arrow::EqualOptions (#47204) ### Rationale for this change Parameterizing the comparison of `metadata` and `arrow::Schema` via `arrow::EqualOptions`. ### What changes are included in this PR? * Added two attributes to `arrow::EqualOptions` for parameterizing the comparison of `metadata` and `schema`. ### Are these changes tested? Yes, I ran the relevant unit tests. ### Are there any user-facing changes? * Added `use_schema()` to `arrow::EqualOptions`. * Added `use_metadata()` to `arrow::EqualOptions`. * Added a new overload for `arrow::RecordBatch::Equals` that accepts only another `arrow::RecordBatch` and an `arrow::EqualOptions` instance. * GitHub Issue: #46835 Lead-authored-by: Arash Andishgar <arashandishg...@gmail.com> Co-authored-by: Sutou Kouhei <k...@cozmixng.org> Signed-off-by: Sutou Kouhei <k...@clear-code.com> --- cpp/src/arrow/compare.h | 34 +++++++++ cpp/src/arrow/record_batch.cc | 30 ++------ cpp/src/arrow/record_batch.h | 16 ++++- cpp/src/arrow/record_batch_test.cc | 139 ++++++++++++++++++++++++++++--------- docs/source/cpp/api/utilities.rst | 6 ++ 5 files changed, 165 insertions(+), 60 deletions(-) diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 4d2282c982..2198495d7d 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -83,6 +83,38 @@ class EqualOptions { return res; } + /// Whether the \ref arrow::Schema property is used in the comparison. + /// + /// This option only affects the Equals methods + /// and has no effect on ApproxEquals methods. + bool use_schema() const { return use_schema_; } + + /// Return a new EqualOptions object with the "use_schema_" property changed. + /// + /// Setting this option is false making the value of \ref EqualOptions::use_metadata + /// is ignored. + EqualOptions use_schema(bool v) const { + auto res = EqualOptions(*this); + res.use_schema_ = v; + return res; + } + + /// Whether the "metadata" in \ref arrow::Schema is used in the comparison. + /// + /// This option only affects the Equals methods + /// and has no effect on the ApproxEquals methods. + /// + /// Note: This option is only considered when \ref arrow::EqualOptions::use_schema is + /// set to true. + bool use_metadata() const { return use_metadata_; } + + /// Return a new EqualOptions object with the "use_metadata" property changed. + EqualOptions use_metadata(bool v) const { + auto res = EqualOptions(*this); + res.use_metadata_ = v; + return res; + } + /// The ostream to which a diff will be formatted if arrays disagree. /// If this is null (the default) no diff will be formatted. std::ostream* diff_sink() const { return diff_sink_; } @@ -103,6 +135,8 @@ class EqualOptions { bool nans_equal_ = false; bool signed_zeros_equal_ = true; bool use_atol_ = false; + bool use_schema_ = true; + bool use_metadata_ = false; std::ostream* diff_sink_ = NULLPTR; }; diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index bc952d3fe2..2429113333 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -35,6 +35,7 @@ #include "arrow/array/statistics.h" #include "arrow/array/validate.h" #include "arrow/c/abi.h" +#include "arrow/compare.h" #include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/table.h" @@ -349,30 +350,10 @@ bool CanIgnoreNaNInEquality(const RecordBatch& batch, const EqualOptions& opts) bool RecordBatch::Equals(const RecordBatch& other, bool check_metadata, const EqualOptions& opts) const { - if (this == &other) { - if (CanIgnoreNaNInEquality(*this, opts)) { - return true; - } - } else { - if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { - return false; - } else if (!schema_->Equals(*other.schema(), check_metadata)) { - return false; - } else if (device_type() != other.device_type()) { - return false; - } - } - - for (int i = 0; i < num_columns(); ++i) { - if (!column(i)->Equals(other.column(i), opts)) { - return false; - } - } - - return true; + return Equals(other, opts.use_metadata(check_metadata)); } -bool RecordBatch::ApproxEquals(const RecordBatch& other, const EqualOptions& opts) const { +bool RecordBatch::Equals(const RecordBatch& other, const EqualOptions& opts) const { if (this == &other) { if (CanIgnoreNaNInEquality(*this, opts)) { return true; @@ -380,13 +361,16 @@ bool RecordBatch::ApproxEquals(const RecordBatch& other, const EqualOptions& opt } else { if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { return false; + } else if (opts.use_schema() && + !schema_->Equals(*other.schema(), opts.use_metadata())) { + return false; } else if (device_type() != other.device_type()) { return false; } } for (int i = 0; i < num_columns(); ++i) { - if (!column(i)->ApproxEquals(other.column(i), opts)) { + if (!column(i)->Equals(other.column(i), opts)) { return false; } } diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 43a8ee63b5..7a283d5368 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -118,22 +118,32 @@ class ARROW_EXPORT RecordBatch { static Result<std::shared_ptr<RecordBatch>> FromStructArray( const std::shared_ptr<Array>& array, MemoryPool* pool = default_memory_pool()); - /// \brief Determine if two record batches are exactly equal + /// \brief Determine if two record batches are equal /// /// \param[in] other the RecordBatch to compare with - /// \param[in] check_metadata if true, check that Schema metadata is the same + /// \param[in] check_metadata if true, the schema metadata will be compared, + /// regardless of the value set in \ref EqualOptions::use_metadata_ /// \param[in] opts the options for equality comparisons /// \return true if batches are equal bool Equals(const RecordBatch& other, bool check_metadata = false, const EqualOptions& opts = EqualOptions::Defaults()) const; + /// \brief Determine if two record batches are equal + /// + /// \param[in] other the RecordBatch to compare with + /// \param[in] opts the options for equality comparisons + /// \return true if batches are equal + bool Equals(const RecordBatch& other, const EqualOptions& opts) const; + /// \brief Determine if two record batches are approximately equal /// /// \param[in] other the RecordBatch to compare with /// \param[in] opts the options for equality comparisons /// \return true if batches are approximately equal bool ApproxEquals(const RecordBatch& other, - const EqualOptions& opts = EqualOptions::Defaults()) const; + const EqualOptions& opts = EqualOptions::Defaults()) const { + return Equals(other, opts.use_schema(false).use_atol(true)); + } /// \return the record batch's schema const std::shared_ptr<Schema>& schema() const { return schema_; } diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 18b21e2721..4fec2ee686 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -40,6 +40,7 @@ #include "arrow/array/util.h" #include "arrow/c/abi.h" #include "arrow/chunked_array.h" +#include "arrow/compare.h" #include "arrow/config.h" #include "arrow/status.h" #include "arrow/table.h" @@ -64,6 +65,50 @@ class TestRecordBatch : public ::testing::Test {}; TEST_F(TestRecordBatch, Equals) { const int length = 10; + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8()); + auto f2 = field("f2", int16()); + + auto schema = ::arrow::schema({f0, f1, f2}); + auto schema_same = ::arrow::schema({f0, f1, f2}); + auto schema_fewer_fields = ::arrow::schema({f0, f1}); + + random::RandomArrayGenerator gen(42); + + auto a_f0 = gen.ArrayOf(int32(), length); + auto a_f1 = gen.ArrayOf(uint8(), length); + auto a_f2 = gen.ArrayOf(int16(), length); + auto a_f0_half = a_f0->Slice(0, length / 2); + auto a_f1_half = a_f1->Slice(0, length / 2); + auto a_f0_different = gen.ArrayOf(int32(), length); + auto a_f1_different = gen.ArrayOf(uint8(), length); + + auto b = RecordBatch::Make(schema, length, {a_f0, a_f1, a_f2}); + auto b_same = RecordBatch::Make(schema_same, length, {a_f0, a_f1, a_f2}); + auto b_fewer_fields = RecordBatch::Make(schema_fewer_fields, length, {a_f0, a_f1}); + auto b_fewer_fields_half = + RecordBatch::Make(schema_fewer_fields, length / 2, {a_f0_half, a_f1_half}); + auto b_fewer_fields_different = + RecordBatch::Make(schema_fewer_fields, length, {a_f0_different, a_f1_different}); + + // Same Values + ASSERT_TRUE(b->Equals(*b_same)); + + // Different number of columns + ASSERT_FALSE(b->Equals(*b_fewer_fields)); + + // Different number of rows + ASSERT_FALSE(b_fewer_fields->Equals(*b_fewer_fields_half)); + + // Different values + ASSERT_FALSE(b_fewer_fields->Equals(*b_fewer_fields_different)); +} + +class TestRecordBatchEqualOptions : public TestRecordBatch {}; + +TEST_F(TestRecordBatchEqualOptions, MetadataAndSchema) { + int length = 10; + auto f0 = field("f0", int32()); auto f1 = field("f1", uint8()); auto f2 = field("f2", int16()); @@ -71,37 +116,49 @@ TEST_F(TestRecordBatch, Equals) { auto metadata = key_value_metadata({"foo"}, {"bar"}); - std::vector<std::shared_ptr<Field>> fields = {f0, f1, f2}; auto schema = ::arrow::schema({f0, f1, f2}); - auto schema2 = ::arrow::schema({f0, f1}); - auto schema3 = ::arrow::schema({f0, f1, f2}, metadata); - auto schema4 = ::arrow::schema({f0, f1, f2b}); + auto schema_with_metadata = ::arrow::schema({f0, f1, f2}, metadata); + auto schema_renamed_field = ::arrow::schema({f0, f1, f2b}); random::RandomArrayGenerator gen(42); - auto a0 = gen.ArrayOf(int32(), length); - auto a1 = gen.ArrayOf(uint8(), length); - auto a2 = gen.ArrayOf(int16(), length); + auto a_f0 = gen.ArrayOf(int32(), length); + auto a_f1 = gen.ArrayOf(uint8(), length); + auto a_f2 = gen.ArrayOf(int16(), length); + auto a_f2b = a_f2; - auto b1 = RecordBatch::Make(schema, length, {a0, a1, a2}); - auto b2 = RecordBatch::Make(schema3, length, {a0, a1, a2}); - auto b3 = RecordBatch::Make(schema2, length, {a0, a1}); - auto b4 = RecordBatch::Make(schema, length, {a0, a1, a1}); - auto b5 = RecordBatch::Make(schema4, length, {a0, a1, a2}); + // All RecordBatches have the same values but different schemas. + auto b = RecordBatch::Make(schema, length, {a_f0, a_f1, a_f2}); + auto b_with_metadata = + RecordBatch::Make(schema_with_metadata, length, {a_f0, a_f1, a_f2}); + auto b_renamed_field = + RecordBatch::Make(schema_renamed_field, length, {a_f0, a_f1, a_f2b}); - ASSERT_TRUE(b1->Equals(*b1)); - ASSERT_FALSE(b1->Equals(*b3)); - ASSERT_FALSE(b1->Equals(*b4)); + auto options = EqualOptions::Defaults(); // Same values and types, but different field names - ASSERT_FALSE(b1->Equals(*b5)); + ASSERT_FALSE(b->Equals(*b_renamed_field)); + ASSERT_TRUE(b->Equals(*b_renamed_field, options.use_schema(false))); + ASSERT_TRUE(b->ApproxEquals(*b_renamed_field)); + ASSERT_TRUE(b->ApproxEquals(*b_renamed_field, options.use_schema(true))); // Different metadata - ASSERT_TRUE(b1->Equals(*b2)); - ASSERT_FALSE(b1->Equals(*b2, /*check_metadata=*/true)); + ASSERT_TRUE(b->Equals(*b_with_metadata)); + ASSERT_TRUE(b->Equals(*b_with_metadata, options)); + ASSERT_FALSE(b->Equals(*b_with_metadata, + /*check_metadata=*/true)); + ASSERT_FALSE(b->Equals(*b_with_metadata, + /*check_metadata=*/true, options.use_schema(true))); + ASSERT_TRUE(b->Equals(*b_with_metadata, + /*check_metadata=*/true, options.use_schema(false))); + ASSERT_TRUE(b->Equals(*b_with_metadata, options.use_schema(true).use_metadata(false))); + ASSERT_FALSE(b->Equals(*b_with_metadata, options.use_schema(true).use_metadata(true))); + ASSERT_TRUE(b->Equals(*b_with_metadata, options.use_schema(false).use_metadata(true))); + ASSERT_TRUE( + b->ApproxEquals(*b_with_metadata, options.use_schema(true).use_metadata(true))); } -TEST_F(TestRecordBatch, EqualOptions) { +TEST_F(TestRecordBatchEqualOptions, NaN) { int length = 2; auto f = field("f", float64()); @@ -114,13 +171,27 @@ TEST_F(TestRecordBatch, EqualOptions) { auto b1 = RecordBatch::Make(schema, length, {array1}); auto b2 = RecordBatch::Make(schema, length, {array2}); - EXPECT_FALSE(b1->Equals(*b2, /*check_metadata=*/false, - EqualOptions::Defaults().nans_equal(false))); - EXPECT_TRUE(b1->Equals(*b2, /*check_metadata=*/false, - EqualOptions::Defaults().nans_equal(true))); + EXPECT_FALSE(b1->Equals(*b2, EqualOptions::Defaults().nans_equal(false))); + EXPECT_TRUE(b1->Equals(*b2, EqualOptions::Defaults().nans_equal(true))); +} + +TEST_F(TestRecordBatchEqualOptions, SignedZero) { + int length = 2; + auto f = field("f", float64()); + + auto schema = ::arrow::schema({f}); + + std::shared_ptr<Array> array1, array2; + ArrayFromVector<DoubleType>(float64(), {true, true}, {0.5, +0.0}, &array1); + ArrayFromVector<DoubleType>(float64(), {true, true}, {0.5, -0.0}, &array2); + auto b1 = RecordBatch::Make(schema, length, {array1}); + auto b2 = RecordBatch::Make(schema, length, {array2}); + + ASSERT_FALSE(b1->Equals(*b2, EqualOptions::Defaults().signed_zeros_equal(false))); + ASSERT_TRUE(b1->Equals(*b2, EqualOptions::Defaults().signed_zeros_equal(true))); } -TEST_F(TestRecordBatch, ApproxEqualOptions) { +TEST_F(TestRecordBatchEqualOptions, Approx) { int length = 2; auto f = field("f", float64()); @@ -137,8 +208,8 @@ TEST_F(TestRecordBatch, ApproxEqualOptions) { EXPECT_FALSE(b1->ApproxEquals(*b2, EqualOptions::Defaults().nans_equal(true))); auto options = EqualOptions::Defaults().nans_equal(true).atol(0.1); - EXPECT_FALSE(b1->Equals(*b2, false, options)); - EXPECT_TRUE(b1->Equals(*b2, false, options.use_atol(true))); + EXPECT_FALSE(b1->Equals(*b2, options)); + EXPECT_TRUE(b1->Equals(*b2, options.use_atol(true))); EXPECT_TRUE(b1->ApproxEquals(*b2, options)); } @@ -158,8 +229,8 @@ TEST_F(TestRecordBatchEqualsSameAddress, NonFloatType) { auto options = EqualOptions::Defaults(); - ASSERT_TRUE(b0->Equals(*b1, true, options)); - ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true))); + ASSERT_TRUE(b0->Equals(*b1, options)); + ASSERT_TRUE(b0->Equals(*b1, options.nans_equal(true))); ASSERT_TRUE(b0->ApproxEquals(*b1, options)); ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true))); @@ -180,8 +251,8 @@ TEST_F(TestRecordBatchEqualsSameAddress, NestedTypesWithoutFloatType) { auto options = EqualOptions::Defaults(); - ASSERT_TRUE(b0->Equals(*b1, true, options)); - ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true))); + ASSERT_TRUE(b0->Equals(*b1, options)); + ASSERT_TRUE(b0->Equals(*b1, options.nans_equal(true))); ASSERT_TRUE(b0->ApproxEquals(*b1, options)); ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true))); @@ -201,8 +272,8 @@ TEST_F(TestRecordBatchEqualsSameAddress, FloatType) { auto options = EqualOptions::Defaults(); - ASSERT_FALSE(b0->Equals(*b1, true, options)); - ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true))); + ASSERT_FALSE(b0->Equals(*b1, options)); + ASSERT_TRUE(b0->Equals(*b1, options.nans_equal(true))); ASSERT_FALSE(b0->ApproxEquals(*b1, options)); ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true))); @@ -223,8 +294,8 @@ TEST_F(TestRecordBatchEqualsSameAddress, NestedTypesWithFloatType) { auto options = EqualOptions::Defaults(); - ASSERT_FALSE(b0->Equals(*b1, true, options)); - ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true))); + ASSERT_FALSE(b0->Equals(*b1, options)); + ASSERT_TRUE(b0->Equals(*b1, options.nans_equal(true))); ASSERT_FALSE(b0->ApproxEquals(*b1, options)); ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true))); diff --git a/docs/source/cpp/api/utilities.rst b/docs/source/cpp/api/utilities.rst index 5ce3d7d1a4..b8d542f481 100644 --- a/docs/source/cpp/api/utilities.rst +++ b/docs/source/cpp/api/utilities.rst @@ -43,6 +43,12 @@ Iterators .. doxygenclass:: arrow::VectorIterator :members: +Comparison +========== + +.. doxygenclass:: arrow::EqualOptions + :members: + Compression ===========