Repository: arrow Updated Branches: refs/heads/master 4226adfbc -> 7ac320bde
ARROW-519: [C++] Refactor array comparison code into a compare.h / compare.cc in part to resolve Xcode 6.1 linker issue This should also pave the way for more user-friendly reporting of "why are the arrays not equal" per ARROW-517 Author: Wes McKinney <[email protected]> Closes #308 from wesm/ARROW-519 and squashes the following commits: 85b0bf8 [Wes McKinney] Fix invalid memory access when doing RangeEquals on BinaryArray with all empty strings f5f4593 [Wes McKinney] Remove unused function in pandas.cc. Fix Binary RangeEquals for arrays of length-0 strings 2118ef4 [Wes McKinney] cpplint, compiler warnings ad54cc6 [Wes McKinney] Remove unneeded ARROW_EXPORT 342a8e6 [Wes McKinney] Refactor array comparison code into a compare.h header and compilation unit. Use visitor pattern. Also may resolve Xcode bug reported in ARROW-519 Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7ac320bd Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7ac320bd Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7ac320bd Branch: refs/heads/master Commit: 7ac320bde52ae47007dadac7398e22a203c6a48d Parents: 4226adf Author: Wes McKinney <[email protected]> Authored: Sun Jan 29 21:27:17 2017 -0500 Committer: Wes McKinney <[email protected]> Committed: Sun Jan 29 21:27:17 2017 -0500 ---------------------------------------------------------------------- cpp/CMakeLists.txt | 1 + cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array-primitive-test.cc | 4 +- cpp/src/arrow/array-string-test.cc | 48 ++- cpp/src/arrow/array.cc | 334 ++----------------- cpp/src/arrow/array.h | 145 +------- cpp/src/arrow/compare.cc | 516 +++++++++++++++++++++++++++++ cpp/src/arrow/compare.h | 46 +++ cpp/src/arrow/util/macros.h | 2 + python/CMakeLists.txt | 3 + python/src/pyarrow/adapters/pandas.cc | 8 - 11 files changed, 641 insertions(+), 467 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/7ac320bd/cpp/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a0f89f3..ff2c1a6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -771,6 +771,7 @@ set(ARROW_SRCS src/arrow/buffer.cc src/arrow/builder.cc src/arrow/column.cc + src/arrow/compare.cc src/arrow/memory_pool.cc src/arrow/pretty_print.cc src/arrow/schema.cc http://git-wip-us.apache.org/repos/asf/arrow/blob/7ac320bd/cpp/src/arrow/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index e5e36ed..b002bb7 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -20,6 +20,7 @@ install(FILES api.h array.h column.h + compare.h buffer.h builder.h memory_pool.h http://git-wip-us.apache.org/repos/asf/arrow/blob/7ac320bd/cpp/src/arrow/array-primitive-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-primitive-test.cc b/cpp/src/arrow/array-primitive-test.cc index 443abac..c839fb9 100644 --- a/cpp/src/arrow/array-primitive-test.cc +++ b/cpp/src/arrow/array-primitive-test.cc @@ -135,7 +135,7 @@ class TestPrimitiveBuilder : public TestBuilder { ASSERT_EQ(nullptr, builder->data()); ASSERT_EQ(ex_null_count, result->null_count()); - ASSERT_TRUE(result->EqualsExact(*expected.get())); + ASSERT_TRUE(result->Equals(*expected)); } protected: @@ -238,7 +238,7 @@ void TestPrimitiveBuilder<PBoolean>::Check( bool actual = BitUtil::GetBit(result->raw_data(), i); ASSERT_EQ(static_cast<bool>(draws_[i]), actual) << i; } - ASSERT_TRUE(result->EqualsExact(*expected.get())); + ASSERT_TRUE(result->Equals(*expected)); } typedef ::testing::Types<PBoolean, PUInt8, PUInt16, PUInt32, PUInt64, PInt8, PInt16, http://git-wip-us.apache.org/repos/asf/arrow/blob/7ac320bd/cpp/src/arrow/array-string-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc index 024bfd5..5ea384a 100644 --- a/cpp/src/arrow/array-string-test.cc +++ b/cpp/src/arrow/array-string-test.cc @@ -51,7 +51,7 @@ TEST(TypesTest, TestStringType) { // ---------------------------------------------------------------------- // String container -class TestStringContainer : public ::testing::Test { +class TestStringArray : public ::testing::Test { public: void SetUp() { chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; @@ -90,20 +90,20 @@ class TestStringContainer : public ::testing::Test { std::shared_ptr<StringArray> strings_; }; -TEST_F(TestStringContainer, TestArrayBasics) { +TEST_F(TestStringArray, TestArrayBasics) { ASSERT_EQ(length_, strings_->length()); ASSERT_EQ(1, strings_->null_count()); ASSERT_OK(strings_->Validate()); } -TEST_F(TestStringContainer, TestType) { +TEST_F(TestStringArray, TestType) { TypePtr type = strings_->type(); ASSERT_EQ(Type::STRING, type->type); ASSERT_EQ(Type::STRING, strings_->type_enum()); } -TEST_F(TestStringContainer, TestListFunctions) { +TEST_F(TestStringArray, TestListFunctions) { int pos = 0; for (size_t i = 0; i < expected_.size(); ++i) { ASSERT_EQ(pos, strings_->value_offset(i)); @@ -112,12 +112,12 @@ TEST_F(TestStringContainer, TestListFunctions) { } } -TEST_F(TestStringContainer, TestDestructor) { +TEST_F(TestStringArray, TestDestructor) { auto arr = std::make_shared<StringArray>( length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); } -TEST_F(TestStringContainer, TestGetString) { +TEST_F(TestStringArray, TestGetString) { for (size_t i = 0; i < expected_.size(); ++i) { if (valid_bytes_[i] == 0) { ASSERT_TRUE(strings_->IsNull(i)); @@ -127,7 +127,7 @@ TEST_F(TestStringContainer, TestGetString) { } } -TEST_F(TestStringContainer, TestEmptyStringComparison) { +TEST_F(TestStringArray, TestEmptyStringComparison) { offsets_ = {0, 0, 0, 0, 0, 0}; offsets_buf_ = test::GetBufferFromVector(offsets_); length_ = offsets_.size() - 1; @@ -212,7 +212,7 @@ TEST_F(TestStringBuilder, TestZeroLength) { // Binary container type // TODO(emkornfield) there should be some way to refactor these to avoid code duplicating // with String -class TestBinaryContainer : public ::testing::Test { +class TestBinaryArray : public ::testing::Test { public: void SetUp() { chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; @@ -252,20 +252,20 @@ class TestBinaryContainer : public ::testing::Test { std::shared_ptr<BinaryArray> strings_; }; -TEST_F(TestBinaryContainer, TestArrayBasics) { +TEST_F(TestBinaryArray, TestArrayBasics) { ASSERT_EQ(length_, strings_->length()); ASSERT_EQ(1, strings_->null_count()); ASSERT_OK(strings_->Validate()); } -TEST_F(TestBinaryContainer, TestType) { +TEST_F(TestBinaryArray, TestType) { TypePtr type = strings_->type(); ASSERT_EQ(Type::BINARY, type->type); ASSERT_EQ(Type::BINARY, strings_->type_enum()); } -TEST_F(TestBinaryContainer, TestListFunctions) { +TEST_F(TestBinaryArray, TestListFunctions) { int pos = 0; for (size_t i = 0; i < expected_.size(); ++i) { ASSERT_EQ(pos, strings_->value_offset(i)); @@ -274,12 +274,12 @@ TEST_F(TestBinaryContainer, TestListFunctions) { } } -TEST_F(TestBinaryContainer, TestDestructor) { +TEST_F(TestBinaryArray, TestDestructor) { auto arr = std::make_shared<BinaryArray>( length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); } -TEST_F(TestBinaryContainer, TestGetValue) { +TEST_F(TestBinaryArray, TestGetValue) { for (size_t i = 0; i < expected_.size(); ++i) { if (valid_bytes_[i] == 0) { ASSERT_TRUE(strings_->IsNull(i)); @@ -291,6 +291,28 @@ TEST_F(TestBinaryContainer, TestGetValue) { } } +TEST_F(TestBinaryArray, TestEqualsEmptyStrings) { + BinaryBuilder builder(default_memory_pool(), arrow::binary()); + + std::string empty_string(""); + + builder.Append(empty_string); + builder.Append(empty_string); + builder.Append(empty_string); + builder.Append(empty_string); + builder.Append(empty_string); + + std::shared_ptr<Array> left_arr; + ASSERT_OK(builder.Finish(&left_arr)); + + const BinaryArray& left = static_cast<const BinaryArray&>(*left_arr); + std::shared_ptr<Array> right = std::make_shared<BinaryArray>( + left.length(), left.offsets(), nullptr, left.null_count(), left.null_bitmap()); + + ASSERT_TRUE(left.Equals(right)); + ASSERT_TRUE(left.RangeEquals(0, left.length(), 0, right)); +} + class TestBinaryBuilder : public TestBuilder { public: void SetUp() { http://git-wip-us.apache.org/repos/asf/arrow/blob/7ac320bd/cpp/src/arrow/array.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index aa4a692..6fc7fb6 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -22,6 +22,7 @@ #include <sstream> #include "arrow/buffer.h" +#include "arrow/compare.h" #include "arrow/status.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" @@ -51,43 +52,42 @@ Array::Array(const std::shared_ptr<DataType>& type, int32_t length, int32_t null if (null_bitmap_) { null_bitmap_data_ = null_bitmap_->data(); } } -bool Array::BaseEquals(const std::shared_ptr<Array>& other) const { - if (this == other.get()) { return true; } - if (!other) { return false; } - return EqualsExact(*other.get()); +bool Array::Equals(const Array& arr) const { + bool are_equal = false; + Status error = ArrayEquals(*this, arr, &are_equal); + if (!error.ok()) { DCHECK(false) << "Arrays not comparable: " << error.ToString(); } + return are_equal; } -bool Array::EqualsExact(const Array& other) const { - if (this == &other) { return true; } - if (length_ != other.length_ || null_count_ != other.null_count_ || - type_enum() != other.type_enum()) { - return false; - } - if (null_count_ > 0) { - return null_bitmap_->Equals(*other.null_bitmap_, BitUtil::BytesForBits(length_)); - } - return true; +bool Array::Equals(const std::shared_ptr<Array>& arr) const { + if (!arr) { return false; } + return Equals(*arr); } -bool Array::ApproxEquals(const std::shared_ptr<Array>& arr) const { - return Equals(arr); +bool Array::ApproxEquals(const Array& arr) const { + bool are_equal = false; + Status error = ArrayApproxEquals(*this, arr, &are_equal); + if (!error.ok()) { DCHECK(false) << "Arrays not comparable: " << error.ToString(); } + return are_equal; } -Status Array::Validate() const { - return Status::OK(); -} - -bool NullArray::Equals(const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (Type::NA != arr->type_enum()) { return false; } - return arr->length() == length_; +bool Array::ApproxEquals(const std::shared_ptr<Array>& arr) const { + if (!arr) { return false; } + return ApproxEquals(*arr); } -bool NullArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index, +bool Array::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, const std::shared_ptr<Array>& arr) const { if (!arr) { return false; } - if (Type::NA != arr->type_enum()) { return false; } - return true; + bool are_equal = false; + Status error = + ArrayRangeEquals(*this, *arr, start_idx, end_idx, other_start_idx, &are_equal); + if (!error.ok()) { DCHECK(false) << "Arrays not comparable: " << error.ToString(); } + return are_equal; +} + +Status Array::Validate() const { + return Status::OK(); } Status NullArray::Accept(ArrayVisitor* visitor) const { @@ -105,36 +105,6 @@ PrimitiveArray::PrimitiveArray(const std::shared_ptr<DataType>& type, int32_t le raw_data_ = data == nullptr ? nullptr : data_->data(); } -bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { - if (!Array::EqualsExact(other)) { return false; } - - if (null_count_ > 0) { - const uint8_t* this_data = raw_data_; - const uint8_t* other_data = other.raw_data_; - - auto size_meta = dynamic_cast<const FixedWidthType*>(type_.get()); - int value_byte_size = size_meta->bit_width() / 8; - DCHECK_GT(value_byte_size, 0); - - for (int i = 0; i < length_; ++i) { - if (!IsNull(i) && memcmp(this_data, other_data, value_byte_size)) { return false; } - this_data += value_byte_size; - other_data += value_byte_size; - } - return true; - } else { - if (length_ == 0 && other.length_ == 0) { return true; } - return data_->Equals(*other.data_, length_); - } -} - -bool PrimitiveArray::Equals(const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (!arr) { return false; } - if (this->type_enum() != arr->type_enum()) { return false; } - return EqualsExact(static_cast<const PrimitiveArray&>(*arr.get())); -} - template <typename T> Status NumericArray<T>::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); @@ -150,6 +120,7 @@ template class NumericArray<Int32Type>; template class NumericArray<Int64Type>; template class NumericArray<TimestampType>; template class NumericArray<DateType>; +template class NumericArray<TimeType>; template class NumericArray<HalfFloatType>; template class NumericArray<FloatType>; template class NumericArray<DoubleType>; @@ -167,50 +138,6 @@ BooleanArray::BooleanArray(const std::shared_ptr<DataType>& type, int32_t length const std::shared_ptr<Buffer>& null_bitmap) : PrimitiveArray(type, length, data, null_count, null_bitmap) {} -bool BooleanArray::EqualsExact(const BooleanArray& other) const { - if (this == &other) return true; - if (null_count_ != other.null_count_) { return false; } - - if (null_count_ > 0) { - bool equal_bitmap = - null_bitmap_->Equals(*other.null_bitmap_, BitUtil::BytesForBits(length_)); - if (!equal_bitmap) { return false; } - - const uint8_t* this_data = raw_data_; - const uint8_t* other_data = other.raw_data_; - - for (int i = 0; i < length_; ++i) { - if (!IsNull(i) && BitUtil::GetBit(this_data, i) != BitUtil::GetBit(other_data, i)) { - return false; - } - } - return true; - } else { - return data_->Equals(*other.data_, BitUtil::BytesForBits(length_)); - } -} - -bool BooleanArray::Equals(const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) return true; - if (Type::BOOL != arr->type_enum()) { return false; } - return EqualsExact(static_cast<const BooleanArray&>(*arr.get())); -} - -bool BooleanArray::RangeEquals(int32_t start_idx, int32_t end_idx, - int32_t other_start_idx, const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (!arr) { return false; } - if (this->type_enum() != arr->type_enum()) { return false; } - const auto other = static_cast<BooleanArray*>(arr.get()); - for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { - const bool is_null = IsNull(i); - if (is_null != arr->IsNull(o_i) || (!is_null && Value(i) != other->Value(o_i))) { - return false; - } - } - return true; -} - Status BooleanArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } @@ -218,56 +145,6 @@ Status BooleanArray::Accept(ArrayVisitor* visitor) const { // ---------------------------------------------------------------------- // ListArray -bool ListArray::EqualsExact(const ListArray& other) const { - if (this == &other) { return true; } - if (null_count_ != other.null_count_) { return false; } - - bool equal_offsets = - offsets_buffer_->Equals(*other.offsets_buffer_, (length_ + 1) * sizeof(int32_t)); - if (!equal_offsets) { return false; } - bool equal_null_bitmap = true; - if (null_count_ > 0) { - equal_null_bitmap = - null_bitmap_->Equals(*other.null_bitmap_, BitUtil::BytesForBits(length_)); - } - - if (!equal_null_bitmap) { return false; } - - return values()->Equals(other.values()); -} - -bool ListArray::Equals(const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (this->type_enum() != arr->type_enum()) { return false; } - return EqualsExact(static_cast<const ListArray&>(*arr.get())); -} - -bool ListArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (!arr) { return false; } - if (this->type_enum() != arr->type_enum()) { return false; } - const auto other = static_cast<ListArray*>(arr.get()); - for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { - const bool is_null = IsNull(i); - if (is_null != arr->IsNull(o_i)) { return false; } - if (is_null) continue; - const int32_t begin_offset = offset(i); - const int32_t end_offset = offset(i + 1); - const int32_t other_begin_offset = other->offset(o_i); - const int32_t other_end_offset = other->offset(o_i + 1); - // Underlying can't be equal if the size isn't equal - if (end_offset - begin_offset != other_end_offset - other_begin_offset) { - return false; - } - if (!values_->RangeEquals( - begin_offset, end_offset, other_begin_offset, other->values())) { - return false; - } - } - return true; -} - Status ListArray::Validate() const { if (length_ < 0) { return Status::Invalid("Length was negative"); } if (!offsets_buffer_) { return Status::Invalid("offsets_buffer_ was null"); } @@ -350,51 +227,6 @@ Status BinaryArray::Validate() const { return Status::OK(); } -bool BinaryArray::EqualsExact(const BinaryArray& other) const { - if (!Array::EqualsExact(other)) { return false; } - - bool equal_offsets = - offsets_buffer_->Equals(*other.offsets_buffer_, (length_ + 1) * sizeof(int32_t)); - if (!equal_offsets) { return false; } - - if (!data_buffer_ && !(other.data_buffer_)) { return true; } - - return data_buffer_->Equals(*other.data_buffer_, raw_offsets()[length_]); -} - -bool BinaryArray::Equals(const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (this->type_enum() != arr->type_enum()) { return false; } - return EqualsExact(static_cast<const BinaryArray&>(*arr.get())); -} - -bool BinaryArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (!arr) { return false; } - if (this->type_enum() != arr->type_enum()) { return false; } - const auto other = static_cast<const BinaryArray*>(arr.get()); - for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { - const bool is_null = IsNull(i); - if (is_null != arr->IsNull(o_i)) { return false; } - if (is_null) continue; - const int32_t begin_offset = offset(i); - const int32_t end_offset = offset(i + 1); - const int32_t other_begin_offset = other->offset(o_i); - const int32_t other_end_offset = other->offset(o_i + 1); - // Underlying can't be equal if the size isn't equal - if (end_offset - begin_offset != other_end_offset - other_begin_offset) { - return false; - } - - if (std::memcmp(data_ + begin_offset, other->data_ + other_begin_offset, - end_offset - begin_offset)) { - return false; - } - } - return true; -} - Status BinaryArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } @@ -421,36 +253,6 @@ std::shared_ptr<Array> StructArray::field(int32_t pos) const { return field_arrays_[pos]; } -bool StructArray::Equals(const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (!arr) { return false; } - if (this->type_enum() != arr->type_enum()) { return false; } - if (null_count_ != arr->null_count()) { return false; } - return RangeEquals(0, length_, 0, arr); -} - -bool StructArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (!arr) { return false; } - if (Type::STRUCT != arr->type_enum()) { return false; } - const auto& other = static_cast<const StructArray&>(*arr.get()); - - bool equal_fields = true; - for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { - if (IsNull(i) != arr->IsNull(o_i)) { return false; } - if (IsNull(i)) continue; - for (size_t j = 0; j < field_arrays_.size(); ++j) { - // TODO: really we should be comparing stretches of non-null data rather - // than looking at one value at a time. - equal_fields = field(j)->RangeEquals(i, i + 1, o_i, other.field(j)); - if (!equal_fields) { return false; } - } - } - - return true; -} - Status StructArray::Validate() const { if (length_ < 0) { return Status::Invalid("Length was negative"); } @@ -511,67 +313,6 @@ std::shared_ptr<Array> UnionArray::child(int32_t pos) const { return children_[pos]; } -bool UnionArray::Equals(const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (!arr) { return false; } - if (!this->type_->Equals(arr->type())) { return false; } - if (null_count_ != arr->null_count()) { return false; } - return RangeEquals(0, length_, 0, arr); -} - -bool UnionArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (!arr) { return false; } - if (Type::UNION != arr->type_enum()) { return false; } - const auto& other = static_cast<const UnionArray&>(*arr.get()); - - const UnionMode union_mode = mode(); - if (union_mode != other.mode()) { return false; } - - // Define a mapping from the type id to child number - const auto& type_codes = static_cast<const UnionType&>(*arr->type().get()).type_ids; - uint8_t max_code = 0; - for (uint8_t code : type_codes) { - if (code > max_code) { max_code = code; } - } - - // Store mapping in a vector for constant time lookups - std::vector<uint8_t> type_id_to_child_num(max_code + 1); - for (uint8_t i = 0; i < static_cast<uint8_t>(type_codes.size()); ++i) { - type_id_to_child_num[type_codes[i]] = i; - } - - const uint8_t* this_ids = raw_type_ids(); - const uint8_t* other_ids = other.raw_type_ids(); - - uint8_t id, child_num; - for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { - if (IsNull(i) != other.IsNull(o_i)) { return false; } - if (IsNull(i)) continue; - if (this_ids[i] != other_ids[o_i]) { return false; } - - id = this_ids[i]; - child_num = type_id_to_child_num[id]; - - // TODO(wesm): really we should be comparing stretches of non-null data - // rather than looking at one value at a time. - if (union_mode == UnionMode::SPARSE) { - if (!child(child_num)->RangeEquals(i, i + 1, o_i, other.child(child_num))) { - return false; - } - } else { - const int32_t offset = offsets_[i]; - const int32_t o_offset = other.offsets_[i]; - if (!child(child_num)->RangeEquals( - offset, offset + 1, o_offset, other.child(child_num))) { - return false; - } - } - } - return true; -} - Status UnionArray::Validate() const { if (length_ < 0) { return Status::Invalid("Length was negative"); } @@ -624,25 +365,6 @@ std::shared_ptr<Array> DictionaryArray::dictionary() const { return dict_type_->dictionary(); } -bool DictionaryArray::EqualsExact(const DictionaryArray& other) const { - if (!dictionary()->Equals(other.dictionary())) { return false; } - return indices_->Equals(other.indices()); -} - -bool DictionaryArray::Equals(const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (Type::DICTIONARY != arr->type_enum()) { return false; } - return EqualsExact(static_cast<const DictionaryArray&>(*arr.get())); -} - -bool DictionaryArray::RangeEquals(int32_t start_idx, int32_t end_idx, - int32_t other_start_idx, const std::shared_ptr<Array>& arr) const { - if (Type::DICTIONARY != arr->type_enum()) { return false; } - const auto& dict_other = static_cast<const DictionaryArray&>(*arr.get()); - if (!dictionary()->Equals(dict_other.dictionary())) { return false; } - return indices_->RangeEquals(start_idx, end_idx, other_start_idx, dict_other.indices()); -} - Status DictionaryArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } http://git-wip-us.apache.org/repos/asf/arrow/blob/7ac320bd/cpp/src/arrow/array.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 4f4b727..3b6e93f 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -102,15 +102,16 @@ class ARROW_EXPORT Array { /// Note that for `null_count == 0`, this can be a `nullptr`. const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } - bool BaseEquals(const std::shared_ptr<Array>& arr) const; - bool EqualsExact(const Array& arr) const; - virtual bool Equals(const std::shared_ptr<Array>& arr) const = 0; - virtual bool ApproxEquals(const std::shared_ptr<Array>& arr) const; + bool Equals(const Array& arr) const; + bool Equals(const std::shared_ptr<Array>& arr) const; + + bool ApproxEquals(const std::shared_ptr<Array>& arr) const; + bool ApproxEquals(const Array& arr) const; /// Compare if the range of slots specified are equal for the given array and /// this array. end_idx exclusive. This methods does not bounds check. - virtual bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const = 0; + bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr<Array>& arr) const; /// Determines if the array is internally consistent. /// @@ -142,10 +143,6 @@ class ARROW_EXPORT NullArray : public Array { explicit NullArray(int32_t length) : NullArray(std::make_shared<NullType>(), length) {} - bool Equals(const std::shared_ptr<Array>& arr) const override; - bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index, - const std::shared_ptr<Array>& arr) const override; - Status Accept(ArrayVisitor* visitor) const override; }; @@ -159,9 +156,6 @@ class ARROW_EXPORT PrimitiveArray : public Array { std::shared_ptr<Buffer> data() const { return data_; } - bool EqualsExact(const PrimitiveArray& other) const; - bool Equals(const std::shared_ptr<Array>& arr) const override; - protected: PrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length, const std::shared_ptr<Buffer>& data, int32_t null_count = 0, @@ -184,28 +178,6 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { const std::shared_ptr<Buffer>& null_bitmap = nullptr) : PrimitiveArray(type, length, data, null_count, null_bitmap) {} - bool EqualsExact(const NumericArray<TypeClass>& other) const { - return PrimitiveArray::EqualsExact(static_cast<const PrimitiveArray&>(other)); - } - - bool ApproxEquals(const std::shared_ptr<Array>& arr) const override { - return PrimitiveArray::Equals(arr); - } - - bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const override { - if (this == arr.get()) { return true; } - if (!arr) { return false; } - if (this->type_enum() != arr->type_enum()) { return false; } - const auto other = static_cast<NumericArray<TypeClass>*>(arr.get()); - for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { - const bool is_null = IsNull(i); - if (is_null != arr->IsNull(o_i) || (!is_null && Value(i) != other->Value(o_i))) { - return false; - } - } - return true; - } const value_type* raw_data() const { return reinterpret_cast<const value_type*>(raw_data_); } @@ -215,78 +187,6 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { value_type Value(int i) const { return raw_data()[i]; } }; -template <> -inline bool NumericArray<FloatType>::ApproxEquals( - const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (!arr) { return false; } - if (this->type_enum() != arr->type_enum()) { return false; } - - const auto& other = *static_cast<NumericArray<FloatType>*>(arr.get()); - - if (this == &other) { return true; } - if (null_count_ != other.null_count_) { return false; } - - auto this_data = reinterpret_cast<const float*>(raw_data_); - auto other_data = reinterpret_cast<const float*>(other.raw_data_); - - static constexpr float EPSILON = 1E-5; - - if (length_ == 0 && other.length_ == 0) { return true; } - - if (null_count_ > 0) { - bool equal_bitmap = - null_bitmap_->Equals(*other.null_bitmap_, BitUtil::CeilByte(length_) / 8); - if (!equal_bitmap) { return false; } - - for (int i = 0; i < length_; ++i) { - if (IsNull(i)) continue; - if (fabs(this_data[i] - other_data[i]) > EPSILON) { return false; } - } - } else { - for (int i = 0; i < length_; ++i) { - if (fabs(this_data[i] - other_data[i]) > EPSILON) { return false; } - } - } - return true; -} - -template <> -inline bool NumericArray<DoubleType>::ApproxEquals( - const std::shared_ptr<Array>& arr) const { - if (this == arr.get()) { return true; } - if (!arr) { return false; } - if (this->type_enum() != arr->type_enum()) { return false; } - - const auto& other = *static_cast<NumericArray<DoubleType>*>(arr.get()); - - if (this == &other) { return true; } - if (null_count_ != other.null_count_) { return false; } - - auto this_data = reinterpret_cast<const double*>(raw_data_); - auto other_data = reinterpret_cast<const double*>(other.raw_data_); - - if (length_ == 0 && other.length_ == 0) { return true; } - - static constexpr double EPSILON = 1E-5; - - if (null_count_ > 0) { - bool equal_bitmap = - null_bitmap_->Equals(*other.null_bitmap_, BitUtil::CeilByte(length_) / 8); - if (!equal_bitmap) { return false; } - - for (int i = 0; i < length_; ++i) { - if (IsNull(i)) continue; - if (fabs(this_data[i] - other_data[i]) > EPSILON) { return false; } - } - } else { - for (int i = 0; i < length_; ++i) { - if (fabs(this_data[i] - other_data[i]) > EPSILON) { return false; } - } - } - return true; -} - class ARROW_EXPORT BooleanArray : public PrimitiveArray { public: using TypeClass = BooleanType; @@ -297,11 +197,6 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { const std::shared_ptr<Buffer>& data, int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr); - bool EqualsExact(const BooleanArray& other) const; - bool Equals(const std::shared_ptr<Array>& arr) const override; - bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const override; - Status Accept(ArrayVisitor* visitor) const override; const uint8_t* raw_data() const { return reinterpret_cast<const uint8_t*>(raw_data_); } @@ -345,12 +240,6 @@ class ARROW_EXPORT ListArray : public Array { int32_t value_offset(int i) const { return offsets_[i]; } int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; } - bool EqualsExact(const ListArray& other) const; - bool Equals(const std::shared_ptr<Array>& arr) const override; - - bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const override; - Status Accept(ArrayVisitor* visitor) const override; protected: @@ -396,11 +285,6 @@ class ARROW_EXPORT BinaryArray : public Array { int32_t value_offset(int i) const { return offsets_[i]; } int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; } - bool EqualsExact(const BinaryArray& other) const; - bool Equals(const std::shared_ptr<Array>& arr) const override; - bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const override; - Status Validate() const override; Status Accept(ArrayVisitor* visitor) const override; @@ -459,11 +343,6 @@ class ARROW_EXPORT StructArray : public Array { const std::vector<std::shared_ptr<Array>>& fields() const { return field_arrays_; } - bool EqualsExact(const StructArray& other) const; - bool Equals(const std::shared_ptr<Array>& arr) const override; - bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const override; - Status Accept(ArrayVisitor* visitor) const override; protected: @@ -500,11 +379,6 @@ class ARROW_EXPORT UnionArray : public Array { const std::vector<std::shared_ptr<Array>>& children() const { return children_; } - bool EqualsExact(const UnionArray& other) const; - bool Equals(const std::shared_ptr<Array>& arr) const override; - bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const override; - Status Accept(ArrayVisitor* visitor) const override; protected: @@ -555,11 +429,6 @@ class ARROW_EXPORT DictionaryArray : public Array { const DictionaryType* dict_type() { return dict_type_; } - bool EqualsExact(const DictionaryArray& other) const; - bool Equals(const std::shared_ptr<Array>& arr) const override; - bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const override; - Status Accept(ArrayVisitor* visitor) const override; protected: http://git-wip-us.apache.org/repos/asf/arrow/blob/7ac320bd/cpp/src/arrow/compare.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc new file mode 100644 index 0000000..d039bba --- /dev/null +++ b/cpp/src/arrow/compare.cc @@ -0,0 +1,516 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for comparing Arrow data structures + +#include "arrow/compare.h" + +#include <cstdint> +#include <memory> +#include <vector> + +#include "arrow/array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/logging.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Public method implementations + +class RangeEqualsVisitor : public ArrayVisitor { + public: + RangeEqualsVisitor(const Array& right, int32_t left_start_idx, int32_t left_end_idx, + int32_t right_start_idx) + : right_(right), + left_start_idx_(left_start_idx), + left_end_idx_(left_end_idx), + right_start_idx_(right_start_idx), + result_(false) {} + + Status Visit(const NullArray& left) override { + UNUSED(left); + result_ = true; + return Status::OK(); + } + + template <typename ArrayType> + inline Status CompareValues(const ArrayType& left) { + const auto& right = static_cast<const ArrayType&>(right_); + + for (int32_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; + ++i, ++o_i) { + const bool is_null = left.IsNull(i); + if (is_null != right.IsNull(o_i) || + (!is_null && left.Value(i) != right.Value(o_i))) { + result_ = false; + return Status::OK(); + } + } + result_ = true; + return Status::OK(); + } + + bool CompareBinaryRange(const BinaryArray& left) const { + const auto& right = static_cast<const BinaryArray&>(right_); + + for (int32_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; + ++i, ++o_i) { + const bool is_null = left.IsNull(i); + if (is_null != right.IsNull(o_i)) { return false; } + if (is_null) continue; + const int32_t begin_offset = left.offset(i); + const int32_t end_offset = left.offset(i + 1); + const int32_t right_begin_offset = right.offset(o_i); + const int32_t right_end_offset = right.offset(o_i + 1); + // Underlying can't be equal if the size isn't equal + if (end_offset - begin_offset != right_end_offset - right_begin_offset) { + return false; + } + + if (end_offset - begin_offset > 0 && + std::memcmp(left.data()->data() + begin_offset, + right.data()->data() + right_begin_offset, end_offset - begin_offset)) { + return false; + } + } + return true; + } + + Status Visit(const BooleanArray& left) override { + return CompareValues<BooleanArray>(left); + } + + Status Visit(const Int8Array& left) override { return CompareValues<Int8Array>(left); } + + Status Visit(const Int16Array& left) override { + return CompareValues<Int16Array>(left); + } + Status Visit(const Int32Array& left) override { + return CompareValues<Int32Array>(left); + } + Status Visit(const Int64Array& left) override { + return CompareValues<Int64Array>(left); + } + Status Visit(const UInt8Array& left) override { + return CompareValues<UInt8Array>(left); + } + Status Visit(const UInt16Array& left) override { + return CompareValues<UInt16Array>(left); + } + Status Visit(const UInt32Array& left) override { + return CompareValues<UInt32Array>(left); + } + Status Visit(const UInt64Array& left) override { + return CompareValues<UInt64Array>(left); + } + Status Visit(const FloatArray& left) override { + return CompareValues<FloatArray>(left); + } + Status Visit(const DoubleArray& left) override { + return CompareValues<DoubleArray>(left); + } + + Status Visit(const HalfFloatArray& left) override { + return Status::NotImplemented("Half float type"); + } + + Status Visit(const StringArray& left) override { + result_ = CompareBinaryRange(left); + return Status::OK(); + } + + Status Visit(const BinaryArray& left) override { + result_ = CompareBinaryRange(left); + return Status::OK(); + } + + Status Visit(const DateArray& left) override { return CompareValues<DateArray>(left); } + + Status Visit(const TimeArray& left) override { return CompareValues<TimeArray>(left); } + + Status Visit(const TimestampArray& left) override { + return CompareValues<TimestampArray>(left); + } + + Status Visit(const IntervalArray& left) override { + return CompareValues<IntervalArray>(left); + } + + Status Visit(const DecimalArray& left) override { + return Status::NotImplemented("Decimal type"); + } + + bool CompareLists(const ListArray& left) { + const auto& right = static_cast<const ListArray&>(right_); + + const std::shared_ptr<Array>& left_values = left.values(); + const std::shared_ptr<Array>& right_values = right.values(); + + for (int32_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; + ++i, ++o_i) { + const bool is_null = left.IsNull(i); + if (is_null != right.IsNull(o_i)) { return false; } + if (is_null) continue; + const int32_t begin_offset = left.offset(i); + const int32_t end_offset = left.offset(i + 1); + const int32_t right_begin_offset = right.offset(o_i); + const int32_t right_end_offset = right.offset(o_i + 1); + // Underlying can't be equal if the size isn't equal + if (end_offset - begin_offset != right_end_offset - right_begin_offset) { + return false; + } + if (!left_values->RangeEquals( + begin_offset, end_offset, right_begin_offset, right_values)) { + return false; + } + } + return true; + } + + Status Visit(const ListArray& left) override { + result_ = CompareLists(left); + return Status::OK(); + } + + bool CompareStructs(const StructArray& left) { + const auto& right = static_cast<const StructArray&>(right_); + bool equal_fields = true; + for (int32_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; + ++i, ++o_i) { + if (left.IsNull(i) != right.IsNull(o_i)) { return false; } + if (left.IsNull(i)) continue; + for (size_t j = 0; j < left.fields().size(); ++j) { + // TODO: really we should be comparing stretches of non-null data rather + // than looking at one value at a time. + equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j)); + if (!equal_fields) { return false; } + } + } + return true; + } + + Status Visit(const StructArray& left) override { + result_ = CompareStructs(left); + return Status::OK(); + } + + bool CompareUnions(const UnionArray& left) const { + const auto& right = static_cast<const UnionArray&>(right_); + + const UnionMode union_mode = left.mode(); + if (union_mode != right.mode()) { return false; } + + const auto& left_type = static_cast<const UnionType&>(*left.type()); + + // Define a mapping from the type id to child number + uint8_t max_code = 0; + + const std::vector<uint8_t> type_codes = left_type.type_ids; + for (size_t i = 0; i < type_codes.size(); ++i) { + const uint8_t code = type_codes[i]; + if (code > max_code) { max_code = code; } + } + + // Store mapping in a vector for constant time lookups + std::vector<uint8_t> type_id_to_child_num(max_code + 1); + for (uint8_t i = 0; i < static_cast<uint8_t>(type_codes.size()); ++i) { + type_id_to_child_num[type_codes[i]] = i; + } + + const uint8_t* left_ids = left.raw_type_ids(); + const uint8_t* right_ids = right.raw_type_ids(); + + uint8_t id, child_num; + for (int32_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; + ++i, ++o_i) { + if (left.IsNull(i) != right.IsNull(o_i)) { return false; } + if (left.IsNull(i)) continue; + if (left_ids[i] != right_ids[o_i]) { return false; } + + id = left_ids[i]; + child_num = type_id_to_child_num[id]; + + // TODO(wesm): really we should be comparing stretches of non-null data + // rather than looking at one value at a time. + if (union_mode == UnionMode::SPARSE) { + if (!left.child(child_num)->RangeEquals(i, i + 1, o_i, right.child(child_num))) { + return false; + } + } else { + const int32_t offset = left.raw_offsets()[i]; + const int32_t o_offset = right.raw_offsets()[i]; + if (!left.child(child_num)->RangeEquals( + offset, offset + 1, o_offset, right.child(child_num))) { + return false; + } + } + } + return true; + } + + Status Visit(const UnionArray& left) override { + result_ = CompareUnions(left); + return Status::OK(); + } + + Status Visit(const DictionaryArray& left) override { + const auto& right = static_cast<const DictionaryArray&>(right_); + if (!left.dictionary()->Equals(right.dictionary())) { + result_ = false; + return Status::OK(); + } + result_ = left.indices()->RangeEquals( + left_start_idx_, left_end_idx_, right_start_idx_, right.indices()); + return Status::OK(); + } + + bool result() const { return result_; } + + protected: + const Array& right_; + int32_t left_start_idx_; + int32_t left_end_idx_; + int32_t right_start_idx_; + + bool result_; +}; + +class EqualsVisitor : public RangeEqualsVisitor { + public: + explicit EqualsVisitor(const Array& right) + : RangeEqualsVisitor(right, 0, right.length(), 0) {} + + Status Visit(const NullArray& left) override { return Status::OK(); } + + Status Visit(const BooleanArray& left) override { + const auto& right = static_cast<const BooleanArray&>(right_); + if (left.null_count() > 0) { + const uint8_t* left_data = left.data()->data(); + const uint8_t* right_data = right.data()->data(); + + for (int i = 0; i < left.length(); ++i) { + if (!left.IsNull(i) && + BitUtil::GetBit(left_data, i) != BitUtil::GetBit(right_data, i)) { + result_ = false; + return Status::OK(); + } + } + result_ = true; + } else { + result_ = left.data()->Equals(*right.data(), BitUtil::BytesForBits(left.length())); + } + return Status::OK(); + } + + bool IsEqualPrimitive(const PrimitiveArray& left) { + const auto& right = static_cast<const PrimitiveArray&>(right_); + if (left.null_count() > 0) { + const uint8_t* left_data = left.data()->data(); + const uint8_t* right_data = right.data()->data(); + const auto& size_meta = dynamic_cast<const FixedWidthType&>(*left.type()); + const int value_byte_size = size_meta.bit_width() / 8; + DCHECK_GT(value_byte_size, 0); + + for (int i = 0; i < left.length(); ++i) { + if (!left.IsNull(i) && memcmp(left_data, right_data, value_byte_size)) { + return false; + } + left_data += value_byte_size; + right_data += value_byte_size; + } + return true; + } else { + if (left.length() == 0) { return true; } + return left.data()->Equals(*right.data(), left.length()); + } + } + + Status ComparePrimitive(const PrimitiveArray& left) { + result_ = IsEqualPrimitive(left); + return Status::OK(); + } + + Status Visit(const Int8Array& left) override { return ComparePrimitive(left); } + + Status Visit(const Int16Array& left) override { return ComparePrimitive(left); } + + Status Visit(const Int32Array& left) override { return ComparePrimitive(left); } + + Status Visit(const Int64Array& left) override { return ComparePrimitive(left); } + + Status Visit(const UInt8Array& left) override { return ComparePrimitive(left); } + + Status Visit(const UInt16Array& left) override { return ComparePrimitive(left); } + + Status Visit(const UInt32Array& left) override { return ComparePrimitive(left); } + + Status Visit(const UInt64Array& left) override { return ComparePrimitive(left); } + + Status Visit(const FloatArray& left) override { return ComparePrimitive(left); } + + Status Visit(const DoubleArray& left) override { return ComparePrimitive(left); } + + Status Visit(const DateArray& left) override { return ComparePrimitive(left); } + + Status Visit(const TimeArray& left) override { return ComparePrimitive(left); } + + Status Visit(const TimestampArray& left) override { return ComparePrimitive(left); } + + Status Visit(const IntervalArray& left) override { return ComparePrimitive(left); } + + bool CompareBinary(const BinaryArray& left) { + const auto& right = static_cast<const BinaryArray&>(right_); + bool equal_offsets = + left.offsets()->Equals(*right.offsets(), (left.length() + 1) * sizeof(int32_t)); + if (!equal_offsets) { return false; } + if (!left.data() && !(right.data())) { return true; } + return left.data()->Equals(*right.data(), left.raw_offsets()[left.length()]); + } + + Status Visit(const StringArray& left) override { + result_ = CompareBinary(left); + return Status::OK(); + } + + Status Visit(const BinaryArray& left) override { + result_ = CompareBinary(left); + return Status::OK(); + } + + Status Visit(const ListArray& left) override { + const auto& right = static_cast<const ListArray&>(right_); + if (!left.offsets()->Equals( + *right.offsets(), (left.length() + 1) * sizeof(int32_t))) { + result_ = false; + } else { + result_ = left.values()->Equals(right.values()); + } + return Status::OK(); + } + + Status Visit(const DictionaryArray& left) override { + const auto& right = static_cast<const DictionaryArray&>(right_); + if (!left.dictionary()->Equals(right.dictionary())) { + result_ = false; + } else { + result_ = left.indices()->Equals(right.indices()); + } + return Status::OK(); + } +}; + +template <typename TYPE> +inline bool FloatingApproxEquals( + const NumericArray<TYPE>& left, const NumericArray<TYPE>& right) { + using T = typename TYPE::c_type; + + auto left_data = reinterpret_cast<const T*>(left.data()->data()); + auto right_data = reinterpret_cast<const T*>(right.data()->data()); + + static constexpr T EPSILON = 1E-5; + + if (left.length() == 0 && right.length() == 0) { return true; } + + if (left.null_count() > 0) { + for (int32_t i = 0; i < left.length(); ++i) { + if (left.IsNull(i)) continue; + if (fabs(left_data[i] - right_data[i]) > EPSILON) { return false; } + } + } else { + for (int32_t i = 0; i < left.length(); ++i) { + if (fabs(left_data[i] - right_data[i]) > EPSILON) { return false; } + } + } + return true; +} + +class ApproxEqualsVisitor : public EqualsVisitor { + public: + using EqualsVisitor::EqualsVisitor; + + Status Visit(const FloatArray& left) override { + result_ = + FloatingApproxEquals<FloatType>(left, static_cast<const FloatArray&>(right_)); + return Status::OK(); + } + + Status Visit(const DoubleArray& left) override { + result_ = + FloatingApproxEquals<DoubleType>(left, static_cast<const DoubleArray&>(right_)); + return Status::OK(); + } +}; + +static bool BaseDataEquals(const Array& left, const Array& right) { + if (left.length() != right.length() || left.null_count() != right.null_count() || + left.type_enum() != right.type_enum()) { + return false; + } + if (left.null_count() > 0) { + return left.null_bitmap()->Equals( + *right.null_bitmap(), BitUtil::BytesForBits(left.length())); + } + return true; +} + +Status ArrayEquals(const Array& left, const Array& right, bool* are_equal) { + // The arrays are the same object + if (&left == &right) { + *are_equal = true; + } else if (!BaseDataEquals(left, right)) { + *are_equal = false; + } else { + EqualsVisitor visitor(right); + RETURN_NOT_OK(left.Accept(&visitor)); + *are_equal = visitor.result(); + } + return Status::OK(); +} + +Status ArrayRangeEquals(const Array& left, const Array& right, int32_t left_start_idx, + int32_t left_end_idx, int32_t right_start_idx, bool* are_equal) { + if (&left == &right) { + *are_equal = true; + } else if (left.type_enum() != right.type_enum()) { + *are_equal = false; + } else { + RangeEqualsVisitor visitor(right, left_start_idx, left_end_idx, right_start_idx); + RETURN_NOT_OK(left.Accept(&visitor)); + *are_equal = visitor.result(); + } + return Status::OK(); +} + +Status ArrayApproxEquals(const Array& left, const Array& right, bool* are_equal) { + // The arrays are the same object + if (&left == &right) { + *are_equal = true; + } else if (!BaseDataEquals(left, right)) { + *are_equal = false; + } else { + ApproxEqualsVisitor visitor(right); + RETURN_NOT_OK(left.Accept(&visitor)); + *are_equal = visitor.result(); + } + return Status::OK(); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/7ac320bd/cpp/src/arrow/compare.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h new file mode 100644 index 0000000..2093b65 --- /dev/null +++ b/cpp/src/arrow/compare.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for comparing Arrow data structures + +#ifndef ARROW_COMPARE_H +#define ARROW_COMPARE_H + +#include <cstdint> + +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class Status; + +/// Returns true if the arrays are exactly equal +Status ARROW_EXPORT ArrayEquals(const Array& left, const Array& right, bool* are_equal); + +/// Returns true if the arrays are approximately equal. For non-floating point +/// types, this is equivalent to ArrayEquals(left, right) +Status ARROW_EXPORT ArrayApproxEquals( + const Array& left, const Array& right, bool* are_equal); + +/// Returns true if indicated equal-length segment of arrays is exactly equal +Status ARROW_EXPORT ArrayRangeEquals(const Array& left, const Array& right, + int32_t start_idx, int32_t end_idx, int32_t other_start_idx, bool* are_equal); + +} // namespace arrow + +#endif // ARROW_COMPARE_H http://git-wip-us.apache.org/repos/asf/arrow/blob/7ac320bd/cpp/src/arrow/util/macros.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index e2bb355..c4a62a4 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -25,4 +25,6 @@ TypeName& operator=(const TypeName&) = delete #endif +#define UNUSED(x) (void)x + #endif // ARROW_UTIL_MACROS_H http://git-wip-us.apache.org/repos/asf/arrow/blob/7ac320bd/python/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index d63fff4..942e74b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -74,6 +74,9 @@ include(SetupCxxFlags) # Add common flags set(CMAKE_CXX_FLAGS "${CXX_COMMON_FLAGS} ${CMAKE_CXX_FLAGS}") +# Enable perf and other tools to work properly +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") + # Suppress Cython warnings set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable") http://git-wip-us.apache.org/repos/asf/arrow/blob/7ac320bd/python/src/pyarrow/adapters/pandas.cc ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index feafa3d..920779f 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -153,14 +153,6 @@ static inline bool PyObject_is_string(const PyObject* obj) { #endif } -static inline bool PyObject_is_bool(const PyObject* obj) { -#if PY_MAJOR_VERSION >= 3 - return PyString_Check(obj) || PyBytes_Check(obj); -#else - return PyString_Check(obj) || PyUnicode_Check(obj); -#endif -} - template <int TYPE> static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) { typedef npy_traits<TYPE> traits;
