Repository: arrow Updated Branches: refs/heads/master c322cbf22 -> 1407abfc9
ARROW-537: [C++] Do not compare String/Binary data in null slots when comparing arrays Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #327 from wesm/ARROW-537 and squashes the following commits: 66b1961 [Wes McKinney] Do not compare String/Binary data in null slots when comparing arrays Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/1407abfc Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/1407abfc Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/1407abfc Branch: refs/heads/master Commit: 1407abfc90c03e133f198b59fed48469d171c0a9 Parents: c322cbf Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Wed Feb 8 09:16:57 2017 +0100 Committer: Uwe L. Korn <uw...@xhochy.com> Committed: Wed Feb 8 09:16:57 2017 +0100 ---------------------------------------------------------------------- cpp/src/arrow/array-string-test.cc | 41 ++++++++++++++++++++++ cpp/src/arrow/array.cc | 11 ++++-- cpp/src/arrow/array.h | 9 +++-- cpp/src/arrow/compare.cc | 55 +++++++++++++++++++----------- python/src/pyarrow/adapters/pandas.cc | 12 +++---- 5 files changed, 95 insertions(+), 33 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/cpp/src/arrow/array-string-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc index 8b7eb41..c4d9bf4 100644 --- a/cpp/src/arrow/array-string-test.cc +++ b/cpp/src/arrow/array-string-test.cc @@ -140,6 +140,47 @@ TEST_F(TestStringArray, TestEmptyStringComparison) { ASSERT_TRUE(strings_a->Equals(strings_b)); } +TEST_F(TestStringArray, CompareNullByteSlots) { + StringBuilder builder(default_memory_pool()); + StringBuilder builder2(default_memory_pool()); + StringBuilder builder3(default_memory_pool()); + + builder.Append("foo"); + builder2.Append("foo"); + builder3.Append("foo"); + + builder.Append("bar"); + builder2.AppendNull(); + + // same length, but different + builder3.Append("xyz"); + + builder.Append("baz"); + builder2.Append("baz"); + builder3.Append("baz"); + + std::shared_ptr<Array> array, array2, array3; + ASSERT_OK(builder.Finish(&array)); + ASSERT_OK(builder2.Finish(&array2)); + ASSERT_OK(builder3.Finish(&array3)); + + const auto& a1 = static_cast<const StringArray&>(*array); + const auto& a2 = static_cast<const StringArray&>(*array2); + const auto& a3 = static_cast<const StringArray&>(*array3); + + // The validity bitmaps are the same, the data is different, but the unequal + // portion is masked out + StringArray equal_array(3, a1.value_offsets(), a1.data(), a2.null_bitmap(), 1); + StringArray equal_array2(3, a3.value_offsets(), a3.data(), a2.null_bitmap(), 1); + + ASSERT_TRUE(equal_array.Equals(equal_array2)); + ASSERT_TRUE(a2.RangeEquals(equal_array2, 0, 3, 0)); + + ASSERT_TRUE(equal_array.Array::Slice(1)->Equals(equal_array2.Array::Slice(1))); + ASSERT_TRUE( + equal_array.Array::Slice(1)->RangeEquals(0, 2, 0, equal_array2.Array::Slice(1))); +} + // ---------------------------------------------------------------------- // String builder tests http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/cpp/src/arrow/array.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index f84023e..39459a0 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -87,11 +87,16 @@ bool Array::ApproxEquals(const std::shared_ptr<Array>& arr) const { } bool Array::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const { - if (!arr) { return false; } + const std::shared_ptr<Array>& other) const { + if (!other) { return false; } + return RangeEquals(*other, start_idx, end_idx, other_start_idx); +} + +bool Array::RangeEquals(const Array& other, int32_t start_idx, int32_t end_idx, + int32_t other_start_idx) const { bool are_equal = false; Status error = - ArrayRangeEquals(*this, *arr, start_idx, end_idx, other_start_idx, &are_equal); + ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, &are_equal); if (!error.ok()) { DCHECK(false) << "Arrays not comparable: " << error.ToString(); } return are_equal; } http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/cpp/src/arrow/array.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index f3e8f9a..32d156b 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -127,7 +127,10 @@ class ARROW_EXPORT Array { /// Compare if the range of slots specified are equal for the given array and /// this array. end_idx exclusive. This methods does not bounds check. bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, - const std::shared_ptr<Array>& arr) const; + const std::shared_ptr<Array>& other) const; + + bool RangeEquals(const Array& other, int32_t start_idx, int32_t end_idx, + int32_t other_start_idx) const; /// Determines if the array is internally consistent. /// @@ -315,8 +318,8 @@ class ARROW_EXPORT BinaryArray : public Array { // Account for base offset i += offset_; - const int32_t pos = raw_value_offsets_[i]; - *out_length = raw_value_offsets_[i + 1] - pos; + const int32_t pos = raw_value_offsets_[i + offset_]; + *out_length = raw_value_offsets_[i + offset_ + 1] - pos; return raw_data_ + pos; } http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/cpp/src/arrow/compare.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 27fad71..21fdb66 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -335,15 +335,8 @@ class EqualsVisitor : public RangeEqualsVisitor { const int value_byte_size = size_meta.bit_width() / 8; DCHECK_GT(value_byte_size, 0); - const uint8_t* left_data = nullptr; - if (left.length() > 0) { - left_data = left.data()->data() + left.offset() * value_byte_size; - } - - const uint8_t* right_data = nullptr; - if (right.length() > 0) { - right_data = right.data()->data() + right.offset() * value_byte_size; - } + const uint8_t* left_data = left.data()->data() + left.offset() * value_byte_size; + const uint8_t* right_data = right.data()->data() + right.offset() * value_byte_size; if (left.null_count() > 0) { for (int i = 0; i < left.length(); ++i) { @@ -355,7 +348,6 @@ class EqualsVisitor : public RangeEqualsVisitor { } return true; } else { - if (left.length() == 0) { return true; } return memcmp(left_data, right_data, value_byte_size * left.length()) == 0; } } @@ -424,14 +416,35 @@ class EqualsVisitor : public RangeEqualsVisitor { bool equal_offsets = ValueOffsetsEqual<BinaryArray>(left); if (!equal_offsets) { return false; } - if (left.offset() == 0 && right.offset() == 0) { - if (!left.data() && !(right.data())) { return true; } - return left.data()->Equals(*right.data(), left.raw_value_offsets()[left.length()]); + if (!left.data() && !(right.data())) { return true; } + if (left.value_offset(left.length()) == 0) { return true; } + + const uint8_t* left_data = left.data()->data(); + const uint8_t* right_data = right.data()->data(); + + if (left.null_count() == 0) { + // Fast path for null count 0, single memcmp + if (left.offset() == 0 && right.offset() == 0) { + return std::memcmp( + left_data, right_data, left.raw_value_offsets()[left.length()]) == 0; + } else { + const int64_t total_bytes = + left.value_offset(left.length()) - left.value_offset(0); + return std::memcmp(left_data + left.value_offset(0), + right_data + right.value_offset(0), total_bytes) == 0; + } } else { - // Compare the corresponding data range - const int64_t total_bytes = left.value_offset(left.length()) - left.value_offset(0); - return std::memcmp(left.data()->data() + left.value_offset(0), - right.data()->data() + right.value_offset(0), total_bytes) == 0; + // ARROW-537: Only compare data in non-null slots + const int32_t* left_offsets = left.raw_value_offsets(); + const int32_t* right_offsets = right.raw_value_offsets(); + for (int32_t i = 0; i < left.length(); ++i) { + if (left.IsNull(i)) { continue; } + if (std::memcmp(left_data + left_offsets[i], right_data + right_offsets[i], + left.value_length(i))) { + return false; + } + } + return true; } } @@ -485,8 +498,6 @@ inline bool FloatingApproxEquals( static constexpr T EPSILON = 1E-5; - if (left.length() == 0 && right.length() == 0) { return true; } - if (left.null_count() > 0) { for (int32_t i = 0; i < left.length(); ++i) { if (left.IsNull(i)) continue; @@ -535,6 +546,8 @@ Status ArrayEquals(const Array& left, const Array& right, bool* are_equal) { *are_equal = true; } else if (!BaseDataEquals(left, right)) { *are_equal = false; + } else if (left.length() == 0) { + *are_equal = true; } else { EqualsVisitor visitor(right); RETURN_NOT_OK(left.Accept(&visitor)); @@ -549,6 +562,8 @@ Status ArrayRangeEquals(const Array& left, const Array& right, int32_t left_star *are_equal = true; } else if (left.type_enum() != right.type_enum()) { *are_equal = false; + } else if (left.length() == 0) { + *are_equal = true; } else { RangeEqualsVisitor visitor(right, left_start_idx, left_end_idx, right_start_idx); RETURN_NOT_OK(left.Accept(&visitor)); @@ -563,6 +578,8 @@ Status ArrayApproxEquals(const Array& left, const Array& right, bool* are_equal) *are_equal = true; } else if (!BaseDataEquals(left, right)) { *are_equal = false; + } else if (left.length() == 0) { + *are_equal = true; } else { ApproxEqualsVisitor visitor(right); RETURN_NOT_OK(left.Accept(&visitor)); http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/python/src/pyarrow/adapters/pandas.cc ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index b4e0d2f..bdc2cb7 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -1717,12 +1717,8 @@ Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, #if (NPY_INT64 == NPY_LONGLONG) && (NPY_SIZEOF_LONGLONG == 8) // Both LONGLONG and INT64 can be observed in the wild, which is buggy. We set // U/LONGLONG to U/INT64 so things work properly. - if (type_num == NPY_LONGLONG) { - type_num = NPY_INT64; - } - if (type_num == NPY_ULONGLONG) { - type_num = NPY_UINT64; - } + if (type_num == NPY_LONGLONG) { type_num = NPY_INT64; } + if (type_num == NPY_ULONGLONG) { type_num = NPY_UINT64; } #endif switch (type_num) { @@ -1732,14 +1728,14 @@ Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, TO_ARROW_CASE(INT32); TO_ARROW_CASE(INT64); #if (NPY_INT64 != NPY_LONGLONG) - TO_ARROW_CASE(LONGLONG); + TO_ARROW_CASE(LONGLONG); #endif TO_ARROW_CASE(UINT8); TO_ARROW_CASE(UINT16); TO_ARROW_CASE(UINT32); TO_ARROW_CASE(UINT64); #if (NPY_UINT64 != NPY_ULONGLONG) - TO_ARROW_CASE(ULONGLONG); + TO_ARROW_CASE(ULONGLONG); #endif TO_ARROW_CASE(FLOAT32); TO_ARROW_CASE(FLOAT64);