pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528622512
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y;
});
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) ->
bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) &&
std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) ->
bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t
left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i <
left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ ==
left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached
null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset +
right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i <
left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const
BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType&
right,
- offset_type left_offset, offset_type
right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues,
right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type
right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues,
right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues,
right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const
TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i <
left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data
rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i,
right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i <
left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i,
right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset +
j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const
DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ +
left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ +
right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
Review comment:
That will be part of the JIRA followup (investigate replacing
BitmapWordReader with BitmapUInt64Reader).
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]