bkietz commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1370618960
##########
cpp/src/arrow/util/ree_util.cc:
##########
@@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) {
return LogicalNullCount<int64_t>(span);
}
+namespace internal {
+
+/// \pre 0 <= i < array_span_.length()
+template <typename RunEndCType>
+int64_t FindPhysicalIndexImpl(PhysicalIndexFinder<RunEndCType>& self, int64_t
i) {
+ DCHECK_LT(i, self.array_span.length);
+ const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length;
+ DCHECK_LT(self.last_physical_index, run_ends_size);
+ // This access to self.run_ends_[last_physical_index_] is always safe
because:
+ // 1. 0 <= i < array_span_.length() implies there is at least one run and
the initial
+ // value 0 will be safe to index with.
+ // 2. last_physical_index_ > 0 is always the result of a valid call to
+ // internal::FindPhysicalIndex.
+ if (ARROW_PREDICT_TRUE(self.array_span.offset + i <
+ self.run_ends[self.last_physical_index])) {
+ // The cached value is an upper-bound, but is it the least upper-bound?
+ if (self.last_physical_index == 0 ||
+ self.array_span.offset + i >= self.run_ends[self.last_physical_index -
1]) {
+ return self.last_physical_index;
+ }
+ // last_physical_index_ - 1 is a candidate for the least upper-bound,
+ // so search for the least upper-bound in the range that includes it.
+ const int64_t j = ree_util::internal::FindPhysicalIndex<RunEndCType>(
+ self.run_ends, /*run_ends_size=*/self.last_physical_index, i,
+ self.array_span.offset);
+ DCHECK_LT(j, self.last_physical_index);
+ return self.last_physical_index = j;
Review Comment:
Since the return value is always `last_physical_index`, please change this
function to return void and have `FindPhysicalIndexImplN` return that explicitly
##########
cpp/src/arrow/util/ree_util.cc:
##########
@@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) {
return LogicalNullCount<int64_t>(span);
}
+namespace internal {
+
+/// \pre 0 <= i < array_span_.length()
+template <typename RunEndCType>
+int64_t FindPhysicalIndexImpl(PhysicalIndexFinder<RunEndCType>& self, int64_t
i) {
+ DCHECK_LT(i, self.array_span.length);
+ const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length;
+ DCHECK_LT(self.last_physical_index, run_ends_size);
+ // This access to self.run_ends_[last_physical_index_] is always safe
because:
+ // 1. 0 <= i < array_span_.length() implies there is at least one run and
the initial
+ // value 0 will be safe to index with.
+ // 2. last_physical_index_ > 0 is always the result of a valid call to
Review Comment:
Nit: please ensure the identifiers in comments are consistent with the source
```suggestion
// This access to self.run_ends[last_physical_index] is always safe
because:
// 1. 0 <= i < array_span.length implies there is at least one run and the
initial
// value 0 will be safe to index with.
// 2. last_physical_index > 0 is always the result of a valid call to
```
##########
cpp/src/arrow/util/ree_util.cc:
##########
@@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) {
return LogicalNullCount<int64_t>(span);
}
+namespace internal {
+
+/// \pre 0 <= i < array_span_.length()
+template <typename RunEndCType>
+int64_t FindPhysicalIndexImpl(PhysicalIndexFinder<RunEndCType>& self, int64_t
i) {
+ DCHECK_LT(i, self.array_span.length);
+ const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length;
+ DCHECK_LT(self.last_physical_index, run_ends_size);
+ // This access to self.run_ends_[last_physical_index_] is always safe
because:
+ // 1. 0 <= i < array_span_.length() implies there is at least one run and
the initial
+ // value 0 will be safe to index with.
+ // 2. last_physical_index_ > 0 is always the result of a valid call to
+ // internal::FindPhysicalIndex.
+ if (ARROW_PREDICT_TRUE(self.array_span.offset + i <
+ self.run_ends[self.last_physical_index])) {
+ // The cached value is an upper-bound, but is it the least upper-bound?
+ if (self.last_physical_index == 0 ||
+ self.array_span.offset + i >= self.run_ends[self.last_physical_index -
1]) {
+ return self.last_physical_index;
+ }
+ // last_physical_index_ - 1 is a candidate for the least upper-bound,
+ // so search for the least upper-bound in the range that includes it.
+ const int64_t j = ree_util::internal::FindPhysicalIndex<RunEndCType>(
+ self.run_ends, /*run_ends_size=*/self.last_physical_index, i,
+ self.array_span.offset);
+ DCHECK_LT(j, self.last_physical_index);
+ return self.last_physical_index = j;
+ }
+
+ // last_physical_index_ is not an upper-bound, and the logical index i MUST
be
+ // in the runs that follow it. Since i is a valid logical index, we know
that at least
+ // one extra run is present.
+ DCHECK_LT(self.last_physical_index + 1, run_ends_size);
+ const int64_t lower_physical_index = self.last_physical_index + 1;
Review Comment:
nit: I understand you mean "lower bound on physical index", but a casual
reader would probably find "min" more obvious
```suggestion
const int64_t min_physical_index = self.last_physical_index + 1;
```
##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t
index) {
return UnitSlice{&array, index};
}
-using ValueComparator = std::function<bool(const Array&, int64_t, const
Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+ virtual ~ValueComparator() = default;
+
+ /// \brief Compare the validity and values at the given indices in the base
and target
+ /// arrays.
+ ///
+ /// \param base_index The index in the base array.
+ /// \param target_index The index in the target array.
+ /// \return true if the values at the given indices are equal, false
otherwise.
+ /// \pre base_index and target_index are valid indices in their respective
arrays.
+ virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+ /// \brief Return the run length of equal values starting at the given
indices in the
+ /// base and target arrays.
+ ///
+ /// \param base_index The starting index in the base array.
+ /// \param base_length The length of the base array.
+ /// \param target_index The starting index in the target array.
+ /// \param target_length The length of the target array.
+ /// \return The run length of equal values starting at the given indices in
the base
+ /// and target arrays.
+ virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t
base_length,
+ int64_t target_index, int64_t
target_length) {
+ int64_t run_length_of_equals = 0;
+ while (base_index < base_length && target_index < target_length) {
+ if (!Equals(base_index, target_index)) {
+ break;
+ }
+ base_index += 1;
+ target_index += 1;
+ run_length_of_equals += 1;
+ }
+ return run_length_of_equals;
+ }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+ const ArrayType& base;
+ const ArrayType& target;
+
+ DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+ : base(base), target(target) {}
+
+ ~DefaultValueComparator() override = default;
+
+ /// \brief Compare validity bits of base[base_index] and target[target_index]
+ ///
+ /// \return std::nullopt if the validity bits differ, otherwise a bool
+ /// containing the validity bit found in both arrays.
+ static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t
base_index,
+ const ArrayType& target,
+ int64_t target_index) {
+ const bool base_valid = base.IsValid(base_index);
+ const bool target_valid = target.IsValid(target_index);
+ return base_valid ^ target_valid ? std::nullopt :
std::optional<bool>{base_valid};
+ }
+
+ bool Equals(int64_t base_index, int64_t target_index) override {
+ const auto valid = ValidityIfEquals(base, base_index, target,
target_index);
+ if (valid.has_value()) {
+ return *valid ? GetView(base, base_index) == GetView(target,
target_index) : true;
+ }
+ return false;
+ }
+};
+
+template <typename RunEndCType>
+class REEValueComparator : public ValueComparator {
+ private:
+ const RunEndEncodedArray& base_;
+ const RunEndEncodedArray& target_;
+ std::unique_ptr<ValueComparator> inner_value_comparator_;
+ ree_util::PhysicalIndexFinder<RunEndCType> base_physical_index_finder_;
+ ree_util::PhysicalIndexFinder<RunEndCType> target_physical_index_finder_;
+
+ public:
+ REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray&
target,
+ std::unique_ptr<ValueComparator>&& inner_value_comparator)
+ : base_(base),
+ target_(target),
+ inner_value_comparator_(std::move(inner_value_comparator)),
+ base_physical_index_finder_(*base_.data()),
+ target_physical_index_finder_(*target_.data()) {
+ DCHECK_EQ(*base_.type(), *target_.type());
+ }
+
+ ~REEValueComparator() override = default;
-struct ValueComparatorVisitor {
+ private:
+ /// \pre 0 <= i < base_.length()
+ inline int64_t FindPhysicalIndexOnBase(int64_t i) {
+ return base_physical_index_finder_.FindPhysicalIndex(i);
+ }
+
+ /// \pre 0 <= i < target_.length()
+ inline int64_t FindPhysicalIndexOnTarget(int64_t i) {
+ return target_physical_index_finder_.FindPhysicalIndex(i);
+ }
+
+ const RunEndCType* base_run_ends() { return
base_physical_index_finder_.run_ends; }
+
+ const RunEndCType* target_run_ends() { return
target_physical_index_finder_.run_ends; }
+
+ public:
+ int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+ int64_t target_index, int64_t target_length)
override {
+ // Ensure the first search for physical index on the values arrays is safe.
+ if (base_index >= base_length || target_index >= target_length) {
+ // Without values on either side, there is no run of equal values.
+ return 0;
+ }
+
+ // Translate the two logical indices into physical indices.
+ int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+ int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+
+ int64_t run_length_of_equals = 0;
+ // The loop invariant (base_index < base_length && target_index <
target_length)
+ // is valid when the loop starts because of the check above.
+ for (;;) {
+ const auto base_run_end =
+ static_cast<int64_t>(base_run_ends()[physical_base_index]) -
base_.offset();
+ const auto target_run_end =
+ static_cast<int64_t>(target_run_ends()[physical_target_index]) -
+ target_.offset();
+ // The end of the runs containing the logical indices, by definition,
ends
+ // after the logical indices.
+ DCHECK_LT(base_index, base_run_end);
+ DCHECK_LT(target_index, target_run_end);
+
+ // Compare the physical values that make up the runs containing
base_index
+ // and target_index.
+ if (!inner_value_comparator_->Equals(physical_base_index,
physical_target_index)) {
+ // First difference found, stop because the run of equal values cannot
+ // be extended further.
+ break;
+ }
+
+ const int64_t base_run = std::min(base_run_end, base_length) -
base_index;
+ const int64_t target_run = std::min(target_run_end, target_length) -
target_index;
+ // Due to the loop-invariant (base_index < base_length && target_index <
+ // target_length) and properties of the run-ends asserted above, both
base_run and
+ // target_run are strictly greater than zero.
+ DCHECK_GT(base_run, 0);
+ DCHECK_GT(target_run, 0);
+
+ int64_t increment = 0;
+ if (base_run < target_run) {
+ increment = base_run;
+ physical_base_index += 1; // skip to next run on base
+ } else if (base_run > target_run) {
+ increment = target_run;
+ physical_target_index += 1; // skip to next run on target
+ } else {
+ increment = base_run;
+ // skip to next run on both base and target
+ physical_base_index += 1;
+ physical_target_index += 1;
+ }
Review Comment:
```suggestion
int64_t increment = std::min(base_run, target_run);
physical_base_index += base_run == increment;
physical_target_index += target_run == increment;
```
##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t
index) {
return UnitSlice{&array, index};
}
-using ValueComparator = std::function<bool(const Array&, int64_t, const
Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+ virtual ~ValueComparator() = default;
+
+ /// \brief Compare the validity and values at the given indices in the base
and target
+ /// arrays.
+ ///
+ /// \param base_index The index in the base array.
+ /// \param target_index The index in the target array.
+ /// \return true if the values at the given indices are equal, false
otherwise.
+ /// \pre base_index and target_index are valid indices in their respective
arrays.
+ virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+ /// \brief Return the run length of equal values starting at the given
indices in the
+ /// base and target arrays.
+ ///
+ /// \param base_index The starting index in the base array.
+ /// \param base_length The length of the base array.
+ /// \param target_index The starting index in the target array.
+ /// \param target_length The length of the target array.
+ /// \return The run length of equal values starting at the given indices in
the base
+ /// and target arrays.
+ virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t
base_length,
+ int64_t target_index, int64_t
target_length) {
+ int64_t run_length_of_equals = 0;
+ while (base_index < base_length && target_index < target_length) {
+ if (!Equals(base_index, target_index)) {
+ break;
+ }
+ base_index += 1;
+ target_index += 1;
+ run_length_of_equals += 1;
+ }
+ return run_length_of_equals;
+ }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+ const ArrayType& base;
+ const ArrayType& target;
+
+ DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+ : base(base), target(target) {}
+
+ ~DefaultValueComparator() override = default;
+
+ /// \brief Compare validity bits of base[base_index] and target[target_index]
+ ///
+ /// \return std::nullopt if the validity bits differ, otherwise a bool
+ /// containing the validity bit found in both arrays.
+ static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t
base_index,
+ const ArrayType& target,
+ int64_t target_index) {
+ const bool base_valid = base.IsValid(base_index);
+ const bool target_valid = target.IsValid(target_index);
+ return base_valid ^ target_valid ? std::nullopt :
std::optional<bool>{base_valid};
+ }
+
+ bool Equals(int64_t base_index, int64_t target_index) override {
+ const auto valid = ValidityIfEquals(base, base_index, target,
target_index);
+ if (valid.has_value()) {
+ return *valid ? GetView(base, base_index) == GetView(target,
target_index) : true;
+ }
+ return false;
+ }
+};
+
+template <typename RunEndCType>
+class REEValueComparator : public ValueComparator {
+ private:
+ const RunEndEncodedArray& base_;
+ const RunEndEncodedArray& target_;
+ std::unique_ptr<ValueComparator> inner_value_comparator_;
+ ree_util::PhysicalIndexFinder<RunEndCType> base_physical_index_finder_;
+ ree_util::PhysicalIndexFinder<RunEndCType> target_physical_index_finder_;
+
+ public:
+ REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray&
target,
+ std::unique_ptr<ValueComparator>&& inner_value_comparator)
+ : base_(base),
+ target_(target),
+ inner_value_comparator_(std::move(inner_value_comparator)),
+ base_physical_index_finder_(*base_.data()),
+ target_physical_index_finder_(*target_.data()) {
+ DCHECK_EQ(*base_.type(), *target_.type());
+ }
+
+ ~REEValueComparator() override = default;
-struct ValueComparatorVisitor {
+ private:
+ /// \pre 0 <= i < base_.length()
+ inline int64_t FindPhysicalIndexOnBase(int64_t i) {
+ return base_physical_index_finder_.FindPhysicalIndex(i);
+ }
+
+ /// \pre 0 <= i < target_.length()
+ inline int64_t FindPhysicalIndexOnTarget(int64_t i) {
+ return target_physical_index_finder_.FindPhysicalIndex(i);
+ }
+
+ const RunEndCType* base_run_ends() { return
base_physical_index_finder_.run_ends; }
+
+ const RunEndCType* target_run_ends() { return
target_physical_index_finder_.run_ends; }
+
+ public:
+ int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+ int64_t target_index, int64_t target_length)
override {
+ // Ensure the first search for physical index on the values arrays is safe.
+ if (base_index >= base_length || target_index >= target_length) {
+ // Without values on either side, there is no run of equal values.
+ return 0;
+ }
+
+ // Translate the two logical indices into physical indices.
+ int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+ int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+
+ int64_t run_length_of_equals = 0;
+ // The loop invariant (base_index < base_length && target_index <
target_length)
+ // is valid when the loop starts because of the check above.
+ for (;;) {
+ const auto base_run_end =
+ static_cast<int64_t>(base_run_ends()[physical_base_index]) -
base_.offset();
+ const auto target_run_end =
+ static_cast<int64_t>(target_run_ends()[physical_target_index]) -
+ target_.offset();
+ // The end of the runs containing the logical indices, by definition,
ends
+ // after the logical indices.
+ DCHECK_LT(base_index, base_run_end);
+ DCHECK_LT(target_index, target_run_end);
+
+ // Compare the physical values that make up the runs containing
base_index
+ // and target_index.
+ if (!inner_value_comparator_->Equals(physical_base_index,
physical_target_index)) {
+ // First difference found, stop because the run of equal values cannot
+ // be extended further.
+ break;
+ }
+
+ const int64_t base_run = std::min(base_run_end, base_length) -
base_index;
+ const int64_t target_run = std::min(target_run_end, target_length) -
target_index;
+ // Due to the loop-invariant (base_index < base_length && target_index <
+ // target_length) and properties of the run-ends asserted above, both
base_run and
+ // target_run are strictly greater than zero.
+ DCHECK_GT(base_run, 0);
+ DCHECK_GT(target_run, 0);
+
+ int64_t increment = 0;
+ if (base_run < target_run) {
+ increment = base_run;
+ physical_base_index += 1; // skip to next run on base
+ } else if (base_run > target_run) {
+ increment = target_run;
+ physical_target_index += 1; // skip to next run on target
+ } else {
+ increment = base_run;
+ // skip to next run on both base and target
+ physical_base_index += 1;
+ physical_target_index += 1;
+ }
+ // Since both base_run and target_run are greater than zero,
+ // increment is also greater than zero...
+ DCHECK_GT(increment, 0);
+ // ...which implies that the loop will make progress and eventually
terminate
+ // because base_index or target_index will equal base_length or
target_length,
+ // respectively.
+ base_index += increment;
+ target_index += increment;
+ // The value representing the two runs are equal, so we can assume that
at
+ // least `increment` (size of smallest run) values are equal.
+ run_length_of_equals += increment;
+
+ if (base_index >= base_length || target_index >= target_length) {
+ break;
+ }
+ }
+
+ return run_length_of_equals;
+ }
+
+ bool Equals(int64_t base_index, int64_t target_index) override {
+ const int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+ const int64_t physical_target_index =
FindPhysicalIndexOnTarget(target_index);
+ return inner_value_comparator_->Equals(physical_base_index,
physical_target_index);
+ }
+};
+
+class CreateValueComparator {
+ private:
+ std::unique_ptr<ValueComparator> comparator_;
+
+ public:
template <typename T>
- Status Visit(const T&) {
+ Status Visit(const T&, const Array& base, const Array& target) {
using ArrayType = typename TypeTraits<T>::ArrayType;
- out = [](const Array& base, int64_t base_index, const Array& target,
- int64_t target_index) {
- return (GetView(checked_cast<const ArrayType&>(base), base_index) ==
- GetView(checked_cast<const ArrayType&>(target), target_index));
- };
+ comparator_ = std::make_unique<DefaultValueComparator<ArrayType>>(
+ checked_cast<const ArrayType&>(base), checked_cast<const
ArrayType&>(target));
return Status::OK();
}
- Status Visit(const NullType&) { return Status::NotImplemented("null type"); }
+ Status Visit(const NullType&, const Array&, const Array&) {
+ return Status::NotImplemented("null type");
+ }
- Status Visit(const ExtensionType&) { return
Status::NotImplemented("extension type"); }
+ Status Visit(const ExtensionType&, const Array&, const Array&) {
+ return Status::NotImplemented("extension type");
+ }
- Status Visit(const DictionaryType&) {
+ Status Visit(const DictionaryType&, const Array& base, const Array& target) {
return Status::NotImplemented("dictionary type");
}
- Status Visit(const RunEndEncodedType&) {
- return Status::NotImplemented("run-end encoded type");
+ Status Visit(const RunEndEncodedType& ree_type, const Array& base,
+ const Array& target) {
+ const auto& base_ree = checked_cast<const RunEndEncodedArray&>(base);
+ const auto& target_ree = checked_cast<const RunEndEncodedArray&>(target);
+ ARROW_ASSIGN_OR_RAISE(
+ auto inner_values_comparator,
+ (*this)(*ree_type.value_type(), *base_ree.values(),
*target_ree.values()));
+ auto* ree_value_comparator =
+ ([&ree_type, &base_ree, &target_ree,
+ inner_values_comparator =
+ std::move(inner_values_comparator)]() mutable ->
ValueComparator* {
+ const auto run_end_type_id = ree_type.run_end_type()->id();
+ switch (run_end_type_id) {
+ case Type::INT16:
+ return new REEValueComparator<int16_t>(base_ree, target_ree,
+
std::move(inner_values_comparator));
+ case Type::INT32:
+ return new REEValueComparator<int32_t>(base_ree, target_ree,
+
std::move(inner_values_comparator));
+ default:
+ DCHECK_EQ(run_end_type_id, Type::INT64);
+ return new REEValueComparator<int64_t>(base_ree, target_ree,
+
std::move(inner_values_comparator));
+ }
+ }());
+ comparator_ = std::unique_ptr<ValueComparator>{ree_value_comparator};
+ return Status::OK();
Review Comment:
```suggestion
switch (ree_type.run_end_type()->id()) {
case Type::INT16:
comparator_.reset(new REEValueComparator<int16_t>(base_ree,
target_ree,
std::move(inner_values_comparator)));
return Status::OK();
case Type::INT32:
comparator_.reset(new REEValueComparator<int32_t>(base_ree,
target_ree,
std::move(inner_values_comparator)));
return Status::OK();
case Type::INT64:
comparator_.reset(new REEValueComparator<int64_t>(base_ree,
target_ree,
std::move(inner_values_comparator)));
return Status::OK();
default:
Unreachable();
}
```
##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t
index) {
return UnitSlice{&array, index};
}
-using ValueComparator = std::function<bool(const Array&, int64_t, const
Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+ virtual ~ValueComparator() = default;
+
+ /// \brief Compare the validity and values at the given indices in the base
and target
+ /// arrays.
+ ///
+ /// \param base_index The index in the base array.
+ /// \param target_index The index in the target array.
+ /// \return true if the values at the given indices are equal, false
otherwise.
+ /// \pre base_index and target_index are valid indices in their respective
arrays.
+ virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+ /// \brief Return the run length of equal values starting at the given
indices in the
+ /// base and target arrays.
+ ///
+ /// \param base_index The starting index in the base array.
+ /// \param base_length The length of the base array.
+ /// \param target_index The starting index in the target array.
+ /// \param target_length The length of the target array.
+ /// \return The run length of equal values starting at the given indices in
the base
+ /// and target arrays.
+ virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t
base_length,
+ int64_t target_index, int64_t
target_length) {
+ int64_t run_length_of_equals = 0;
+ while (base_index < base_length && target_index < target_length) {
+ if (!Equals(base_index, target_index)) {
+ break;
+ }
+ base_index += 1;
+ target_index += 1;
+ run_length_of_equals += 1;
+ }
+ return run_length_of_equals;
+ }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+ const ArrayType& base;
+ const ArrayType& target;
+
+ DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+ : base(base), target(target) {}
+
+ ~DefaultValueComparator() override = default;
+
+ /// \brief Compare validity bits of base[base_index] and target[target_index]
+ ///
+ /// \return std::nullopt if the validity bits differ, otherwise a bool
+ /// containing the validity bit found in both arrays.
+ static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t
base_index,
+ const ArrayType& target,
+ int64_t target_index) {
+ const bool base_valid = base.IsValid(base_index);
+ const bool target_valid = target.IsValid(target_index);
+ return base_valid ^ target_valid ? std::nullopt :
std::optional<bool>{base_valid};
+ }
+
+ bool Equals(int64_t base_index, int64_t target_index) override {
+ const auto valid = ValidityIfEquals(base, base_index, target,
target_index);
Review Comment:
Please make this more obvious/explicit:
```suggestion
bool is_valid = base.IsValid(base_index);
if (target.IsValid(target_index) != is_valid) return false;
if (!is_valid) return true;
return GetView(base, base_index) == GetView(target, target_index);
```
##########
cpp/src/arrow/array/diff.cc:
##########
@@ -164,56 +382,38 @@ class QuadraticSpaceMyersDiff {
: base_(base),
target_(target),
pool_(pool),
- value_comparator_(GetValueComparator(*base.type())),
- base_begin_(0),
base_end_(base.length()),
- target_begin_(0),
target_end_(target.length()),
- endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}),
insert_({true}) {
- if ((base_end_ - base_begin_ == target_end_ - target_begin_) &&
- endpoint_base_[0] == base_end_) {
- // trivial case: base == target
- finish_index_ = 0;
- }
- }
-
- bool ValuesEqual(int64_t base_index, int64_t target_index) const {
- bool base_null = base_.IsNull(base_index);
- bool target_null = target_.IsNull(target_index);
- if (base_null || target_null) {
- // If only one is null, then this is false, otherwise true
- return base_null && target_null;
- }
- return value_comparator_(base_, base_index, target_, target_index);
+ // endpoint_base_ is initialized when Diff() is called to start the
algorithm.
Review Comment:
Could we have the comparator also be a member, initialized at the same time?
##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t
index) {
return UnitSlice{&array, index};
}
-using ValueComparator = std::function<bool(const Array&, int64_t, const
Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+ virtual ~ValueComparator() = default;
+
+ /// \brief Compare the validity and values at the given indices in the base
and target
+ /// arrays.
+ ///
+ /// \param base_index The index in the base array.
+ /// \param target_index The index in the target array.
+ /// \return true if the values at the given indices are equal, false
otherwise.
+ /// \pre base_index and target_index are valid indices in their respective
arrays.
+ virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+ /// \brief Return the run length of equal values starting at the given
indices in the
+ /// base and target arrays.
+ ///
+ /// \param base_index The starting index in the base array.
+ /// \param base_length The length of the base array.
+ /// \param target_index The starting index in the target array.
+ /// \param target_length The length of the target array.
+ /// \return The run length of equal values starting at the given indices in
the base
+ /// and target arrays.
+ virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t
base_length,
+ int64_t target_index, int64_t
target_length) {
+ int64_t run_length_of_equals = 0;
+ while (base_index < base_length && target_index < target_length) {
+ if (!Equals(base_index, target_index)) {
+ break;
+ }
+ base_index += 1;
+ target_index += 1;
+ run_length_of_equals += 1;
+ }
+ return run_length_of_equals;
+ }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+ const ArrayType& base;
+ const ArrayType& target;
+
+ DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+ : base(base), target(target) {}
+
+ ~DefaultValueComparator() override = default;
+
+ /// \brief Compare validity bits of base[base_index] and target[target_index]
+ ///
+ /// \return std::nullopt if the validity bits differ, otherwise a bool
+ /// containing the validity bit found in both arrays.
+ static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t
base_index,
+ const ArrayType& target,
+ int64_t target_index) {
+ const bool base_valid = base.IsValid(base_index);
+ const bool target_valid = target.IsValid(target_index);
+ return base_valid ^ target_valid ? std::nullopt :
std::optional<bool>{base_valid};
+ }
+
+ bool Equals(int64_t base_index, int64_t target_index) override {
+ const auto valid = ValidityIfEquals(base, base_index, target,
target_index);
+ if (valid.has_value()) {
+ return *valid ? GetView(base, base_index) == GetView(target,
target_index) : true;
+ }
+ return false;
+ }
+};
+
+template <typename RunEndCType>
+class REEValueComparator : public ValueComparator {
+ private:
+ const RunEndEncodedArray& base_;
+ const RunEndEncodedArray& target_;
+ std::unique_ptr<ValueComparator> inner_value_comparator_;
+ ree_util::PhysicalIndexFinder<RunEndCType> base_physical_index_finder_;
+ ree_util::PhysicalIndexFinder<RunEndCType> target_physical_index_finder_;
+
+ public:
+ REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray&
target,
+ std::unique_ptr<ValueComparator>&& inner_value_comparator)
+ : base_(base),
+ target_(target),
+ inner_value_comparator_(std::move(inner_value_comparator)),
+ base_physical_index_finder_(*base_.data()),
+ target_physical_index_finder_(*target_.data()) {
+ DCHECK_EQ(*base_.type(), *target_.type());
+ }
+
+ ~REEValueComparator() override = default;
-struct ValueComparatorVisitor {
+ private:
+ /// \pre 0 <= i < base_.length()
+ inline int64_t FindPhysicalIndexOnBase(int64_t i) {
+ return base_physical_index_finder_.FindPhysicalIndex(i);
+ }
+
+ /// \pre 0 <= i < target_.length()
+ inline int64_t FindPhysicalIndexOnTarget(int64_t i) {
+ return target_physical_index_finder_.FindPhysicalIndex(i);
+ }
+
+ const RunEndCType* base_run_ends() { return
base_physical_index_finder_.run_ends; }
+
+ const RunEndCType* target_run_ends() { return
target_physical_index_finder_.run_ends; }
+
+ public:
+ int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+ int64_t target_index, int64_t target_length)
override {
+ // Ensure the first search for physical index on the values arrays is safe.
+ if (base_index >= base_length || target_index >= target_length) {
+ // Without values on either side, there is no run of equal values.
+ return 0;
+ }
+
+ // Translate the two logical indices into physical indices.
+ int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+ int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+
+ int64_t run_length_of_equals = 0;
+ // The loop invariant (base_index < base_length && target_index <
target_length)
+ // is valid when the loop starts because of the check above.
+ for (;;) {
+ const auto base_run_end =
+ static_cast<int64_t>(base_run_ends()[physical_base_index]) -
base_.offset();
+ const auto target_run_end =
+ static_cast<int64_t>(target_run_ends()[physical_target_index]) -
+ target_.offset();
+ // The end of the runs containing the logical indices, by definition,
ends
+ // after the logical indices.
+ DCHECK_LT(base_index, base_run_end);
+ DCHECK_LT(target_index, target_run_end);
+
+ // Compare the physical values that make up the runs containing
base_index
+ // and target_index.
+ if (!inner_value_comparator_->Equals(physical_base_index,
physical_target_index)) {
+ // First difference found, stop because the run of equal values cannot
+ // be extended further.
+ break;
+ }
+
+ const int64_t base_run = std::min(base_run_end, base_length) -
base_index;
+ const int64_t target_run = std::min(target_run_end, target_length) -
target_index;
+ // Due to the loop-invariant (base_index < base_length && target_index <
+ // target_length) and properties of the run-ends asserted above, both
base_run and
+ // target_run are strictly greater than zero.
+ DCHECK_GT(base_run, 0);
+ DCHECK_GT(target_run, 0);
+
+ int64_t increment = 0;
+ if (base_run < target_run) {
+ increment = base_run;
+ physical_base_index += 1; // skip to next run on base
+ } else if (base_run > target_run) {
+ increment = target_run;
+ physical_target_index += 1; // skip to next run on target
+ } else {
+ increment = base_run;
+ // skip to next run on both base and target
+ physical_base_index += 1;
+ physical_target_index += 1;
+ }
+ // Since both base_run and target_run are greater than zero,
+ // increment is also greater than zero...
+ DCHECK_GT(increment, 0);
+ // ...which implies that the loop will make progress and eventually
terminate
+ // because base_index or target_index will equal base_length or
target_length,
+ // respectively.
+ base_index += increment;
+ target_index += increment;
+ // The value representing the two runs are equal, so we can assume that
at
+ // least `increment` (size of smallest run) values are equal.
+ run_length_of_equals += increment;
+
+ if (base_index >= base_length || target_index >= target_length) {
+ break;
+ }
+ }
+
+ return run_length_of_equals;
+ }
+
+ bool Equals(int64_t base_index, int64_t target_index) override {
+ const int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+ const int64_t physical_target_index =
FindPhysicalIndexOnTarget(target_index);
+ return inner_value_comparator_->Equals(physical_base_index,
physical_target_index);
+ }
+};
+
+class CreateValueComparator {
+ private:
+ std::unique_ptr<ValueComparator> comparator_;
+
+ public:
template <typename T>
- Status Visit(const T&) {
+ Status Visit(const T&, const Array& base, const Array& target) {
using ArrayType = typename TypeTraits<T>::ArrayType;
- out = [](const Array& base, int64_t base_index, const Array& target,
- int64_t target_index) {
- return (GetView(checked_cast<const ArrayType&>(base), base_index) ==
- GetView(checked_cast<const ArrayType&>(target), target_index));
- };
+ comparator_ = std::make_unique<DefaultValueComparator<ArrayType>>(
+ checked_cast<const ArrayType&>(base), checked_cast<const
ArrayType&>(target));
return Status::OK();
}
- Status Visit(const NullType&) { return Status::NotImplemented("null type"); }
+ Status Visit(const NullType&, const Array&, const Array&) {
+ return Status::NotImplemented("null type");
+ }
- Status Visit(const ExtensionType&) { return
Status::NotImplemented("extension type"); }
+ Status Visit(const ExtensionType&, const Array&, const Array&) {
+ return Status::NotImplemented("extension type");
+ }
- Status Visit(const DictionaryType&) {
+ Status Visit(const DictionaryType&, const Array& base, const Array& target) {
return Status::NotImplemented("dictionary type");
}
- Status Visit(const RunEndEncodedType&) {
- return Status::NotImplemented("run-end encoded type");
+ Status Visit(const RunEndEncodedType& ree_type, const Array& base,
+ const Array& target) {
+ const auto& base_ree = checked_cast<const RunEndEncodedArray&>(base);
+ const auto& target_ree = checked_cast<const RunEndEncodedArray&>(target);
+ ARROW_ASSIGN_OR_RAISE(
+ auto inner_values_comparator,
+ (*this)(*ree_type.value_type(), *base_ree.values(),
*target_ree.values()));
+ auto* ree_value_comparator =
+ ([&ree_type, &base_ree, &target_ree,
+ inner_values_comparator =
+ std::move(inner_values_comparator)]() mutable ->
ValueComparator* {
+ const auto run_end_type_id = ree_type.run_end_type()->id();
+ switch (run_end_type_id) {
+ case Type::INT16:
+ return new REEValueComparator<int16_t>(base_ree, target_ree,
+
std::move(inner_values_comparator));
+ case Type::INT32:
+ return new REEValueComparator<int32_t>(base_ree, target_ree,
+
std::move(inner_values_comparator));
+ default:
+ DCHECK_EQ(run_end_type_id, Type::INT64);
+ return new REEValueComparator<int64_t>(base_ree, target_ree,
+
std::move(inner_values_comparator));
+ }
+ }());
+ comparator_ = std::unique_ptr<ValueComparator>{ree_value_comparator};
+ return Status::OK();
}
- ValueComparator Create(const DataType& type) {
- DCHECK_OK(VisitTypeInline(type, this));
- return out;
+ Result<std::unique_ptr<ValueComparator>> operator()(const DataType& type,
Review Comment:
Could this be a static member function named Create instead?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]