felipecrv commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1370839520
##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t
index) {
return UnitSlice{&array, index};
}
-using ValueComparator = std::function<bool(const Array&, int64_t, const
Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+ virtual ~ValueComparator() = default;
+
+ /// \brief Compare the validity and values at the given indices in the base
and target
+ /// arrays.
+ ///
+ /// \param base_index The index in the base array.
+ /// \param target_index The index in the target array.
+ /// \return true if the values at the given indices are equal, false
otherwise.
+ /// \pre base_index and target_index are valid indices in their respective
arrays.
+ virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+ /// \brief Return the run length of equal values starting at the given
indices in the
+ /// base and target arrays.
+ ///
+ /// \param base_index The starting index in the base array.
+ /// \param base_length The length of the base array.
+ /// \param target_index The starting index in the target array.
+ /// \param target_length The length of the target array.
+ /// \return The run length of equal values starting at the given indices in
the base
+ /// and target arrays.
+ virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t
base_length,
+ int64_t target_index, int64_t
target_length) {
+ int64_t run_length_of_equals = 0;
+ while (base_index < base_length && target_index < target_length) {
+ if (!Equals(base_index, target_index)) {
+ break;
+ }
+ base_index += 1;
+ target_index += 1;
+ run_length_of_equals += 1;
+ }
+ return run_length_of_equals;
+ }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+ const ArrayType& base;
+ const ArrayType& target;
+
+ DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+ : base(base), target(target) {}
+
+ ~DefaultValueComparator() override = default;
+
+ /// \brief Compare validity bits of base[base_index] and target[target_index]
+ ///
+ /// \return std::nullopt if the validity bits differ, otherwise a bool
+ /// containing the validity bit found in both arrays.
+ static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t
base_index,
+ const ArrayType& target,
+ int64_t target_index) {
+ const bool base_valid = base.IsValid(base_index);
+ const bool target_valid = target.IsValid(target_index);
+ return base_valid ^ target_valid ? std::nullopt :
std::optional<bool>{base_valid};
+ }
+
+ bool Equals(int64_t base_index, int64_t target_index) override {
+ const auto valid = ValidityIfEquals(base, base_index, target,
target_index);
+ if (valid.has_value()) {
+ return *valid ? GetView(base, base_index) == GetView(target,
target_index) : true;
+ }
+ return false;
+ }
+};
+
+template <typename RunEndCType>
+class REEValueComparator : public ValueComparator {
+ private:
+ const RunEndEncodedArray& base_;
+ const RunEndEncodedArray& target_;
+ std::unique_ptr<ValueComparator> inner_value_comparator_;
+ ree_util::PhysicalIndexFinder<RunEndCType> base_physical_index_finder_;
+ ree_util::PhysicalIndexFinder<RunEndCType> target_physical_index_finder_;
+
+ public:
+ REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray&
target,
+ std::unique_ptr<ValueComparator>&& inner_value_comparator)
+ : base_(base),
+ target_(target),
+ inner_value_comparator_(std::move(inner_value_comparator)),
+ base_physical_index_finder_(*base_.data()),
+ target_physical_index_finder_(*target_.data()) {
+ DCHECK_EQ(*base_.type(), *target_.type());
+ }
+
+ ~REEValueComparator() override = default;
-struct ValueComparatorVisitor {
+ private:
+ /// \pre 0 <= i < base_.length()
+ inline int64_t FindPhysicalIndexOnBase(int64_t i) {
+ return base_physical_index_finder_.FindPhysicalIndex(i);
+ }
+
+ /// \pre 0 <= i < target_.length()
+ inline int64_t FindPhysicalIndexOnTarget(int64_t i) {
+ return target_physical_index_finder_.FindPhysicalIndex(i);
+ }
+
+ const RunEndCType* base_run_ends() { return
base_physical_index_finder_.run_ends; }
+
+ const RunEndCType* target_run_ends() { return
target_physical_index_finder_.run_ends; }
+
+ public:
+ int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+ int64_t target_index, int64_t target_length)
override {
+ // Ensure the first search for physical index on the values arrays is safe.
+ if (base_index >= base_length || target_index >= target_length) {
+ // Without values on either side, there is no run of equal values.
+ return 0;
+ }
+
+ // Translate the two logical indices into physical indices.
+ int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+ int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+
+ int64_t run_length_of_equals = 0;
+ // The loop invariant (base_index < base_length && target_index <
target_length)
+ // is valid when the loop starts because of the check above.
+ for (;;) {
+ const auto base_run_end =
+ static_cast<int64_t>(base_run_ends()[physical_base_index]) -
base_.offset();
+ const auto target_run_end =
+ static_cast<int64_t>(target_run_ends()[physical_target_index]) -
+ target_.offset();
+ // The end of the runs containing the logical indices, by definition,
ends
+ // after the logical indices.
+ DCHECK_LT(base_index, base_run_end);
+ DCHECK_LT(target_index, target_run_end);
+
+ // Compare the physical values that make up the runs containing
base_index
+ // and target_index.
+ if (!inner_value_comparator_->Equals(physical_base_index,
physical_target_index)) {
+ // First difference found, stop because the run of equal values cannot
+ // be extended further.
+ break;
+ }
+
+ const int64_t base_run = std::min(base_run_end, base_length) -
base_index;
+ const int64_t target_run = std::min(target_run_end, target_length) -
target_index;
+ // Due to the loop-invariant (base_index < base_length && target_index <
+ // target_length) and properties of the run-ends asserted above, both
base_run and
+ // target_run are strictly greater than zero.
+ DCHECK_GT(base_run, 0);
+ DCHECK_GT(target_run, 0);
+
+ int64_t increment = 0;
+ if (base_run < target_run) {
+ increment = base_run;
+ physical_base_index += 1; // skip to next run on base
+ } else if (base_run > target_run) {
+ increment = target_run;
+ physical_target_index += 1; // skip to next run on target
+ } else {
+ increment = base_run;
+ // skip to next run on both base and target
+ physical_base_index += 1;
+ physical_target_index += 1;
+ }
Review Comment:
This is very clever, but drops all the comments explaining why each
increment is justified.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]