bkietz commented on a change in pull request #7311:
URL: https://github.com/apache/arrow/pull/7311#discussion_r433297868
##########
File path: cpp/src/arrow/array/diff.cc
##########
@@ -181,17 +146,37 @@ internal::LazyRange<NullOrViewGenerator<ArrayType>>
MakeNullOrViewRange(
/// representation is minimal in the common case where the sequences differ
only slightly,
/// since most of the elements are shared between base and target and are
represented
/// implicitly.
-template <typename Iterator>
class QuadraticSpaceMyersDiff {
public:
- // represents an intermediate state in the comparison of two arrays
- struct EditPoint {
- Iterator base, target;
+ QuadraticSpaceMyersDiff(const Array& base, const Array& target, MemoryPool*
pool)
+ : base_(base),
+ target_(target),
+ pool_(pool),
+ value_comparator_(GetValueComparator(*base.type())),
+ base_begin_(0),
+ base_end_(base.length()),
+ target_begin_(0),
+ target_end_(target.length()),
+ endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}),
+ insert_({true}) {
+ if ((base_end_ - base_begin_ == target_end_ - target_begin_) &&
+ endpoint_base_[0] == base_end_) {
+ // trivial case: base == target
+ finish_index_ = 0;
+ }
+ }
- bool operator==(EditPoint other) const {
- return base == other.base && target == other.target;
+ bool ValuesEqual(int64_t base_index, int64_t target_index) const {
+ bool base_null = base_.IsNull(base_index);
+ bool target_null = target_.IsNull(target_index);
+ if (base_null && target_null) {
+ return true;
+ } else if (base_null || target_null) {
Review comment:
Coding style: don't use `else` if the previous block returns
##########
File path: cpp/src/arrow/array/diff.cc
##########
@@ -342,106 +311,74 @@ class QuadraticSpaceMyersDiff {
{field("insert", boolean()), field("run_length", int64())});
}
+ Result<std::shared_ptr<StructArray>> Diff() {
+ while (!Done()) {
+ Next();
+ }
+ return GetEdits(pool_);
+ }
+
private:
+ const Array& base_;
+ const Array& target_;
+ MemoryPool* pool_;
+ ValueComparator value_comparator_;
int64_t finish_index_ = -1;
int64_t edit_count_ = 0;
- Iterator base_begin_, base_end_;
- Iterator target_begin_, target_end_;
+ int64_t base_begin_, base_end_;
+ int64_t target_begin_, target_end_;
// each element of endpoint_base_ is the furthest position in base reachable
given an
// edit_count and (# insertions) - (# deletions). Each bit of insert_
records whether
// the corresponding furthest position was reached via an insertion or a
deletion
// (followed by a run of shared elements). See StorageOffset for the
// layout of these vectors
- std::vector<Iterator> endpoint_base_;
+ std::vector<int64_t> endpoint_base_;
std::vector<bool> insert_;
};
-struct DiffImpl {
- Status Visit(const NullType&) {
- bool insert = base_.length() < target_.length();
- auto run_length = std::min(base_.length(), target_.length());
- auto edit_count = std::max(base_.length(), target_.length()) - run_length;
-
- TypedBufferBuilder<bool> insert_builder(pool_);
- RETURN_NOT_OK(insert_builder.Resize(edit_count + 1));
- insert_builder.UnsafeAppend(false);
- TypedBufferBuilder<int64_t> run_length_builder(pool_);
- RETURN_NOT_OK(run_length_builder.Resize(edit_count + 1));
- run_length_builder.UnsafeAppend(run_length);
- if (edit_count > 0) {
- insert_builder.UnsafeAppend(edit_count, insert);
- run_length_builder.UnsafeAppend(edit_count, 0);
- }
-
- std::shared_ptr<Buffer> insert_buf, run_length_buf;
- RETURN_NOT_OK(insert_builder.Finish(&insert_buf));
- RETURN_NOT_OK(run_length_builder.Finish(&run_length_buf));
-
- ARROW_ASSIGN_OR_RAISE(
- out_,
- StructArray::Make({std::make_shared<BooleanArray>(edit_count + 1,
insert_buf),
- std::make_shared<Int64Array>(edit_count + 1,
run_length_buf)},
- {field("insert", boolean()), field("run_length",
int64())}));
- return Status::OK();
- }
-
- template <typename T>
- Status Visit(const T&) {
- using ArrayType = typename TypeTraits<T>::ArrayType;
- if (base_.null_count() == 0 && target_.null_count() == 0) {
- auto base = MakeViewRange<ArrayType>(base_);
- auto target = MakeViewRange<ArrayType>(target_);
- ARROW_ASSIGN_OR_RAISE(out_,
- Diff(base.begin(), base.end(), target.begin(),
target.end()));
- } else {
- auto base = MakeNullOrViewRange<ArrayType>(base_);
- auto target = MakeNullOrViewRange<ArrayType>(target_);
- ARROW_ASSIGN_OR_RAISE(out_,
- Diff(base.begin(), base.end(), target.begin(),
target.end()));
- }
- return Status::OK();
- }
-
- Status Visit(const ExtensionType&) {
- auto base = checked_cast<const ExtensionArray&>(base_).storage();
- auto target = checked_cast<const ExtensionArray&>(target_).storage();
- ARROW_ASSIGN_OR_RAISE(out_, arrow::Diff(*base, *target, pool_));
- return Status::OK();
- }
-
- Status Visit(const DictionaryType& t) {
- return Status::NotImplemented("diffing arrays of type ", t);
- }
-
- Result<std::shared_ptr<StructArray>> Diff() {
- RETURN_NOT_OK(VisitTypeInline(*base_.type(), this));
- return out_;
- }
-
- template <typename Iterator>
- Result<std::shared_ptr<StructArray>> Diff(Iterator base_begin, Iterator
base_end,
- Iterator target_begin, Iterator
target_end) {
- QuadraticSpaceMyersDiff<Iterator> impl(base_begin, base_end, target_begin,
- target_end);
- while (!impl.Done()) {
- impl.Next();
- }
- return impl.GetEdits(pool_);
- }
-
- const Array& base_;
- const Array& target_;
- MemoryPool* pool_;
- std::shared_ptr<StructArray> out_;
-};
+Result<std::shared_ptr<StructArray>> NullDiff(const Array& base, const Array&
target,
+ MemoryPool* pool) {
+ bool insert = base.length() < target.length();
+ auto run_length = std::min(base.length(), target.length());
+ auto edit_count = std::max(base.length(), target.length()) - run_length;
+
+ TypedBufferBuilder<bool> insert_builder(pool);
+ RETURN_NOT_OK(insert_builder.Resize(edit_count + 1));
+ insert_builder.UnsafeAppend(false);
+ TypedBufferBuilder<int64_t> run_length_builder(pool);
+ RETURN_NOT_OK(run_length_builder.Resize(edit_count + 1));
+ run_length_builder.UnsafeAppend(run_length);
+ if (edit_count > 0) {
+ insert_builder.UnsafeAppend(edit_count, insert);
+ run_length_builder.UnsafeAppend(edit_count, 0);
+ }
+
+ std::shared_ptr<Buffer> insert_buf, run_length_buf;
+ RETURN_NOT_OK(insert_builder.Finish(&insert_buf));
+ RETURN_NOT_OK(run_length_builder.Finish(&run_length_buf));
+
+ return StructArray::Make({std::make_shared<BooleanArray>(edit_count + 1,
insert_buf),
+ std::make_shared<Int64Array>(edit_count + 1,
run_length_buf)},
+ {field("insert", boolean()), field("run_length",
int64())});
+}
Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array&
target,
MemoryPool* pool) {
if (!base.type()->Equals(target.type())) {
return Status::TypeError("only taking the diff of like-typed arrays is
supported.");
}
- return DiffImpl{base, target, pool, nullptr}.Diff();
+ if (base.type()->id() == Type::NA) {
+ return NullDiff(base, target, pool);
+ } else if (base.type()->id() == Type::EXTENSION) {
+ auto base_storage = checked_cast<const ExtensionArray&>(base).storage();
+ auto target_storage = checked_cast<const
ExtensionArray&>(target).storage();
+ return Diff(*base_storage, *target_storage, pool);
+ } else if (base.type()->id() == Type::EXTENSION) {
Review comment:
```suggestion
} else if (base.type()->id() == Type::DICTIONARY) {
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]