This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 422174e9b6 GH-35675 [C++] Don't copy the ArraySpan into the REE
ArraySpan (#35677)
422174e9b6 is described below
commit 422174e9b6c926fdb6f55741f1484e9b3e0cb001
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Tue May 30 12:27:52 2023 -0300
GH-35675 [C++] Don't copy the ArraySpan into the REE ArraySpan (#35677)
### Rationale for this change
Copying ArraySpan is not without issues (see #35581), and this change seems
good by itself as it makes it easier for the compiler to SROA [1] the whole
`RunEndEncodedArraySpan` class.
[1]
https://www.llvm.org/docs/Passes.html#sroa-scalar-replacement-of-aggregates
### What changes are included in this PR?
- Make `array_span` private in `ree_util::RunEndEncodedArraySpan`
- Allow construction of `ree_util::RunEndEncodedArraySpan` with separate
`offset` and `length`
- Change `RunEndEncodedBuilder` to avoid a copy of the `ArraySpan` being
iterated over
- Prevent instantiation based on implicit conversion from `ArrayData` to
`ArraySpan`
### Are these changes tested?
Yes, by the existing unit tests of the various uses of
`RunEndEncodedArraySpan`.
### Are there any user-facing changes?
No meaningful changes other than some small tweaks in the interface of a
recently added class.
* Closes: #35675
Authored-by: Felipe Oliveira Carvalho <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/array/builder_run_end.cc | 23 ++++++++---------
cpp/src/arrow/array/builder_run_end.h | 2 +-
cpp/src/arrow/util/ree_util.h | 47 ++++++++++++++++++++++++----------
cpp/src/arrow/util/ree_util_test.cc | 6 +++--
4 files changed, 49 insertions(+), 29 deletions(-)
diff --git a/cpp/src/arrow/array/builder_run_end.cc
b/cpp/src/arrow/array/builder_run_end.cc
index ec285b8e3a..cff8d72952 100644
--- a/cpp/src/arrow/array/builder_run_end.cc
+++ b/cpp/src/arrow/array/builder_run_end.cc
@@ -220,18 +220,21 @@ Status RunEndEncodedBuilder::AppendScalars(const
ScalarVector& scalars) {
}
template <typename RunEndCType>
-Status RunEndEncodedBuilder::DoAppendArray(const ArraySpan& to_append) {
- DCHECK_GT(to_append.length, 0);
+Status RunEndEncodedBuilder::DoAppendArraySlice(const ArraySpan& array,
int64_t offset,
+ int64_t length) {
+ ARROW_DCHECK(offset + length <= array.length);
+ DCHECK_GT(length, 0);
DCHECK(!value_run_builder_->has_open_run());
- ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(to_append);
+ ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(array, array.offset +
offset,
+ length);
const int64_t physical_offset = ree_span.PhysicalIndex(0);
const int64_t physical_length =
ree_span.PhysicalIndex(ree_span.length() - 1) + 1 - physical_offset;
RETURN_NOT_OK(ReservePhysical(physical_length));
- // Append all the run ends from to_append
+ // Append all the run ends from array sliced by offset and length
for (auto it = ree_span.iterator(0, physical_offset); !it.is_end(ree_span);
++it) {
const int64_t run_end = committed_logical_length_ + it.run_length();
RETURN_NOT_OK(DoAppendRunEnd<RunEndCType>(run_end));
@@ -240,14 +243,13 @@ Status RunEndEncodedBuilder::DoAppendArray(const
ArraySpan& to_append) {
// Append all the values directly
RETURN_NOT_OK(value_run_builder_->AppendRunCompressedArraySlice(
- ree_util::ValuesArray(to_append), physical_offset, physical_length));
+ ree_util::ValuesArray(array), physical_offset, physical_length));
return Status::OK();
}
Status RunEndEncodedBuilder::AppendArraySlice(const ArraySpan& array, int64_t
offset,
int64_t length) {
- ARROW_DCHECK(offset + length <= array.length);
ARROW_DCHECK(array.type->Equals(type_));
// Ensure any open run is closed before appending the array slice.
@@ -257,18 +259,15 @@ Status RunEndEncodedBuilder::AppendArraySlice(const
ArraySpan& array, int64_t of
return Status::OK();
}
- ArraySpan to_append = array;
- to_append.SetSlice(array.offset + offset, length);
-
switch (type_->run_end_type()->id()) {
case Type::INT16:
- RETURN_NOT_OK(DoAppendArray<int16_t>(to_append));
+ RETURN_NOT_OK(DoAppendArraySlice<int16_t>(array, offset, length));
break;
case Type::INT32:
- RETURN_NOT_OK(DoAppendArray<int32_t>(to_append));
+ RETURN_NOT_OK(DoAppendArraySlice<int32_t>(array, offset, length));
break;
case Type::INT64:
- RETURN_NOT_OK(DoAppendArray<int64_t>(to_append));
+ RETURN_NOT_OK(DoAppendArraySlice<int64_t>(array, offset, length));
break;
default:
return Status::Invalid("Invalid type for run ends array: ",
type_->run_end_type());
diff --git a/cpp/src/arrow/array/builder_run_end.h
b/cpp/src/arrow/array/builder_run_end.h
index 9764c57992..ac92efbd0d 100644
--- a/cpp/src/arrow/array/builder_run_end.h
+++ b/cpp/src/arrow/array/builder_run_end.h
@@ -273,7 +273,7 @@ class ARROW_EXPORT RunEndEncodedBuilder : public
ArrayBuilder {
// Pre-condition: !value_run_builder_.has_open_run()
template <typename RunEndCType>
- Status DoAppendArray(const ArraySpan& to_append);
+ Status DoAppendArraySlice(const ArraySpan& array, int64_t offset, int64_t
length);
template <typename RunEndCType>
Status DoAppendRunEnd(int64_t run_end);
diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h
index 8c8f58a151..e708eb0b59 100644
--- a/cpp/src/arrow/util/ree_util.h
+++ b/cpp/src/arrow/util/ree_util.h
@@ -227,20 +227,39 @@ class RunEndEncodedArraySpan {
int64_t physical_pos_;
};
- explicit RunEndEncodedArraySpan(const ArrayData& data)
- : RunEndEncodedArraySpan(ArraySpan{data}) {}
+ // Prevent implicit ArrayData -> ArraySpan conversion in
+ // RunEndEncodedArraySpan instantiation.
+ explicit RunEndEncodedArraySpan(const ArrayData& data) = delete;
- explicit RunEndEncodedArraySpan(const ArraySpan& array_span)
- : array_span{array_span}, run_ends_(RunEnds<RunEndCType>(array_span)) {
- assert(array_span.type->id() == Type::RUN_END_ENCODED);
+ /// \brief Construct a RunEndEncodedArraySpan from an ArraySpan and new
+ /// absolute offset and length.
+ ///
+ /// RunEndEncodedArraySpan{span, off, len} is equivalent to:
+ ///
+ /// span.SetSlice(off, len);
+ /// RunEndEncodedArraySpan{span}
+ ///
+ /// ArraySpan::SetSlice() updates the null_count to kUnknownNullCount, but
+ /// we don't need that here as REE arrays have null_count set to 0 by
+ /// convention.
+ explicit RunEndEncodedArraySpan(const ArraySpan& array_span, int64_t offset,
+ int64_t length)
+ : array_span_{array_span},
+ run_ends_(RunEnds<RunEndCType>(array_span_)),
+ length_(length),
+ offset_(offset) {
+ assert(array_span_.type->id() == Type::RUN_END_ENCODED);
}
- int64_t length() const { return array_span.length; }
- int64_t offset() const { return array_span.offset; }
+ explicit RunEndEncodedArraySpan(const ArraySpan& array_span)
+ : RunEndEncodedArraySpan(array_span, array_span.offset,
array_span.length) {}
+
+ int64_t offset() const { return offset_; }
+ int64_t length() const { return length_; }
int64_t PhysicalIndex(int64_t logical_pos) const {
- return internal::FindPhysicalIndex(run_ends_,
RunEndsArray(array_span).length,
- logical_pos, offset());
+ return internal::FindPhysicalIndex(run_ends_,
RunEndsArray(array_span_).length,
+ logical_pos, offset_);
}
/// \brief Create an iterator from a logical position and its
@@ -296,9 +315,9 @@ class RunEndEncodedArraySpan {
(length() == 0) ? PhysicalIndex(0) :
PhysicalIndex(length() - 1) + 1);
}
- // Pre-condition: physical_pos < RunEndsArray(array_span).length);
+ // Pre-condition: physical_pos < RunEndsArray(array_span_).length);
inline int64_t run_end(int64_t physical_pos) const {
- assert(physical_pos < RunEndsArray(array_span).length);
+ assert(physical_pos < RunEndsArray(array_span_).length);
// Logical index of the end of the run at physical_pos with offset applied
const int64_t logical_run_end =
std::max<int64_t>(static_cast<int64_t>(run_ends_[physical_pos]) -
offset(), 0);
@@ -306,11 +325,11 @@ class RunEndEncodedArraySpan {
return std::min(logical_run_end, length());
}
- public:
- const ArraySpan array_span;
-
private:
+ const ArraySpan& array_span_;
const RunEndCType* run_ends_;
+ const int64_t length_;
+ const int64_t offset_;
};
/// \brief Iterate over two run-end encoded arrays in runs or sub-runs that are
diff --git a/cpp/src/arrow/util/ree_util_test.cc
b/cpp/src/arrow/util/ree_util_test.cc
index 5f630c3028..966cbd8f38 100644
--- a/cpp/src/arrow/util/ree_util_test.cc
+++ b/cpp/src/arrow/util/ree_util_test.cc
@@ -167,8 +167,10 @@ TYPED_TEST_P(ReeUtilTest, MergedRunsInterator) {
RunEndEncodedArray::Make(2050, right_run_ends,
right_child));
left_array = left_array->Slice(left_parent_offset);
right_array = right_array->Slice(right_parent_offset);
- const RunEndEncodedArraySpan<TypeParam> left_ree_span(*left_array->data());
- const RunEndEncodedArraySpan<TypeParam> right_ree_span(*right_array->data());
+ ArraySpan left_array_span(*left_array->data());
+ ArraySpan right_array_span(*right_array->data());
+ const RunEndEncodedArraySpan<TypeParam> left_ree_span(left_array_span);
+ const RunEndEncodedArraySpan<TypeParam> right_ree_span(right_array_span);
{
ARROW_SCOPED_TRACE("iterate over merged(left, right)");