This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new e4bf235d43 GH-36502: [C++] Add run-end encoded array support to
ReferencedByteRanges (#36521)
e4bf235d43 is described below
commit e4bf235d43d9ef64b432db484d6798c9ef366f66
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Tue Jul 11 16:04:54 2023 -0300
GH-36502: [C++] Add run-end encoded array support to ReferencedByteRanges
(#36521)
### Rationale for this change
Fix for #36502.
### What changes are included in this PR?
Fix and C++ tests.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Addition of a new function to the `ree_util` namespace.
* Closes: #36502
Lead-authored-by: Felipe Oliveira Carvalho <[email protected]>
Co-authored-by: Benjamin Kietzman <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/array/array_run_end_test.cc | 24 +++++++++++++++++-
cpp/src/arrow/util/byte_size.cc | 15 +++++++++++
cpp/src/arrow/util/byte_size_test.cc | 27 ++++++++++++++++++++
cpp/src/arrow/util/ree_util.cc | 20 +++++++++++++++
cpp/src/arrow/util/ree_util.h | 41 ++++++++++++++++++++++++-------
5 files changed, 117 insertions(+), 10 deletions(-)
diff --git a/cpp/src/arrow/array/array_run_end_test.cc
b/cpp/src/arrow/array/array_run_end_test.cc
index bc8b929c53..3e8c658726 100644
--- a/cpp/src/arrow/array/array_run_end_test.cc
+++ b/cpp/src/arrow/array/array_run_end_test.cc
@@ -21,6 +21,7 @@
#include <cstdint>
#include <cstring>
#include <memory>
+#include <utility>
#include <vector>
#include "arrow/array.h"
@@ -127,34 +128,55 @@ TEST_P(TestRunEndEncodedArray, FromRunEndsAndValues) {
RunEndEncodedArray::Make(30, run_end_values, ArrayFromJSON(int32(), "[2,
0]")));
}
-TEST_P(TestRunEndEncodedArray, FindOffsetAndLength) {
+TEST_P(TestRunEndEncodedArray, FindPhysicalRange) {
auto run_ends = ArrayFromJSON(run_end_type, "[100, 200, 300, 400, 500]");
auto values = ArrayFromJSON(utf8(), R"(["Hello", "beautiful", "world", "of",
"REE"])");
ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(500, run_ends,
values));
+ auto Range = [](int64_t offset, int64_t length) -> std::pair<int64_t,
int64_t> {
+ return std::make_pair(offset, length);
+ };
ASSERT_EQ(ree_array->FindPhysicalOffset(), 0);
ASSERT_EQ(ree_array->FindPhysicalLength(), 5);
+ ASSERT_EQ(ree_util::FindPhysicalRange(*ree_array->data(),
ree_array->offset(),
+ ree_array->length()),
+ Range(0, 5));
auto slice =
std::dynamic_pointer_cast<RunEndEncodedArray>(ree_array->Slice(199, 5));
ASSERT_EQ(slice->FindPhysicalOffset(), 1);
ASSERT_EQ(slice->FindPhysicalLength(), 2);
+ ASSERT_EQ(ree_util::FindPhysicalRange(*slice->data(), slice->offset(),
slice->length()),
+ Range(1, 2));
auto slice2 =
std::dynamic_pointer_cast<RunEndEncodedArray>(ree_array->Slice(199, 101));
ASSERT_EQ(slice2->FindPhysicalOffset(), 1);
ASSERT_EQ(slice2->FindPhysicalLength(), 2);
+ ASSERT_EQ(
+ ree_util::FindPhysicalRange(*slice2->data(), slice2->offset(),
slice2->length()),
+ Range(1, 2));
auto slice3 =
std::dynamic_pointer_cast<RunEndEncodedArray>(ree_array->Slice(400, 100));
ASSERT_EQ(slice3->FindPhysicalOffset(), 4);
ASSERT_EQ(slice3->FindPhysicalLength(), 1);
+ ASSERT_EQ(
+ ree_util::FindPhysicalRange(*slice3->data(), slice3->offset(),
slice3->length()),
+ Range(4, 1));
auto slice4 =
std::dynamic_pointer_cast<RunEndEncodedArray>(ree_array->Slice(0, 150));
ASSERT_EQ(slice4->FindPhysicalOffset(), 0);
ASSERT_EQ(slice4->FindPhysicalLength(), 2);
+ ASSERT_EQ(
+ ree_util::FindPhysicalRange(*slice4->data(), slice4->offset(),
slice4->length()),
+ Range(0, 2));
auto zero_length_at_end =
std::dynamic_pointer_cast<RunEndEncodedArray>(ree_array->Slice(500, 0));
ASSERT_EQ(zero_length_at_end->FindPhysicalOffset(), 5);
ASSERT_EQ(zero_length_at_end->FindPhysicalLength(), 0);
+ ASSERT_EQ(ree_util::FindPhysicalRange(*zero_length_at_end->data(),
+ zero_length_at_end->offset(),
+ zero_length_at_end->length()),
+ Range(5, 0));
}
TEST_P(TestRunEndEncodedArray, LogicalRunEnds) {
diff --git a/cpp/src/arrow/util/byte_size.cc b/cpp/src/arrow/util/byte_size.cc
index fe232c9acc..548368757c 100644
--- a/cpp/src/arrow/util/byte_size.cc
+++ b/cpp/src/arrow/util/byte_size.cc
@@ -27,6 +27,7 @@
#include "arrow/record_batch.h"
#include "arrow/table.h"
#include "arrow/util/logging.h"
+#include "arrow/util/ree_util.h"
#include "arrow/visit_type_inline.h"
namespace arrow {
@@ -294,6 +295,20 @@ struct GetByteRangesArray {
return Status::OK();
}
+ Status Visit(const RunEndEncodedType& type) const {
+ auto [phys_offset, phys_length] = ree_util::FindPhysicalRange(input,
offset, length);
+ for (int i = 0; i < type.num_fields(); i++) {
+ GetByteRangesArray child{*input.child_data[i],
+ /*offset=*/input.child_data[i]->offset +
phys_offset,
+ /*length=*/phys_length,
+ range_starts,
+ range_offsets,
+ range_lengths};
+ RETURN_NOT_OK(VisitTypeInline(*type.field(i)->type(), &child));
+ }
+ return Status::OK();
+ }
+
Status Visit(const ExtensionType& extension_type) const {
GetByteRangesArray storage{input, offset, length,
range_starts, range_offsets, range_lengths};
diff --git a/cpp/src/arrow/util/byte_size_test.cc
b/cpp/src/arrow/util/byte_size_test.cc
index fc18049fdd..0aaf0a76a2 100644
--- a/cpp/src/arrow/util/byte_size_test.cc
+++ b/cpp/src/arrow/util/byte_size_test.cc
@@ -390,6 +390,33 @@ TEST(ByteRanges, SparseUnion) {
});
}
+TEST(ByteRanges, RunEndEncodedArray) {
+ auto run_ends =
+ ArrayFromJSON(int32(), "[-1, -1, 100, 200, 300, 400, 500,
-1]")->Slice(2, 5);
+ auto values = ArrayFromJSON(int32(), R"([-1, 1, 2, 3, 4, 5, -1,
-1])")->Slice(1, 5);
+ ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(500, run_ends,
values));
+ CheckBufferRanges(ree_array, {
+ {0, 8, 20},
+ {1, 4, 20},
+ });
+ CheckBufferRanges(ree_array->Slice(50), {
+ {0, 8, 20},
+ {1, 4, 20},
+ });
+ CheckBufferRanges(ree_array->Slice(100, 400), {
+ {0, 12, 16},
+ {1, 8, 16},
+ });
+ CheckBufferRanges(ree_array->Slice(100, 301), {
+ {0, 12, 16},
+ {1, 8, 16},
+ });
+ CheckBufferRanges(ree_array->Slice(100, 300), {
+ {0, 12, 12},
+ {1, 8, 12},
+ });
+}
+
TEST(ByteRanges, ExtensionArray) {
std::shared_ptr<Array> ext_arr = ExampleUuid();
CheckBufferRanges(ext_arr, {{0, 0, 1}, {1, 0, 64}});
diff --git a/cpp/src/arrow/util/ree_util.cc b/cpp/src/arrow/util/ree_util.cc
index 11da64313a..fcd6c204e0 100644
--- a/cpp/src/arrow/util/ree_util.cc
+++ b/cpp/src/arrow/util/ree_util.cc
@@ -85,6 +85,26 @@ int64_t FindPhysicalLength(const ArraySpan& span) {
return internal::FindPhysicalLength<int64_t>(span);
}
+std::pair<int64_t, int64_t> FindPhysicalRange(const ArraySpan& span, int64_t
offset,
+ int64_t length) {
+ const auto& run_ends_span = RunEndsArray(span);
+ auto type_id = run_ends_span.type->id();
+ if (type_id == Type::INT16) {
+ auto* run_ends = run_ends_span.GetValues<int16_t>(1);
+ return internal::FindPhysicalRange<int16_t>(run_ends,
run_ends_span.length, length,
+ offset);
+ }
+ if (type_id == Type::INT32) {
+ auto* run_ends = run_ends_span.GetValues<int32_t>(1);
+ return internal::FindPhysicalRange<int32_t>(run_ends,
run_ends_span.length, length,
+ offset);
+ }
+ DCHECK_EQ(type_id, Type::INT64);
+ auto* run_ends = run_ends_span.GetValues<int64_t>(1);
+ return internal::FindPhysicalRange<int64_t>(run_ends, run_ends_span.length,
length,
+ offset);
+}
+
namespace {
template <typename RunEndCType>
diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h
index e708eb0b59..5a240240b8 100644
--- a/cpp/src/arrow/util/ree_util.h
+++ b/cpp/src/arrow/util/ree_util.h
@@ -73,24 +73,38 @@ int64_t FindPhysicalIndex(const RunEndCType* run_ends,
int64_t run_ends_size, in
return result;
}
-/// \brief Uses binary-search to calculate the number of physical values (and
+/// \brief Uses binary-search to calculate the range of physical values (and
/// run-ends) necessary to represent the logical range of values from
/// offset to length
+///
+/// \return a pair of physical offset and physical length
template <typename RunEndCType>
-int64_t FindPhysicalLength(const RunEndCType* run_ends, int64_t run_ends_size,
- int64_t length, int64_t offset) {
+std::pair<int64_t, int64_t> FindPhysicalRange(const RunEndCType* run_ends,
+ int64_t run_ends_size, int64_t
length,
+ int64_t offset) {
+ const int64_t physical_offset =
+ FindPhysicalIndex<RunEndCType>(run_ends, run_ends_size, 0, offset);
// The physical length is calculated by finding the offset of the last
element
// and adding 1 to it, so first we ensure there is at least one element.
if (length == 0) {
- return 0;
+ return {physical_offset, 0};
}
- const int64_t physical_offset =
- FindPhysicalIndex<RunEndCType>(run_ends, run_ends_size, 0, offset);
const int64_t physical_index_of_last = FindPhysicalIndex<RunEndCType>(
run_ends + physical_offset, run_ends_size - physical_offset, length - 1,
offset);
assert(physical_index_of_last < run_ends_size - physical_offset);
- return physical_index_of_last + 1;
+ return {physical_offset, physical_index_of_last + 1};
+}
+
+/// \brief Uses binary-search to calculate the number of physical values (and
+/// run-ends) necessary to represent the logical range of values from
+/// offset to length
+template <typename RunEndCType>
+int64_t FindPhysicalLength(const RunEndCType* run_ends, int64_t run_ends_size,
+ int64_t length, int64_t offset) {
+ auto [_, physical_length] =
+ FindPhysicalRange<RunEndCType>(run_ends, run_ends_size, length, offset);
+ return physical_length;
}
/// \brief Find the physical index into the values array of the REE ArraySpan
@@ -125,7 +139,8 @@ int64_t FindPhysicalLength(const ArraySpan& span) {
/// \brief Find the physical index into the values array of the REE ArraySpan
///
/// This function uses binary-search, so it has a O(log N) cost.
-int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, int64_t
absolute_offset);
+ARROW_EXPORT int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i,
+ int64_t absolute_offset);
/// \brief Find the physical length of an REE ArraySpan
///
@@ -136,7 +151,15 @@ int64_t FindPhysicalIndex(const ArraySpan& span, int64_t
i, int64_t absolute_off
/// Avoid calling this function if the physical length can be estabilished in
/// some other way (e.g. when iterating over the runs sequentially until the
/// end). This function uses binary-search, so it has a O(log N) cost.
-int64_t FindPhysicalLength(const ArraySpan& span);
+ARROW_EXPORT int64_t FindPhysicalLength(const ArraySpan& span);
+
+/// \brief Find the physical range of physical values referenced by the REE in
+/// the logical range from offset to offset + length
+///
+/// \return a pair of physical offset and physical length
+ARROW_EXPORT std::pair<int64_t, int64_t> FindPhysicalRange(const ArraySpan&
span,
+ int64_t offset,
+ int64_t length);
template <typename RunEndCType>
class RunEndEncodedArraySpan {