This is an automated email from the ASF dual-hosted git repository. bkietz pushed a commit to branch feature/format-string-view in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 21be7a9a25c59ba89709ccada71d771d1a2cd36b Author: Benjamin Kietzman <[email protected]> AuthorDate: Mon Nov 28 14:09:59 2022 -0500 Extract visitation of views owning buffers --- cpp/src/arrow/array/array_binary_test.cc | 8 + cpp/src/arrow/array/array_test.cc | 52 +++---- cpp/src/arrow/array/validate.cc | 242 ++++++++++++++++++------------- cpp/src/arrow/buffer.h | 8 + cpp/src/arrow/testing/random.cc | 10 ++ 5 files changed, 190 insertions(+), 130 deletions(-) diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 92fc16f775..f21abf681f 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -389,6 +389,14 @@ TEST(StringViewArray, Validate) { .ValidateFull(), Ok()); + // overlapping views and buffers are allowed + EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}), + StringHeader(std::string_view{*buffer_s}.substr(5, 5)), + StringHeader(std::string_view{*buffer_s}.substr(9, 4))}, + {buffer_s, SliceBuffer(buffer_s, 1, 1), SliceBuffer(buffer_s, 3, 6)}) + .ValidateFull(), + Ok()); + EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}), // if a view points outside the buffers, that is invalid StringHeader("from a galaxy far, far away"), diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index c14d4f21ac..36b274a99a 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -663,32 +663,32 @@ TEST_F(TestArray, TestMakeEmptyArray) { FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))}); std::vector<int8_t> union_type_codes{7, 42}; - std::shared_ptr<DataType> types[] = {null(), - boolean(), - int8(), - uint16(), - int32(), - uint64(), - float64(), - binary(), - large_binary(), - fixed_size_binary(3), - decimal(16, 4), - utf8(), - large_utf8(), - list(utf8()), - list(int64()), - large_list(large_utf8()), - fixed_size_list(utf8(), 3), - fixed_size_list(int64(), 4), - dictionary(int32(), utf8()), - struct_({field("a", utf8()), field("b", int32())}), - sparse_union(union_fields1, union_type_codes), - sparse_union(union_fields2, union_type_codes), - dense_union(union_fields1, union_type_codes), - dense_union(union_fields2, union_type_codes)}; - - for (auto type : types) { + for (auto type : {null(), + boolean(), + int8(), + uint16(), + int32(), + uint64(), + float64(), + binary(), + binary_view(), + large_binary(), + fixed_size_binary(3), + decimal(16, 4), + utf8(), + utf8_view(), + large_utf8(), + list(utf8()), + list(int64()), + large_list(large_utf8()), + fixed_size_list(utf8(), 3), + fixed_size_list(int64(), 4), + dictionary(int32(), utf8()), + struct_({field("a", utf8()), field("b", int32())}), + sparse_union(union_fields1, union_type_codes), + sparse_union(union_fields2, union_type_codes), + dense_union(union_fields1, union_type_codes), + dense_union(union_fields2, union_type_codes)}) { ARROW_SCOPED_TRACE("type = ", type->ToString()); ASSERT_OK_AND_ASSIGN(auto array, MakeEmptyArray(type)); ASSERT_OK(array->ValidateFull()); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 53836efd97..2b83c84b28 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -30,13 +30,141 @@ #include "arrow/util/decimal.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" +#include "arrow/util/sort.h" +#include "arrow/util/string.h" #include "arrow/util/unreachable.h" #include "arrow/util/utf8.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" -namespace arrow { -namespace internal { +namespace arrow::internal { + +/// visitor will be called once for each non-inlined StringHeader. +/// It will be passed the index of each non-inlined StringHeader, +/// as well as a `const shared_ptr<Buffer>&` of the buffer +/// wherein the viewed memory resides, or nullptr if the viewed memory +/// is not in a buffer managed by this array. +template <typename Visitor> +Status VisitNonInlinedViewsAndOwningBuffers(const ArrayData& data, + const Visitor& visitor) { + auto* headers = data.buffers[1]->data_as<StringHeader>(); + + static const std::shared_ptr<Buffer> kNullBuffer = nullptr; + + if (data.buffers.size() == 2 || + (data.buffers.size() == 3 && data.buffers.back() == nullptr)) { + // there are no character buffers, just visit a null buffer + for (int64_t i = 0; i < data.length; ++i) { + if (headers[i].IsInline()) continue; + RETURN_NOT_OK(visitor(i, kNullBuffer)); + } + return Status::OK(); + } + + auto IsSubrangeOf = [](std::string_view super, StringHeader sub) { + return super.data() <= sub.data() && + super.data() + super.size() >= sub.data() + sub.size(); + }; + + std::vector<std::string_view> buffers; + std::vector<const std::shared_ptr<Buffer>*> owning_buffers; + for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) { + if (*it != nullptr) { + buffers.emplace_back(**it); + owning_buffers.push_back(&*it); + } + } + + const int not_found = static_cast<int>(buffers.size()); + + auto DoVisit = [&](auto get_buffer) { + DCHECK(!buffers.empty()); + + // owning_buffers[not_found] points to the null placeholder + owning_buffers.push_back(&kNullBuffer); + + std::string_view buffer_containing_previous_view = buffers.front(); + int buffer_i = 0; + + for (int64_t i = 0; i < data.length; ++i) { + if (headers[i].IsInline()) continue; + + if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, headers[i]))) { + // Fast path: for most string view arrays, we'll have runs + // of views into the same buffer. + } else { + buffer_i = get_buffer(headers[i]); + if (buffer_i != not_found) { + // if we didn't find a buffer which owns headers[i], we can hope + // that there was just one out of line string and check + // buffer_containing_previous_view next iteration + buffer_containing_previous_view = buffers[buffer_i]; + } + } + + RETURN_NOT_OK(visitor(i, *owning_buffers[buffer_i])); + } + return Status::OK(); + }; + + // Simplest check for view-in-buffer: loop through buffers and check each one. + auto Linear = [&](StringHeader view) { + int i = 0; + for (std::string_view buffer : buffers) { + if (IsSubrangeOf(buffer, view)) return i; + ++i; + } + return not_found; + }; + + if (buffers.size() <= 32) { + // If there are few buffers to search through, sorting/binary search is not + // worthwhile. TODO(bkietz) benchmark this and get a less magic number here. + return DoVisit(Linear); + } + + auto DataPtrLess = [](std::string_view l, std::string_view r) { + return l.data() < r.data(); + }; + + { + auto sort_indices = ArgSort(buffers, DataPtrLess); + Permute(sort_indices, &buffers); + Permute(sort_indices, &owning_buffers); + } + + bool non_overlapping = + buffers.end() != + std::adjacent_find(buffers.begin(), buffers.end(), + [](std::string_view before, std::string_view after) { + return before.data() + before.size() <= after.data(); + }); + if (ARROW_PREDICT_FALSE(!non_overlapping)) { + // Using a binary search with overlapping buffers would not *uniquely* identify + // a potentially-containing buffer. Moreover this should be a fairly rare case + // so optimizing for it seems premature. + return DoVisit(Linear); + } + + // More sophisticated check for view-in-buffer: binary search through the buffers. + return DoVisit([&](StringHeader view) { + // Find the first buffer whose data starts after the data in view- + // only buffers *before* this could contain view. Since we've additionally + // checked that the buffers do not overlap, only the buffer *immediately before* + // this could contain view. + int one_past_potential_super = + static_cast<int>(std::upper_bound(buffers.begin(), buffers.end(), + std::string_view{view}, DataPtrLess) - + buffers.begin()); + + if (one_past_potential_super == 0) return not_found; + + int i = one_past_potential_super - 1; + if (IsSubrangeOf(buffers[i], view)) return i; + + return not_found; + }); +} namespace { @@ -610,109 +738,16 @@ struct ValidateArrayImpl { return Status::OK(); } - auto* headers = data.GetValues<StringHeader>(1); - std::string_view buffer_containing_previous_view; - - auto IsSubrangeOf = [](std::string_view super, std::string_view sub) { - return super.data() <= sub.data() && - super.data() + super.size() >= sub.data() + sub.size(); - }; - - std::vector<std::string_view> buffers; - for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) { - buffers.emplace_back(**it); - } - - auto CheckViews = [&](auto in_a_buffer, auto check_previous_buffer) { - if constexpr (check_previous_buffer) { - buffer_containing_previous_view = buffers.front(); - } - - for (int64_t i = 0; i < data.length; ++i) { - if (headers[i].IsInline()) continue; - - std::string_view view{headers[i]}; - - if constexpr (check_previous_buffer) { - if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, view))) { - // Fast path: for most string view arrays, we'll have runs - // of views into the same buffer. - continue; - } - } + return VisitNonInlinedViewsAndOwningBuffers( + data, [&](int64_t i, const std::shared_ptr<Buffer>& owner) { + if (ARROW_PREDICT_TRUE(owner != nullptr)) return Status::OK(); - if (!in_a_buffer(view)) { + auto* ptr = data.buffers[1]->data_as<StringHeader>()[i].data(); return Status::Invalid( - "String view at slot ", i, " @", (std::uintptr_t)view.data(), + "String view at slot ", i, " @", + arrow::HexEncode(reinterpret_cast<uint8_t*>(&ptr), sizeof(ptr)), " views memory not resident in any buffer managed by the array"); - } - } - return Status::OK(); - }; - - if (buffers.empty()) { - // there are no character buffers; the only way this array - // can be valid is if all views are inline - return CheckViews([](std::string_view) { return std::false_type{}; }, - /*check_previous_buffer=*/std::false_type{}); - } - - // Simplest check for view-in-buffer: loop through buffers and check each one. - auto Linear = [&](std::string_view view) { - for (std::string_view buffer : buffers) { - if (IsSubrangeOf(buffer, view)) { - buffer_containing_previous_view = buffer; - return true; - } - } - return false; - }; - - if (buffers.size() <= 32) { - // If there are few buffers to search through, sorting/binary search is not - // worthwhile. TODO(bkietz) benchmark this and get a less magic number here. - return CheckViews(Linear, - /*check_previous_buffer=*/std::true_type{}); - } - - auto DataPtrLess = [](std::string_view l, std::string_view r) { - return l.data() < r.data(); - }; - - std::sort(buffers.begin(), buffers.end(), DataPtrLess); - bool non_overlapping = - buffers.end() != - std::adjacent_find(buffers.begin(), buffers.end(), - [](std::string_view before, std::string_view after) { - return before.data() + before.size() <= after.data(); - }); - if (ARROW_PREDICT_FALSE(!non_overlapping)) { - // Using a binary search with overlapping buffers would not *uniquely* identify - // a potentially-containing buffer. Moreover this should be a fairly rare case - // so optimizing for it seems premature. - return CheckViews(Linear, - /*check_previous_buffer=*/std::true_type{}); - } - - // More sophisticated check for view-in-buffer: binary search through the buffers. - return CheckViews( - [&](std::string_view view) { - // Find the first buffer whose data starts after the data in view- - // only buffers *before* this could contain view. Since we've additionally - // checked that the buffers do not overlap, only the buffer *immediately before* - // this could contain view. - auto one_past_potential_super = - std::upper_bound(buffers.begin(), buffers.end(), view, DataPtrLess); - - if (one_past_potential_super == buffers.begin()) return false; - - auto potential_super = *(one_past_potential_super - 1); - if (!IsSubrangeOf(potential_super, view)) return false; - - buffer_containing_previous_view = potential_super; - return true; - }, - /*check_previous_buffer=*/std::true_type{}); + }); } template <typename ListType> @@ -863,5 +898,4 @@ Status ValidateUTF8(const ArrayData& data) { ARROW_EXPORT Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 27b1a1edac..85135df1a3 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -182,6 +182,10 @@ class ARROW_EXPORT Buffer { #endif return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR; } + template <typename T> + const T* data_as() const { + return reinterpret_cast<const T*>(data()); + } /// \brief Return a writable pointer to the buffer's data /// @@ -199,6 +203,10 @@ class ARROW_EXPORT Buffer { return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_) : NULLPTR; } + template <typename T> + T* mutable_data_as() { + return reinterpret_cast<T*>(mutable_data()); + } /// \brief Return the device address of the buffer's data uintptr_t address() const { return reinterpret_cast<uintptr_t>(data_); } diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index e45e296ff6..137a5c031a 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -828,6 +828,16 @@ std::shared_ptr<Array> RandomArrayGenerator::ArrayOf(const Field& field, int64_t ->View(field.type()); } + case Type::type::STRING_VIEW: + case Type::type::BINARY_VIEW: { + const auto min_length = + GetMetadata<int32_t>(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata<int32_t>(field.metadata().get(), "max_length", 20); + return *StringView(length, min_length, max_length, null_probability) + ->View(field.type()); + } + case Type::type::DECIMAL128: return Decimal128(field.type(), length, null_probability, alignment, memory_pool);
