This is an automated email from the ASF dual-hosted git repository. bkietz pushed a commit to branch feature/format-string-view in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 04893f65e92f57bba7b8ff0bbc201dfd17ff3aa0 Author: Benjamin Kietzman <[email protected]> AuthorDate: Fri Nov 18 17:04:47 2022 -0500 Adding comparison and concatenation --- cpp/src/arrow/array/builder_binary.h | 12 ++++++++---- cpp/src/arrow/array/concatenate.cc | 24 +++++++++++++++++++++++- cpp/src/arrow/array/concatenate_test.cc | 8 ++++++++ cpp/src/arrow/compare.cc | 8 +++++++- cpp/src/arrow/testing/random.cc | 14 +++++++++++--- cpp/src/arrow/testing/random.h | 16 ++++++++++++++++ 6 files changed, 73 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 30ab4b9d4a..ccfcb8b2b2 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -542,9 +542,16 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { public: using TypeClass = BinaryViewType; - BinaryViewBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool) + // this constructor provided for MakeBuilder compatibility + BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool) : BinaryViewBuilder(pool) {} + explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + data_builder_(pool, alignment), + data_heap_builder_(pool) {} + int64_t current_block_bytes_remaining() const { return data_heap_builder_.current_remaining_bytes(); } @@ -683,9 +690,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { std::shared_ptr<DataType> type() const override { return binary_view(); } protected: - explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), data_builder_(pool), data_heap_builder_(pool) {} - static constexpr int64_t ValueSizeLimit() { return std::numeric_limits<uint32_t>::max(); } diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 3dd0ccea93..6f7d61283e 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -228,7 +228,29 @@ class ConcatenateImpl { } Status Visit(const BinaryViewType&) { - return Status::NotImplemented("binary / string view"); + bool any_opted_out_of_view_validation = false; + out_->buffers.resize(2); + + for (const auto& in_data : in_) { + auto begin = in_data->buffers.begin() + 2; + auto end = in_data->buffers.end(); + + if (BinaryViewArray::OptedOutOfViewValidation(*in_data)) { + any_opted_out_of_view_validation = true; + --end; + } + + for (auto it = begin; it != end; ++it) { + out_->buffers.push_back(*it); + } + } + + if (any_opted_out_of_view_validation) { + out_->buffers = BinaryViewArray::DoNotValidateViews(std::move(out_->buffers)); + } + + ARROW_ASSIGN_OR_RAISE(auto header_buffers, Buffers(1, sizeof(StringHeader))); + return ConcatenateBuffers(header_buffers, pool_).Value(&out_->buffers[1]); } Status Visit(const ListType&) { diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index bff5d7eec1..1bc0c65bec 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -91,6 +91,7 @@ class ConcatenateTest : public ::testing::Test { for (auto null_probability : this->null_probabilities_) { std::shared_ptr<Array> array; factory(size, null_probability, &array); + ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); auto slices = this->Slices(array, offsets); ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices)); @@ -154,6 +155,13 @@ TEST_F(ConcatenateTest, StringType) { }); } +TEST_F(ConcatenateTest, StringViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) { + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability); + ASSERT_OK((**out).ValidateFull()); + }); +} + TEST_F(ConcatenateTest, LargeStringType) { Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) { *out = diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 68250f0288..5d1c3294c0 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -261,7 +261,13 @@ class RangeDataEqualsImpl { // Also matches StringViewType Status Visit(const BinaryViewType& type) { - return Status::NotImplemented("Binary / string view"); + auto* left_values = left_.GetValues<StringHeader>(1) + left_start_idx_; + auto* right_values = right_.GetValues<StringHeader>(1) + right_start_idx_; + VisitValidRuns([&](int64_t i, int64_t length) { + return std::equal(left_values + i, left_values + i + length, + right_values + i, right_values + i + length); + }); + return Status::OK(); } // Also matches LargeStringType diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 3213273474..e45e296ff6 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -362,13 +362,12 @@ std::shared_ptr<Array> RandomArrayGenerator::Decimal256(std::shared_ptr<DataType return gen.MakeRandomArray(size, null_probability, alignment, memory_pool); } -template <typename TypeClass> +template <typename TypeClass, typename offset_type = typename TypeClass::offset_type> static std::shared_ptr<Array> GenerateBinaryArray(RandomArrayGenerator* gen, int64_t size, int32_t min_length, int32_t max_length, double null_probability, int64_t alignment, MemoryPool* memory_pool) { - using offset_type = typename TypeClass::offset_type; using BuilderType = typename TypeTraits<TypeClass>::BuilderType; using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType; using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType; @@ -386,7 +385,7 @@ static std::shared_ptr<Array> GenerateBinaryArray(RandomArrayGenerator* gen, int /*null_probability=*/0); std::vector<uint8_t> str_buffer(max_length); - BuilderType builder(memory_pool, alignment); + BuilderType builder{memory_pool, alignment}; for (int64_t i = 0; i < size; ++i) { if (lengths->IsValid(i)) { @@ -429,6 +428,15 @@ std::shared_ptr<Array> RandomArrayGenerator::BinaryWithRepeats( return *strings->View(binary()); } +std::shared_ptr<Array> RandomArrayGenerator::StringView(int64_t size, int32_t min_length, + int32_t max_length, + double null_probability, + int64_t alignment, + MemoryPool* memory_pool) { + return GenerateBinaryArray<StringViewType, uint32_t>(this, size, min_length, max_length, + null_probability, alignment, memory_pool); +} + std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats( int64_t size, int64_t unique, int32_t min_length, int32_t max_length, double null_probability, int64_t alignment, MemoryPool* memory_pool) { diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index b2e3a609a2..5b905896f2 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -367,6 +367,22 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random StringViewArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min_length the lower bound of the string length + /// determined by the uniform distribution + /// \param[in] max_length the upper bound of the string length + /// determined by the uniform distribution + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] null_probability the probability of a value being null + /// + /// \return a generated Array + std::shared_ptr<Array> StringView(int64_t size, int32_t min_length, int32_t max_length, + double null_probability = 0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random LargeStringArray /// /// \param[in] size the size of the array to generate
