This is an automated email from the ASF dual-hosted git repository. bkietz pushed a commit to branch feature/format-string-view in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 8e1c1442feebe9af2db607e50abd4b9bb900e3fb Author: Wes McKinney <[email protected]> AuthorDate: Fri Sep 9 16:35:27 2022 -0500 Draft basic scaffolding for Binary/StringView types and get compiling --- LICENSE.txt | 16 +- cpp/src/arrow/array/array_base.cc | 4 + cpp/src/arrow/array/array_binary.cc | 12 + cpp/src/arrow/array/array_binary.h | 58 +++++ cpp/src/arrow/array/builder_binary.cc | 86 +++++++ cpp/src/arrow/array/builder_binary.h | 248 +++++++++++++++++++++ cpp/src/arrow/array/builder_dict.cc | 6 + cpp/src/arrow/array/builder_dict.h | 10 + cpp/src/arrow/array/concatenate.cc | 4 + cpp/src/arrow/array/util.cc | 13 ++ cpp/src/arrow/array/validate.cc | 20 +- cpp/src/arrow/compare.cc | 13 +- cpp/src/arrow/ipc/feather.cc | 4 +- cpp/src/arrow/ipc/metadata_internal.cc | 10 + cpp/src/arrow/ipc/reader.cc | 5 + cpp/src/arrow/ipc/writer.cc | 4 + cpp/src/arrow/json/test_common.h | 10 +- cpp/src/arrow/scalar.cc | 14 ++ cpp/src/arrow/scalar.h | 29 +++ cpp/src/arrow/testing/json_internal.cc | 10 +- cpp/src/arrow/type.cc | 16 +- cpp/src/arrow/type.h | 46 ++++ cpp/src/arrow/type_fwd.h | 21 ++ cpp/src/arrow/type_test.cc | 12 + cpp/src/arrow/type_traits.h | 57 ++++- cpp/src/arrow/util/string_header.h | 219 ++++++++++++++++++ cpp/src/arrow/visitor.cc | 8 +- cpp/src/arrow/visitor.h | 6 + cpp/src/arrow/visitor_generate.h | 2 + cpp/src/parquet/column_writer.cc | 1 + python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 38 +--- python/pyarrow/src/arrow/python/python_to_arrow.cc | 23 +- 32 files changed, 974 insertions(+), 51 deletions(-) diff --git a/LICENSE.txt b/LICENSE.txt index 86cfaf546c..d282bfe7b3 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1965,7 +1965,7 @@ This project includes code from the autobrew project. 
The following files are based on code from the autobrew project: * r/tools/autobrew * dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb -* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb +* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb Copyright (c) 2019, Jeroen Ooms License: MIT @@ -2047,6 +2047,20 @@ License: http://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- +This project includes code from Velox. + + * cpp/src/arrow/util/string_header.h + +is based on Velox's + + * velox/type/StringView.h + +Copyright: Copyright (c) Facebook, Inc. and its affiliates. +Home page: https://github.com/facebookincubator/velox +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + The file cpp/src/arrow/vendored/musl/strptime.c has the following license Copyright © 2005-2020 Rich Felker, et al. diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index 5d27b2aedf..de9ab2e985 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -82,6 +82,10 @@ struct ScalarFromArraySlotImpl { return Finish(a.GetString(index_)); } + Status Visit(const BinaryViewArray& a) { + return Status::NotImplemented("ScalarFromArraySlot -> BinaryView"); + } + Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); } Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); } diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc index 9466b5a48f..cfc467160a 100644 --- a/cpp/src/arrow/array/array_binary.cc +++ b/cpp/src/arrow/array/array_binary.cc @@ -89,6 +89,18 @@ LargeStringArray::LargeStringArray(int64_t length, Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } +BinaryViewArray::BinaryViewArray(const std::shared_ptr<ArrayData>& data) { +
ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW); + SetData(data); +} + +StringViewArray::StringViewArray(const std::shared_ptr<ArrayData>& data) { + ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW); + SetData(data); +} + +Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } + FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) { SetData(data); } diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 7e58a96ff8..03ee77fab8 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -22,6 +22,7 @@ #include <cstdint> #include <memory> +#include <optional> #include <string> #include <string_view> #include <vector> @@ -217,6 +218,63 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { Status ValidateUTF8() const; }; +// ---------------------------------------------------------------------- +// BinaryView and StringView + +/// Concrete Array class for variable-size binary view data using the +/// StringHeader struct to reference in-line or out-of-line string values +class ARROW_EXPORT BinaryViewArray : public PrimitiveArray { + public: + using TypeClass = BinaryViewType; + using IteratorType = stl::ArrayIterator<BinaryViewArray>; + + explicit BinaryViewArray(const std::shared_ptr<ArrayData>& data); + + BinaryViewArray(int64_t length, const std::shared_ptr<Buffer>& data, + const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : PrimitiveArray(binary_view(), length, data, null_bitmap, null_count, offset) {} + + const StringHeader* raw_values() const { + return reinterpret_cast<const StringHeader*>(raw_values_) + data_->offset; + } + + StringHeader Value(int64_t i) const { return raw_values()[i]; } + + // For API compatibility with BinaryArray etc. 
+ std::string_view GetView(int64_t i) const { return std::string_view(Value(i)); } + + // EXPERIMENTAL + std::optional<std::string_view> operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + IteratorType begin() const { return IteratorType(*this); } + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + using PrimitiveArray::PrimitiveArray; +}; + +/// Concrete Array class for variable-size string view (utf-8) data using +/// StringHeader to reference in-line or out-of-line string values +class ARROW_EXPORT StringViewArray : public BinaryViewArray { + public: + using TypeClass = StringViewType; + + explicit StringViewArray(const std::shared_ptr<ArrayData>& data); + + StringViewArray(int64_t length, const std::shared_ptr<Buffer>& data, + const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : BinaryViewArray(utf8_view(), length, data, null_bitmap, null_count, offset) {} + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + // ---------------------------------------------------------------------- // Fixed width binary diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 571f450aab..e0a7bc1193 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -40,6 +40,92 @@ namespace arrow { using internal::checked_cast; +// ---------------------------------------------------------------------- +// Binary/StringView + +Status BinaryViewBuilder::AppendValues(const std::vector<std::string>& values, + const uint8_t* valid_bytes) { + // We only need to allocate memory for the out-of-line strings + std::size_t out_of_line_total = std::accumulate( + values.begin(), values.end(), 0ULL, [](uint64_t sum, const std::string& str) { + size_t length = str.size(); + return sum + (length 
> StringHeader::kInlineSize ? length : 0); + }); + RETURN_NOT_OK(Reserve(values.size())); + RETURN_NOT_OK(ReserveData(out_of_line_total)); + + if (valid_bytes != nullptr) { + for (std::size_t i = 0; i < values.size(); ++i) { + if (valid_bytes[i]) { + UnsafeAppend(values[i]); + } else { + UnsafeAppendNull(); + } + } + } else { + for (std::size_t i = 0; i < values.size(); ++i) { + UnsafeAppend(values[i]); + } + } + UnsafeAppendToBitmap(valid_bytes, values.size()); + return Status::OK(); +} + +Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) { + auto bitmap = array.GetValues<uint8_t>(0, 0); + auto values = array.GetValues<StringHeader>(1) + offset; + + int64_t out_of_line_total = 0; + for (int64_t i = 0; i < length; i++) { + if (!values[i].IsInline()) { + out_of_line_total += static_cast<int64_t>(values[i].size()); + } + } + RETURN_NOT_OK(Reserve(length)); + RETURN_NOT_OK(ReserveData(out_of_line_total)); + for (int64_t i = 0; i < length; i++) { + if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) { + if (values[i].IsInline()) { + UnsafeAppend(values[i]); + } else { + UnsafeAppend(values[i].data(), values[i].size()); + } + } else { + UnsafeAppendNull(); + } + } + return Status::OK(); +} + +Status BinaryViewBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_)); + ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_)); + BufferVector buffers = {null_bitmap, data}; + for (auto&& buffer : data_heap_builder_.Finish()) { + buffers.push_back(std::move(buffer)); + } + *out = ArrayData::Make(type(), length_, std::move(buffers), null_count_); + capacity_ = length_ = null_count_ = 0; + Reset(); + return Status::OK(); +} + +Status BinaryViewBuilder::ReserveData(int64_t length) { + if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements 
cannot reference " + "strings larger than 4GB"); + } + return data_heap_builder_.Reserve(length); +} + +void BinaryViewBuilder::Reset() { + ArrayBuilder::Reset(); + data_builder_.Reset(); + data_heap_builder_.Reset(); +} + // ---------------------------------------------------------------------- // Fixed width binary diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 25183ca169..c716e6d225 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -459,6 +459,254 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { std::shared_ptr<DataType> type() const override { return large_utf8(); } }; +// ---------------------------------------------------------------------- +// BinaryViewBuilder, StringViewBuilder +// +// The builders permit two styles of use: one where appended data is +// accumulated in a third buffer that is appended to the resulting ArrayData, +// and one where only the StringHeaders are appended. If you only want to +// append StringHeaders, then use the Append(const StringHeader&) methods + +namespace internal { + +// Because we construct StringHeader objects incrementally, resizing buffers is +// not an option as memory addresses for out-of-line strings will change. Thus, +// we allocate medium-sized memory chunks and accumulate data in those, which +// may result in some waste if there are many large-ish strings. If a string +// comes along that does not fit into a block, we allocate a new block and +// write into that. +// +// Later we can implement optimizations to continue filling underfull blocks +// after encountering a large string that required allocating a new block.
+class ARROW_EXPORT StringHeapBuilder { + public: + static constexpr int64_t kDefaultBlocksize = 1 << 20; // 1MB + + StringHeapBuilder(MemoryPool* pool, int64_t blocksize = kDefaultBlocksize) + : pool_(pool), blocksize_(blocksize) {} + + const uint8_t* UnsafeAppend(const uint8_t* data, int64_t num_bytes) { + memcpy(current_out_buffer_, data, static_cast<size_t>(num_bytes)); + const uint8_t* result = current_out_buffer_; + current_out_buffer_ += num_bytes; + current_remaining_bytes_ -= num_bytes; + return result; + } + + Result<const uint8_t*> Append(const uint8_t* data, int64_t num_bytes) { + if (num_bytes > current_remaining_bytes_) { + ARROW_RETURN_NOT_OK(Reserve(num_bytes)); + } + return UnsafeAppend(data, num_bytes); + } + + /// \brief Ensure that the indicated number of bytes can be appended via + /// UnsafeAppend operations without the need to allocate more memory + Status Reserve(int64_t num_bytes) { + if (num_bytes > current_remaining_bytes_) { + current_remaining_bytes_ = + num_bytes > kDefaultBlocksize ? 
num_bytes : kDefaultBlocksize; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> new_block, + AllocateBuffer(current_remaining_bytes_, pool_)); + current_out_buffer_ = new_block->mutable_data(); + blocks_.emplace_back(std::move(new_block)); + } + return Status::OK(); + } + + void Reset() { + current_out_buffer_ = nullptr; + current_remaining_bytes_ = 0; + blocks_.clear(); + } + + int64_t current_remaining_bytes() const { return current_remaining_bytes_; } + + std::vector<std::shared_ptr<Buffer>> Finish() { + current_out_buffer_ = nullptr; + current_remaining_bytes_ = 0; + return std::move(blocks_); + } + + private: + MemoryPool* pool_; + const int64_t blocksize_; + std::vector<std::shared_ptr<Buffer>> blocks_; + + uint8_t* current_out_buffer_ = nullptr; + int64_t current_remaining_bytes_ = 0; +}; + +} // namespace internal + +class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { + public: + using TypeClass = BinaryViewType; + + BinaryViewBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool) + : BinaryViewBuilder(pool) {} + + int64_t current_block_bytes_remaining() const { + return data_heap_builder_.current_remaining_bytes(); + } + + Status Append(const uint8_t* value, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + if (length > static_cast<int64_t>(StringHeader::kInlineSize)) { + // String is stored out-of-line + if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 4GB"); + } + // Overwrite 'value' since we will use that for the StringHeader value below + ARROW_ASSIGN_OR_RAISE(value, data_heap_builder_.Append(value, length)); + } + UnsafeAppend(StringHeader(value, length)); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status Append(const char* value, int64_t length) { + return Append(reinterpret_cast<const uint8_t*>(value), length); + } + + Status Append(std::string_view value) { + return Append(value.data(), 
static_cast<int64_t>(value.size())); + } + + Status Append(StringHeader value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(value); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append without checking capacity + /// + /// Builder should have been presized using Reserve() and ReserveData(), + /// respectively, and the value must not be larger than 4GB + void UnsafeAppend(const uint8_t* value, int64_t length) { + if (length > static_cast<int64_t>(StringHeader::kInlineSize)) { + // String is stored out-of-line + // Overwrite 'value' since we will use that for the StringHeader value below + value = data_heap_builder_.UnsafeAppend(value, length); + } + UnsafeAppend(StringHeader(value, length)); + UnsafeAppendToBitmap(true); + } + + void UnsafeAppend(const char* value, int64_t length) { + UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast<int64_t>(value.size())); + } + + void UnsafeAppend(std::string_view value) { + UnsafeAppend(value.data(), static_cast<int64_t>(value.size())); + } + + void UnsafeAppend(StringHeader value) { + data_builder_.UnsafeAppend(value); + UnsafeAppendToBitmap(true); + } + + /// \brief Ensures there is enough allocated available capacity in the + /// out-of-line data heap to append the indicated number of bytes without + /// additional allocations + Status ReserveData(int64_t length); + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, StringHeader()); // zero + UnsafeSetNull(length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(StringHeader()); // zero + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + /// \brief Append an empty element (length-0 inline string) + Status AppendEmptyValue() final { +
ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(StringHeader("")); // zero + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, StringHeader("")); + UnsafeSetNotNull(length); + return Status::OK(); + } + + void UnsafeAppendNull() { + data_builder_.UnsafeAppend(StringHeader()); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppendEmptyValue() { + data_builder_.UnsafeAppend(StringHeader("")); + UnsafeAppendToBitmap(true); + } + + /// \brief Append a sequence of strings in one shot. + /// + /// \param[in] values a vector of strings + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const std::vector<std::string>& values, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. 
Copies + /// the underlying out-of-line string memory to avoid memory lifetime issues + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + + void Reset() override; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); + } + + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; + + std::shared_ptr<DataType> type() const override { return binary_view(); } + + protected: + explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), data_builder_(pool), data_heap_builder_(pool) {} + + static constexpr int64_t ValueSizeLimit() { + return std::numeric_limits<uint32_t>::max(); + } + + TypedBufferBuilder<StringHeader> data_builder_; + + // Accumulates out-of-line data in fixed-size chunks which are then attached + // to the resulting ArrayData + internal::StringHeapBuilder data_heap_builder_; +}; + +class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder { + public: + using BinaryViewBuilder::BinaryViewBuilder; + std::shared_ptr<DataType> type() const override { return utf8_view(); } +}; + // ---------------------------------------------------------------------- // FixedSizeBinaryBuilder diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 061fb60041..c99a6facee 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -193,6 +193,12 @@ Status DictionaryMemoTable::GetOrInsert(const BinaryType*, std::string_view valu return impl_->GetOrInsert<BinaryType>(value, out); } +Status DictionaryMemoTable::GetOrInsert(const BinaryViewType*, std::string_view value, + int32_t* out) { + // Create BinaryArray dictionary for now + return impl_->GetOrInsert<BinaryType>(value, out); +} + Status DictionaryMemoTable::GetOrInsert(const 
LargeBinaryType*, std::string_view value, int32_t* out) { return impl_->GetOrInsert<LargeBinaryType>(value, out); diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index cb0aaf3099..0cc82930a1 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -60,6 +60,12 @@ struct DictionaryValue<T, enable_if_base_binary<T>> { BinaryType, LargeBinaryType>::type; }; +template <typename T> +struct DictionaryValue<T, enable_if_binary_view_like<T>> { + using type = std::string_view; + using PhysicalType = BinaryViewType; +}; + template <typename T> struct DictionaryValue<T, enable_if_fixed_size_binary<T>> { using type = std::string_view; @@ -115,6 +121,10 @@ class ARROW_EXPORT DictionaryMemoTable { Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out); Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out); + // TODO: Consider working StringHeader throughout the hashing machinery to + // benefit from faster comparisons, reduced need to allocate memory + Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out); + class DictionaryMemoTableImpl; std::unique_ptr<DictionaryMemoTableImpl> impl_; }; diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index aab734284f..3dd0ccea93 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -227,6 +227,10 @@ class ConcatenateImpl { return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]); } + Status Visit(const BinaryViewType&) { + return Status::NotImplemented("binary / string view"); + } + Status Visit(const ListType&) { std::vector<Range> value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t))); diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index c0cdcab730..ac9d76d469 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -264,6 +264,14 @@ 
class ArrayDataEndianSwapper { return Status::OK(); } + template <typename T> + enable_if_t<std::is_same<BinaryViewType, T>::value || + std::is_same<StringViewType, T>::value, + Status> + Visit(const T& type) { + return Status::NotImplemented("Binary / string view"); + } + Status Visit(const ListType& type) { RETURN_NOT_OK(SwapOffsets<int32_t>(1)); return Status::OK(); @@ -596,6 +604,11 @@ class RepeatedArrayFactory { return Status::OK(); } + template <typename T> + enable_if_binary_view_like<T, Status> Visit(const T&) { + return Status::NotImplemented("binary / string view"); + } + template <typename T> enable_if_var_size_list<T, Status> Visit(const T& type) { using ScalarType = typename TypeTraits<T>::ScalarType; diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 56470ac74b..cddb086005 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -47,6 +47,19 @@ struct UTF8DataValidator { return Status::NotImplemented(""); } + Status Visit(const StringViewType&) { + util::InitializeUTF8(); + + const auto* values = data.GetValues<StringHeader>(1); + for (int64_t i = 0; i < data.length; ++i) { + if (ARROW_PREDICT_FALSE(!util::ValidateUTF8( + reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size()))) { + return Status::Invalid("Invalid UTF8 sequence at string index ", i); + } + } + return Status::OK(); + } + template <typename StringType> enable_if_string<StringType, Status> Visit(const StringType&) { util::InitializeUTF8(); @@ -247,6 +260,10 @@ struct ValidateArrayImpl { Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); } + Status Visit(const BinaryViewType& type) { + return Status::NotImplemented("binary / string view"); + } + Status Visit(const ListType& type) { return ValidateListLike(type); } Status Visit(const LargeListType& type) { return ValidateListLike(type); } @@ -716,7 +733,8 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d 
ARROW_EXPORT Status ValidateUTF8(const ArrayData& data) { - DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING); + DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW || + data.type->id() == Type::LARGE_STRING); UTF8DataValidator validator{data}; return VisitTypeInline(*data.type, &validator); } diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index baadd10cca..8ccc645046 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -259,6 +259,11 @@ class RangeDataEqualsImpl { // Also matches StringType Status Visit(const BinaryType& type) { return CompareBinary(type); } + // Also matches StringViewType + Status Visit(const BinaryViewType& type) { + return Status::NotImplemented("Binary / string view"); + } + // Also matches LargeStringType Status Visit(const LargeBinaryType& type) { return CompareBinary(type); } @@ -577,7 +582,7 @@ class TypeEqualsVisitor { template <typename T> enable_if_t<is_null_type<T>::value || is_primitive_ctype<T>::value || - is_base_binary_type<T>::value, + is_base_binary_type<T>::value || is_binary_view_like_type<T>::value, Status> Visit(const T&) { result_ = true; @@ -729,6 +734,12 @@ class ScalarEqualsVisitor { return Status::OK(); } + Status Visit(const BinaryViewScalar& left) { + const auto& right = checked_cast<const BinaryViewScalar&>(right_); + result_ = left.value == right.value; + return Status::OK(); + } + Status Visit(const Decimal128Scalar& left) { const auto& right = checked_cast<const Decimal128Scalar&>(right_); result_ = left.value == right.value; diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index b6d3a3d7d8..1ef076fac4 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -536,8 +536,8 @@ struct ArrayWriterV1 { is_nested_type<T>::value || is_null_type<T>::value || is_decimal_type<T>::value || std::is_same<DictionaryType, T>::value || is_duration_type<T>::value || 
is_interval_type<T>::value || is_fixed_size_binary_type<T>::value || - std::is_same<Date64Type, T>::value || std::is_same<Time64Type, T>::value || - std::is_same<ExtensionType, T>::value, + is_binary_view_like_type<T>::value || std::is_same<Date64Type, T>::value || + std::is_same<Time64Type, T>::value || std::is_same<ExtensionType, T>::value, Status>::type Visit(const T& type) { return Status::NotImplemented(type.ToString()); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 2e450b9d46..367b31d5dd 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -523,6 +523,16 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + Status Visit(const BinaryViewType& type) { + // BinaryView will be written to IPC as a normal binary array + return Visit(BinaryType()); + } + + Status Visit(const StringViewType& type) { + // StringView will be written to IPC as a normal UTF8 string array + return Visit(StringType()); + } + Status Visit(const LargeBinaryType& type) { fb_type_ = flatbuf::Type::LargeBinary; type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union(); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index a1b17afaaf..843d5917b3 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -348,6 +348,11 @@ class ArrayLoader { return LoadBinary<T>(type.id()); } + Status Visit(const BinaryViewType& type) { + DCHECK(false); + return Status::NotImplemented("Reading IPC format to binary view is not supported"); + } + Status Visit(const FixedSizeBinaryType& type) { out_->buffers.resize(2); RETURN_NOT_OK(LoadCommon(type.id())); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index b89604e6fe..d68da651f3 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -388,6 +388,10 @@ class RecordBatchSerializer { return Status::OK(); } + Status Visit(const BinaryViewArray& array) { + return 
Status::NotImplemented("Binary / string view type"); + } + Status Visit(const FixedSizeListArray& array) { --max_recursion_depth_; auto size = array.list_type()->list_size(); diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index c01036047c..86a03c82ab 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -110,8 +110,7 @@ struct GenerateImpl { return OK(writer.Double(val)); } - template <typename T> - enable_if_base_binary<T, Status> Visit(const T&) { + Status GenerateAscii(const DataType&) { auto size = std::poisson_distribution<>{4}(e); std::uniform_int_distribution<uint16_t> gen_char(32, 126); // FIXME generate UTF8 std::string s(size, '\0'); @@ -119,6 +118,13 @@ struct GenerateImpl { return OK(writer.String(s.c_str())); } + template <typename T> + enable_if_base_binary<T, Status> Visit(const T& t) { + return GenerateAscii(t); + } + + Status Visit(const BinaryViewType& t) { return GenerateAscii(t); } + template <typename T> enable_if_list_like<T, Status> Visit(const T& t) { auto size = std::poisson_distribution<>{4}(e); diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 0ca08d7a82..d139845bd7 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -70,6 +70,12 @@ struct ScalarHashImpl { Status Visit(const BaseBinaryScalar& s) { return BufferHash(*s.value); } + Status Visit(const BinaryViewScalar& s) { + const StringHeader& v = s.value; + hash_ ^= internal::ComputeStringHash<1>(v.data(), v.size()); + return Status::OK(); + } + template <typename T> Status Visit(const TemporalScalar<T>& s) { return ValueHash(s); @@ -226,6 +232,14 @@ struct ScalarValidateImpl { Status Visit(const StringScalar& s) { return ValidateStringScalar(s); } + Status Visit(const BinaryViewScalar& s) { + return Status::NotImplemented("Binary view"); + } + + Status Visit(const StringViewScalar& s) { + return Status::NotImplemented("String view"); + } + Status Visit(const LargeStringScalar& s) { 
return ValidateStringScalar(s); } template <typename ScalarType> diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index cf852dff36..9b7f604132 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -37,6 +37,7 @@ #include "arrow/type_traits.h" #include "arrow/util/compare.h" #include "arrow/util/decimal.h" +#include "arrow/util/string_header.h" #include "arrow/util/visibility.h" #include "arrow/visit_type_inline.h" @@ -282,6 +283,34 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { StringScalar() : StringScalar(utf8()) {} }; +struct ARROW_EXPORT BinaryViewScalar : public internal::PrimitiveScalarBase { + using internal::PrimitiveScalarBase::PrimitiveScalarBase; + using TypeClass = BinaryViewType; + + explicit BinaryViewScalar(StringHeader value, std::shared_ptr<DataType> type) + : internal::PrimitiveScalarBase(std::move(type), true), value(value) {} + + explicit BinaryViewScalar(StringHeader value) + : BinaryViewScalar(value, binary_view()) {} + + BinaryViewScalar() : internal::PrimitiveScalarBase(binary_view(), false) {} + + void* mutable_data() override { return reinterpret_cast<void*>(&this->value); } + + std::string_view view() const override { return std::string_view(this->value); } + + StringHeader value; +}; + +struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { + using TypeClass = StringViewType; + + explicit StringViewScalar(StringHeader value) + : BinaryViewScalar(std::move(value), utf8_view()) {} + + StringViewScalar() : BinaryViewScalar(utf8_view()) {} +}; + struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = LargeBinaryType; diff --git a/cpp/src/arrow/testing/json_internal.cc b/cpp/src/arrow/testing/json_internal.cc index c1d45aa2e0..a296e0fba7 100644 --- a/cpp/src/arrow/testing/json_internal.cc +++ b/cpp/src/arrow/testing/json_internal.cc @@ -227,8 +227,8 @@ class SchemaWriter { template <typename T> 
enable_if_t<is_null_type<T>::value || is_primitive_ctype<T>::value || - is_base_binary_type<T>::value || is_base_list_type<T>::value || - is_struct_type<T>::value> + is_base_binary_type<T>::value || is_binary_view_like_type<T>::value || + is_base_list_type<T>::value || is_struct_type<T>::value> WriteTypeMetadata(const T& type) {} void WriteTypeMetadata(const MapType& type) { @@ -386,6 +386,8 @@ class SchemaWriter { Status Visit(const TimeType& type) { return WritePrimitive("time", type); } Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); } Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); } + Status Visit(const StringViewType& type) { return WritePrimitive("utf8_view", type); } + Status Visit(const BinaryViewType& type) { return WritePrimitive("binary_view", type); } Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); } Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); } Status Visit(const FixedSizeBinaryType& type) { @@ -1320,6 +1322,10 @@ class ArrayReader { return FinishBuilder(&builder); } + Status Visit(const BinaryViewType& type) { + return Status::NotImplemented("Binary / string view"); + } + Status Visit(const DayTimeIntervalType& type) { DayTimeIntervalBuilder builder(pool_); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index ea9525404c..b976260ccd 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -59,10 +59,14 @@ constexpr Type::type FixedSizeListType::type_id; constexpr Type::type BinaryType::type_id; +constexpr Type::type BinaryViewType::type_id; + constexpr Type::type LargeBinaryType::type_id; constexpr Type::type StringType::type_id; +constexpr Type::type StringViewType::type_id; + constexpr Type::type LargeStringType::type_id; constexpr Type::type FixedSizeBinaryType::type_id; @@ -188,7 +192,9 @@ std::string ToString(Type::type id) { TO_STRING_CASE(INTERVAL_MONTHS) TO_STRING_CASE(DURATION) 
TO_STRING_CASE(STRING) + TO_STRING_CASE(STRING_VIEW) TO_STRING_CASE(BINARY) + TO_STRING_CASE(BINARY_VIEW) TO_STRING_CASE(LARGE_STRING) TO_STRING_CASE(LARGE_BINARY) TO_STRING_CASE(FIXED_SIZE_BINARY) @@ -564,10 +570,14 @@ std::string FixedSizeListType::ToString() const { std::string BinaryType::ToString() const { return "binary"; } +std::string BinaryViewType::ToString() const { return "binary_view"; } + std::string LargeBinaryType::ToString() const { return "large_binary"; } std::string StringType::ToString() const { return "string"; } +std::string StringViewType::ToString() const { return "string_view"; } + std::string LargeStringType::ToString() const { return "large_string"; } int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } @@ -2114,8 +2124,10 @@ PARAMETER_LESS_FINGERPRINT(HalfFloat) PARAMETER_LESS_FINGERPRINT(Float) PARAMETER_LESS_FINGERPRINT(Double) PARAMETER_LESS_FINGERPRINT(Binary) +PARAMETER_LESS_FINGERPRINT(BinaryView) PARAMETER_LESS_FINGERPRINT(LargeBinary) PARAMETER_LESS_FINGERPRINT(String) +PARAMETER_LESS_FINGERPRINT(StringView) PARAMETER_LESS_FINGERPRINT(LargeString) PARAMETER_LESS_FINGERPRINT(Date32) PARAMETER_LESS_FINGERPRINT(Date64) @@ -2283,8 +2295,10 @@ TYPE_FACTORY(float16, HalfFloatType) TYPE_FACTORY(float32, FloatType) TYPE_FACTORY(float64, DoubleType) TYPE_FACTORY(utf8, StringType) +TYPE_FACTORY(utf8_view, StringViewType) TYPE_FACTORY(large_utf8, LargeStringType) TYPE_FACTORY(binary, BinaryType) +TYPE_FACTORY(binary_view, BinaryViewType) TYPE_FACTORY(large_binary, LargeBinaryType) TYPE_FACTORY(date64, Date64Type) TYPE_FACTORY(date32, Date32Type) @@ -2532,7 +2546,7 @@ void InitStaticData() { // * Time32 // * Time64 // * Timestamp - g_primitive_types = {null(), boolean(), date32(), date64()}; + g_primitive_types = {null(), boolean(), date32(), date64(), binary_view(), utf8_view()}; Extend(g_numeric_types, &g_primitive_types); Extend(g_base_binary_types, &g_primitive_types); } diff --git a/cpp/src/arrow/type.h 
b/cpp/src/arrow/type.h index 415aaacf1c..f4e082b3f6 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -33,6 +33,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/endian.h" #include "arrow/util/macros.h" +#include "arrow/util/string_header.h" #include "arrow/util/visibility.h" #include "arrow/visitor.h" // IWYU pragma: keep @@ -686,6 +687,33 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType { explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {} }; +/// \brief Concrete type class for variable-size binary view data using +/// StringHeader structs +class ARROW_EXPORT BinaryViewType : public DataType { + public: + static constexpr Type::type type_id = Type::BINARY_VIEW; + static constexpr bool is_utf8 = false; + using PhysicalType = BinaryViewType; + + static constexpr const char* type_name() { return "binary_view"; } + + BinaryViewType() : BinaryViewType(Type::BINARY_VIEW) {} + + DataTypeLayout layout() const override { + return DataTypeLayout( + {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(StringHeader))}); + } + + std::string ToString() const override; + std::string name() const override { return "binary_view"; } + + protected: + std::string ComputeFingerprint() const override; + + // Allow subclasses like StringViewType to change the logical type.
+ explicit BinaryViewType(Type::type logical_type) : DataType(logical_type) {} +}; + /// \brief Concrete type class for large variable-size binary data class ARROW_EXPORT LargeBinaryType : public BaseBinaryType { public: @@ -732,6 +760,24 @@ class ARROW_EXPORT StringType : public BinaryType { std::string ComputeFingerprint() const override; }; +/// \brief Concrete type class for variable-size string data, utf8-encoded +class ARROW_EXPORT StringViewType : public BinaryViewType { + public: + static constexpr Type::type type_id = Type::STRING_VIEW; + static constexpr bool is_utf8 = true; + using PhysicalType = BinaryViewType; + + static constexpr const char* type_name() { return "utf8_view"; } + + StringViewType() : BinaryViewType(Type::STRING_VIEW) {} + + std::string ToString() const override; + std::string name() const override { return "utf8_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + /// \brief Concrete type class for large variable-size string data, utf8-encoded class ARROW_EXPORT LargeStringType : public LargeBinaryType { public: diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index ba0e635f73..1066d50321 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -108,6 +108,11 @@ class BinaryArray; class BinaryBuilder; struct BinaryScalar; +class BinaryViewType; +class BinaryViewArray; +class BinaryViewBuilder; +struct BinaryViewScalar; + class LargeBinaryType; class LargeBinaryArray; class LargeBinaryBuilder; @@ -123,6 +128,11 @@ class StringArray; class StringBuilder; struct StringScalar; +class StringViewType; +class StringViewArray; +class StringViewBuilder; +struct StringViewScalar; + class LargeStringType; class LargeStringArray; class LargeStringBuilder; @@ -405,6 +415,13 @@ struct Type { /// Calendar interval type with three fields. 
INTERVAL_MONTH_DAY_NANO, + /// String (UTF8) view type with 4-byte prefix and inline small string + /// optimization + STRING_VIEW, + + /// Bytes view type with 4-byte prefix and inline small string optimization + BINARY_VIEW, + // Leave this at the end MAX_ID }; @@ -446,10 +463,14 @@ ARROW_EXPORT const std::shared_ptr<DataType>& float32(); ARROW_EXPORT const std::shared_ptr<DataType>& float64(); /// \brief Return a StringType instance ARROW_EXPORT const std::shared_ptr<DataType>& utf8(); +/// \brief Return a StringViewType instance +ARROW_EXPORT const std::shared_ptr<DataType>& utf8_view(); /// \brief Return a LargeStringType instance ARROW_EXPORT const std::shared_ptr<DataType>& large_utf8(); /// \brief Return a BinaryType instance ARROW_EXPORT const std::shared_ptr<DataType>& binary(); +/// \brief Return a BinaryViewType instance +ARROW_EXPORT const std::shared_ptr<DataType>& binary_view(); /// \brief Return a LargeBinaryType instance ARROW_EXPORT const std::shared_ptr<DataType>& large_binary(); /// \brief Return a Date32Type instance diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 954ad63c8a..ad0804be8b 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1189,9 +1189,21 @@ TEST(TestBinaryType, ToString) { TEST(TestStringType, ToString) { StringType str; ASSERT_EQ(str.id(), Type::STRING); + ASSERT_EQ(str.name(), std::string("utf8")); + ASSERT_EQ(str.type_name(), std::string("utf8")); ASSERT_EQ(str.ToString(), std::string("string")); } +TEST(TestBinaryViewType, ToString) { + BinaryViewType t1; + BinaryViewType e1; + StringViewType t2; + AssertTypeEqual(t1, e1); + AssertTypeNotEqual(t1, t2); + ASSERT_EQ(t1.id(), Type::BINARY_VIEW); + ASSERT_EQ(t1.ToString(), std::string("binary_view")); +} + TEST(TestLargeBinaryTypes, ToString) { BinaryType bt1; LargeBinaryType t1; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 5873969066..dcd7c36ba2 100644 --- a/cpp/src/arrow/type_traits.h +++ 
b/cpp/src/arrow/type_traits.h @@ -341,6 +341,16 @@ struct TypeTraits<BinaryType> { static inline std::shared_ptr<DataType> type_singleton() { return binary(); } }; +template <> +struct TypeTraits<BinaryViewType> { + using ArrayType = BinaryViewArray; + using BuilderType = BinaryViewBuilder; + using ScalarType = BinaryViewScalar; + using CType = StringHeader; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr<DataType> type_singleton() { return binary_view(); } +}; + template <> struct TypeTraits<LargeBinaryType> { using ArrayType = LargeBinaryArray; @@ -371,6 +381,16 @@ struct TypeTraits<StringType> { static inline std::shared_ptr<DataType> type_singleton() { return utf8(); } }; +template <> +struct TypeTraits<StringViewType> { + using ArrayType = StringViewArray; + using BuilderType = StringViewBuilder; + using ScalarType = StringViewScalar; + using CType = StringHeader; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr<DataType> type_singleton() { return utf8_view(); } +}; + template <> struct TypeTraits<LargeStringType> { using ArrayType = LargeStringArray; @@ -390,6 +410,11 @@ struct CTypeTraits<std::string> : public TypeTraits<StringType> { using ArrowType = StringType; }; +template <> +struct CTypeTraits<StringHeader> : public TypeTraits<BinaryViewType> { + using ArrowType = BinaryViewType; +}; + template <> struct CTypeTraits<const char*> : public CTypeTraits<std::string> {}; @@ -605,9 +630,28 @@ using is_string_type = template <typename T, typename R = void> using enable_if_string = enable_if_t<is_string_type<T>::value, R>; +template <typename T> +using is_binary_view_like_type = std::is_base_of<BinaryViewType, T>; + +template <typename T> +using is_binary_view_type = std::is_same<BinaryViewType, T>; + +template <typename T> +using is_string_view_type = std::is_same<StringViewType, T>; + +template <typename T, typename R = void> +using enable_if_binary_view_like = 
enable_if_t<is_binary_view_like_type<T>::value, R>; + +template <typename T, typename R = void> +using enable_if_binary_view = enable_if_t<is_binary_view_type<T>::value, R>; + +template <typename T, typename R = void> +using enable_if_string_view = enable_if_t<is_string_view_type<T>::value, R>; + template <typename T> using is_string_like_type = - std::integral_constant<bool, is_base_binary_type<T>::value && T::is_utf8>; + std::integral_constant<bool, (is_base_binary_type<T>::value && T::is_utf8) || + is_string_view_type<T>::value>; template <typename T, typename R = void> using enable_if_string_like = enable_if_t<is_string_like_type<T>::value, R>; @@ -630,10 +674,9 @@ template <typename T, typename R = void> using enable_if_fixed_width_type = enable_if_t<is_fixed_width_type<T>::value, R>; template <typename T> -using is_binary_like_type = - std::integral_constant<bool, (is_base_binary_type<T>::value && - !is_string_like_type<T>::value) || - is_fixed_size_binary_type<T>::value>; +using is_binary_like_type = std::integral_constant< + bool, (is_base_binary_type<T>::value && !is_string_like_type<T>::value) || + is_binary_view_type<T>::value || is_fixed_size_binary_type<T>::value>; template <typename T, typename R = void> using enable_if_binary_like = enable_if_t<is_binary_like_type<T>::value, R>; @@ -786,8 +829,10 @@ using enable_if_has_c_type = enable_if_t<has_c_type<T>::value, R>; template <typename T> using has_string_view = std::integral_constant<bool, std::is_same<BinaryType, T>::value || - std::is_same<LargeBinaryType, T>::value || + std::is_same<BinaryViewType, T>::value || + std::is_same<LargeBinaryType, T>::value || std::is_same<StringType, T>::value || + std::is_same<StringViewType, T>::value || std::is_same<LargeStringType, T>::value || std::is_same<FixedSizeBinaryType, T>::value>; diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h new file mode 100644 index 0000000000..29f378a580 --- /dev/null +++ 
b/cpp/src/arrow/util/string_header.h @@ -0,0 +1,219 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <cassert> +#include <cstdint> +#include <cstring> +#include <ostream> +#include <string> +#include <string_view> + +namespace arrow { + +// Variable length string or binary with 4 byte prefix and inline optimization +// for small values (12 bytes or fewer). This is similar to std::string_view +// except that the referenced data is limited in size to UINT32_MAX and up to the +// first four bytes of the string are copied into the struct.
The prefix allows +// failing comparisons early and can reduce the CPU cache working set when +// dealing with short strings. +// +// Short string |----|----|--------| +// ^ ^ ^ +// | | | +// size prefix remaining in-line portion +// +// Long string |----|----|--------| +// ^ ^ ^ +// | | | +// size prefix pointer to out-of-line portion +// +// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. +// +// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf +struct StringHeader { + public: + using value_type = char; + + static constexpr size_t kPrefixSize = 4; + static constexpr size_t kInlineSize = 12; + + StringHeader() { + static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes"); + ; + memset(this, 0, sizeof(StringHeader)); + } + + explicit StringHeader(uint32_t size) : size_(size) { + memset(prefix_, 0, kPrefixSize); + value_.data = nullptr; + } + + StringHeader(const char* data, size_t len) : size_(len) { + // TODO: better option than assert? + assert(data || size_ == 0); + if (IsInline()) { + // Zero the inline part. + // this makes sure that inline strings can be compared for equality with 2 + // int64 compares. + memset(prefix_, 0, kPrefixSize); + if (size_ == 0) { + return; + } + // small string: inlined. Zero the last 8 bytes first to allow for whole + // word comparison. + value_.data = nullptr; + memcpy(prefix_, data, size_); + } else { + // large string: store pointer + memcpy(prefix_, data, kPrefixSize); + value_.data = data; + } + } + + StringHeader(const uint8_t* data, int64_t len) + : StringHeader(reinterpret_cast<const char*>(data), static_cast<size_t>(len)) {} + + // Making StringHeader implicitly constructible/convertible from char* and + // string literals, in order to allow for a more flexible API and optional + // interoperability. 
E.g.: + // + // StringHeader bh = "literal"; + // std::optional<StringHeader> obh = "literal"; + // + /* implicit */ StringHeader(const char* data) : StringHeader(data, strlen(data)) {} + + explicit StringHeader(const std::string& value) + : StringHeader(value.data(), value.size()) {} + + explicit StringHeader(const std::string_view& value) + : StringHeader(value.data(), value.size()) {} + + bool IsInline() const { return IsInline(size_); } + + static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; } + + const char* data() const { return IsInline() ? prefix_ : value_.data; } + + size_t size() const { return size_; } + + size_t capacity() const { return size_; } + + friend std::ostream& operator<<(std::ostream& os, const StringHeader& header) { + os.write(header.data(), header.size()); + return os; + } + + bool operator==(const StringHeader& other) const { + // Compare lengths and first 4 characters. + if (SizeAndPrefixAsInt64() != other.SizeAndPrefixAsInt64()) { + return false; + } + if (IsInline()) { + // The inline part is zeroed at construction, so we can compare + // a word at a time if data extends past 'prefix_'. + return size_ <= kPrefixSize || InlinedAsInt64() == other.InlinedAsInt64(); + } + // Sizes are equal and this is not inline, therefore both are out + // of line and have the first kPrefixSize bytes in common. + return memcmp(value_.data + kPrefixSize, other.value_.data + kPrefixSize, + size_ - kPrefixSize) == 0; + } + + bool operator!=(const StringHeader& other) const { return !(*this == other); } + + // Returns 0, if this == other + // < 0, if this < other + // > 0, if this > other + int32_t Compare(const StringHeader& other) const { + if (PrefixAsInt() != other.PrefixAsInt()) { + // The result is decided on prefix. The shorter will be less + // because the prefix is padded with zeros.
+ return memcmp(prefix_, other.prefix_, kPrefixSize); + } + int32_t size = std::min(size_, other.size_) - kPrefixSize; + if (size <= 0) { + // One ends within the prefix. + return size_ - other.size_; + } + if (static_cast<uint32_t>(size) <= kInlineSize && IsInline() && other.IsInline()) { + int32_t result = memcmp(value_.inlined, other.value_.inlined, size); + return (result != 0) ? result : size_ - other.size_; + } + int32_t result = memcmp(data() + kPrefixSize, other.data() + kPrefixSize, size); + return (result != 0) ? result : size_ - other.size_; + } + + bool operator<(const StringHeader& other) const { return Compare(other) < 0; } + + bool operator<=(const StringHeader& other) const { return Compare(other) <= 0; } + + bool operator>(const StringHeader& other) const { return Compare(other) > 0; } + + bool operator>=(const StringHeader& other) const { return Compare(other) >= 0; } + + operator std::string() const { return std::string(data(), size()); } + + std::string GetString() const { return *this; } + + explicit operator std::string_view() const { return std::string_view(data(), size()); } + + const char* begin() const { return data(); } + + const char* end() const { return data() + size(); } + + bool empty() const { return size() == 0; } + + private: + inline int64_t SizeAndPrefixAsInt64() const { + return reinterpret_cast<const int64_t*>(this)[0]; + } + + inline int64_t InlinedAsInt64() const { + return reinterpret_cast<const int64_t*>(this)[1]; + } + + int32_t PrefixAsInt() const { return *reinterpret_cast<const int32_t*>(&prefix_); } + + // We rely on all members being laid out top to bottom. C++ + // guarantees this.
+ uint32_t size_; + char prefix_[4]; + union { + char inlined[8]; + const char* data; + } value_; +}; + +} // namespace arrow diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index d22efc942e..03381a08a7 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -45,8 +45,10 @@ ARRAY_VISITOR_DEFAULT(UInt64Array) ARRAY_VISITOR_DEFAULT(HalfFloatArray) ARRAY_VISITOR_DEFAULT(FloatArray) ARRAY_VISITOR_DEFAULT(DoubleArray) -ARRAY_VISITOR_DEFAULT(BinaryArray) ARRAY_VISITOR_DEFAULT(StringArray) +ARRAY_VISITOR_DEFAULT(StringViewArray) +ARRAY_VISITOR_DEFAULT(BinaryArray) +ARRAY_VISITOR_DEFAULT(BinaryViewArray) ARRAY_VISITOR_DEFAULT(LargeBinaryArray) ARRAY_VISITOR_DEFAULT(LargeStringArray) ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray) @@ -95,7 +97,9 @@ TYPE_VISITOR_DEFAULT(HalfFloatType) TYPE_VISITOR_DEFAULT(FloatType) TYPE_VISITOR_DEFAULT(DoubleType) TYPE_VISITOR_DEFAULT(StringType) +TYPE_VISITOR_DEFAULT(StringViewType) TYPE_VISITOR_DEFAULT(BinaryType) +TYPE_VISITOR_DEFAULT(BinaryViewType) TYPE_VISITOR_DEFAULT(LargeStringType) TYPE_VISITOR_DEFAULT(LargeBinaryType) TYPE_VISITOR_DEFAULT(FixedSizeBinaryType) @@ -145,7 +149,9 @@ SCALAR_VISITOR_DEFAULT(HalfFloatScalar) SCALAR_VISITOR_DEFAULT(FloatScalar) SCALAR_VISITOR_DEFAULT(DoubleScalar) SCALAR_VISITOR_DEFAULT(StringScalar) +SCALAR_VISITOR_DEFAULT(StringViewScalar) SCALAR_VISITOR_DEFAULT(BinaryScalar) +SCALAR_VISITOR_DEFAULT(BinaryViewScalar) SCALAR_VISITOR_DEFAULT(LargeStringScalar) SCALAR_VISITOR_DEFAULT(LargeBinaryScalar) SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index 7f83c9ebab..58330de9d0 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -45,7 +45,9 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const FloatArray& array); virtual Status Visit(const DoubleArray& array); virtual Status Visit(const StringArray& array); + virtual Status Visit(const StringViewArray& array); virtual Status Visit(const 
BinaryArray& array); + virtual Status Visit(const BinaryViewArray& array); virtual Status Visit(const LargeStringArray& array); virtual Status Visit(const LargeBinaryArray& array); virtual Status Visit(const FixedSizeBinaryArray& array); @@ -93,7 +95,9 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const FloatType& type); virtual Status Visit(const DoubleType& type); virtual Status Visit(const StringType& type); + virtual Status Visit(const StringViewType& type); virtual Status Visit(const BinaryType& type); + virtual Status Visit(const BinaryViewType& type); virtual Status Visit(const LargeStringType& type); virtual Status Visit(const LargeBinaryType& type); virtual Status Visit(const FixedSizeBinaryType& type); @@ -141,7 +145,9 @@ class ARROW_EXPORT ScalarVisitor { virtual Status Visit(const FloatScalar& scalar); virtual Status Visit(const DoubleScalar& scalar); virtual Status Visit(const StringScalar& scalar); + virtual Status Visit(const StringViewScalar& scalar); virtual Status Visit(const BinaryScalar& scalar); + virtual Status Visit(const BinaryViewScalar& scalar); virtual Status Visit(const LargeStringScalar& scalar); virtual Status Visit(const LargeBinaryScalar& scalar); virtual Status Visit(const FixedSizeBinaryScalar& scalar); diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 265c76197a..2c267576ca 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -40,7 +40,9 @@ namespace arrow { ACTION(Boolean); \ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ + ACTION(StringView); \ ACTION(Binary); \ + ACTION(BinaryView); \ ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index f7898c02d4..e62e34abb0 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -129,6 +129,7 @@ struct ValueBufferSlicer { 
NOT_IMPLEMENTED_VISIT(FixedSizeList); NOT_IMPLEMENTED_VISIT(Dictionary); NOT_IMPLEMENTED_VISIT(Extension); + NOT_IMPLEMENTED_VISIT(BinaryView); #undef NOT_IMPLEMENTED_VISIT diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index f3cee6c65e..7e48f09889 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -116,39 +116,21 @@ void BufferCapsule_Destructor(PyObject* capsule) { using internal::arrow_traits; using internal::npy_traits; -template <typename T> +template <typename T, typename Enable = void> struct WrapBytes {}; -template <> -struct WrapBytes<StringType> { - static inline PyObject* Wrap(const char* data, int64_t length) { - return PyUnicode_FromStringAndSize(data, length); - } -}; - -template <> -struct WrapBytes<LargeStringType> { +template <typename T> +struct WrapBytes<T, enable_if_t<is_string_type<T>::value || + is_string_view_type<T>::value>> { static inline PyObject* Wrap(const char* data, int64_t length) { return PyUnicode_FromStringAndSize(data, length); } }; -template <> -struct WrapBytes<BinaryType> { - static inline PyObject* Wrap(const char* data, int64_t length) { - return PyBytes_FromStringAndSize(data, length); - } -}; - -template <> -struct WrapBytes<LargeBinaryType> { - static inline PyObject* Wrap(const char* data, int64_t length) { - return PyBytes_FromStringAndSize(data, length); - } -}; - -template <> -struct WrapBytes<FixedSizeBinaryType> { +template <typename T> +struct WrapBytes<T, enable_if_t<is_binary_type<T>::value || + is_binary_view_type<T>::value || + is_fixed_size_binary_type<T>::value>> { static inline PyObject* Wrap(const char* data, int64_t length) { return PyBytes_FromStringAndSize(data, length); } @@ -1026,7 +1008,9 @@ struct ObjectWriterVisitor { } template <typename Type> - enable_if_t<is_base_binary_type<Type>::value || is_fixed_size_binary_type<Type>::value, + 
enable_if_t<is_base_binary_type<Type>::value || + is_binary_view_like_type<Type>::value || + is_fixed_size_binary_type<Type>::value, Status> Visit(const Type& type) { auto WrapValue = [](const std::string_view& view, PyObject** out) { diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 9e7f07ef81..3ffff8cf19 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -479,13 +479,17 @@ class PyValue { // The binary-like intermediate representation is PyBytesView because it keeps temporary // python objects alive (non-contiguous memoryview) and stores whether the original - // object was unicode encoded or not, which is used for unicode -> bytes coersion if + // object was unicode encoded or not, which is used for unicode -> bytes coercion if // there is a non-unicode object observed. static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) { return view.ParseString(obj); } + static Status Convert(const BinaryViewType*, const O&, I obj, PyBytesView& view) { + return view.ParseString(obj); + } + static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, PyBytesView& view) { ARROW_RETURN_NOT_OK(view.ParseString(obj)); @@ -672,12 +676,9 @@ class PyPrimitiveConverter<T, enable_if_t<std::is_same<T, FixedSizeBinaryType>:: PyBytesView view_; }; -template <typename T> -class PyPrimitiveConverter<T, enable_if_base_binary<T>> - : public PrimitiveConverter<T, PyConverter> { +template <typename T, typename OffsetType> +class PyBinaryConverter : public PrimitiveConverter<T, PyConverter> { public: - using OffsetType = typename T::offset_type; - Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { this->primitive_builder_->UnsafeAppendNull(); @@ -701,7 +702,7 @@ class PyPrimitiveConverter<T, enable_if_base_binary<T>> Result<std::shared_ptr<Array>> ToArray() override { 
ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter<T, PyConverter>::ToArray())); if (observed_binary_) { - // if we saw any non-unicode, cast results to BinaryArray + // if we saw any non-unicode, cast results to BinaryArray/BinaryViewArray auto binary_type = TypeTraits<typename T::PhysicalType>::type_singleton(); return array->View(binary_type); } else { @@ -714,6 +715,14 @@ class PyPrimitiveConverter<T, enable_if_base_binary<T>> bool observed_binary_ = false; }; +template <typename T> +class PyPrimitiveConverter<T, enable_if_base_binary<T>> + : public PyBinaryConverter<T, typename T::offset_type> {}; + +template <typename T> +class PyPrimitiveConverter<T, enable_if_binary_view_like<T>> + : public PyBinaryConverter<T, int64_t> {}; + template <typename U> class PyDictionaryConverter<U, enable_if_has_c_type<U>> : public DictionaryConverter<U, PyConverter> {
