This is an automated email from the ASF dual-hosted git repository. bkietz pushed a commit to branch feature/format-string-view in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 94fcb95927b1edc0c9805b674e4df6c5a31f9277 Author: Benjamin Kietzman <[email protected]> AuthorDate: Tue Nov 15 14:36:15 2022 -0500 fixes in substrait, rename in LICENSE, owning scalars --- LICENSE.txt | 2 +- cpp/src/arrow/array/builder_binary.h | 3 +- .../arrow/engine/substrait/expression_internal.cc | 9 ++++ cpp/src/arrow/engine/substrait/type_internal.cc | 7 +++ cpp/src/arrow/scalar.cc | 9 ---- cpp/src/arrow/scalar.h | 32 ++++-------- cpp/src/arrow/util/string_header.h | 60 ++++++++++------------ 7 files changed, 55 insertions(+), 67 deletions(-) diff --git a/LICENSE.txt b/LICENSE.txt index d282bfe7b3..02ac840980 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -2049,7 +2049,7 @@ License: http://www.apache.org/licenses/LICENSE-2.0 This project includes code from Velox. - * cpp/src/arrow/util/bytes_header.h + * cpp/src/arrow/util/string_header.h is based on Velox's diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index ca5209b81e..b9d926cb16 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -504,8 +504,7 @@ class ARROW_EXPORT StringHeapBuilder { /// UnsafeAppend operations without the need to allocate more memory Status Reserve(int64_t num_bytes) { if (num_bytes > current_remaining_bytes_) { - current_remaining_bytes_ = - num_bytes > kDefaultBlocksize ? num_bytes : kDefaultBlocksize; + current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_; ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> new_block, AllocateBuffer(current_remaining_bytes_, pool_)); current_out_buffer_ = new_block->mutable_data(); diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index b8b545febc..19ec354cf6 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -606,6 +606,15 @@ struct ScalarToProtoImpl { s); } + Status Visit(const StringViewScalar& s) { + return FromBuffer([](Lit* lit, std::string&& s) { lit->set_string(std::move(s)); }, + s); + } + Status Visit(const BinaryViewScalar& s) { + return FromBuffer([](Lit* lit, std::string&& s) { lit->set_binary(std::move(s)); }, + s); + } + Status Visit(const FixedSizeBinaryScalar& s) { return FromBuffer( [](Lit* lit, std::string&& s) { lit->set_fixed_binary(std::move(s)); }, s); diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index 16032df67d..e6b0b41a14 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -256,6 +256,13 @@ struct DataTypeToProtoImpl { return SetWith(&::substrait::Type::set_allocated_binary); } + Status Visit(const StringViewType& t) { + return SetWith(&::substrait::Type::set_allocated_string); + } + Status Visit(const BinaryViewType& t) { + return SetWith(&::substrait::Type::set_allocated_binary); + } + Status Visit(const FixedSizeBinaryType& t) { SetWithThen(&::substrait::Type::set_allocated_fixed_binary) ->set_length(t.byte_width()); diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index d139845bd7..bfe8a49a9e 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -70,12 +70,6 @@ struct ScalarHashImpl { Status Visit(const BaseBinaryScalar& s) { return BufferHash(*s.value); } - Status Visit(const BinaryViewScalar& s) { - const StringHeader& v = s.value; - hash_ ^= internal::ComputeStringHash<1>(v.data(), v.size()); - return Status::OK(); - } - template <typename T> Status Visit(const TemporalScalar<T>& s) { return ValueHash(s); @@ -508,9 +502,6 @@ Status Scalar::ValidateFull() const { BinaryScalar::BinaryScalar(std::string s) : BinaryScalar(Buffer::FromString(std::move(s))) {} -StringScalar::StringScalar(std::string s) - : StringScalar(Buffer::FromString(std::move(s))) {} - LargeBinaryScalar::LargeBinaryScalar(std::string s) : LargeBinaryScalar(Buffer::FromString(std::move(s))) {} diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 9b7f604132..9f41ad0975 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -251,7 +251,6 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase { return value ? std::string_view(*value) : std::string_view(); } - protected: BaseBinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type) : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {} }; @@ -260,9 +259,6 @@ struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryType; - BinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type) - : BaseBinaryScalar(std::move(value), std::move(type)) {} - explicit BinaryScalar(std::shared_ptr<Buffer> value) : BinaryScalar(std::move(value), binary()) {} @@ -278,37 +274,29 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { explicit StringScalar(std::shared_ptr<Buffer> value) : StringScalar(std::move(value), utf8()) {} - explicit StringScalar(std::string s); - StringScalar() : StringScalar(utf8()) {} }; -struct ARROW_EXPORT BinaryViewScalar : public internal::PrimitiveScalarBase { - using internal::PrimitiveScalarBase::PrimitiveScalarBase; +struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar { + using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryViewType; - explicit BinaryViewScalar(StringHeader value, std::shared_ptr<DataType> type) - : internal::PrimitiveScalarBase(std::move(type), true), value(value) {} - - explicit BinaryViewScalar(StringHeader value) - : BinaryViewScalar(value, binary_view()) {} - - BinaryViewScalar() : internal::PrimitiveScalarBase(binary_view(), false) {} - - void* mutable_data() override { return reinterpret_cast<void*>(&this->value); } + explicit BinaryViewScalar(std::shared_ptr<Buffer> value) + : BinaryViewScalar(std::move(value), binary_view()) {} - std::string_view view() const override { return std::string_view(this->value); } + BinaryViewScalar() : BinaryViewScalar(binary_view()) {} - StringHeader value; + std::string_view view() const override { return std::string_view(*this->value); } }; struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { + using BinaryViewScalar::BinaryViewScalar; using TypeClass = StringViewType; - explicit StringViewScalar(StringHeader value) - : BinaryViewScalar(std::move(value), utf8_view()) {} + explicit StringViewScalar(std::shared_ptr<Buffer> value) + : StringViewScalar(std::move(value), utf8_view()) {} - StringViewScalar() : BinaryViewScalar(utf8_view()) {} + StringViewScalar() : StringViewScalar(utf8_view()) {} }; struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h index 29f378a580..8ba18a8366 100644 --- a/cpp/src/arrow/util/string_header.h +++ b/cpp/src/arrow/util/string_header.h @@ -33,6 +33,7 @@ #pragma once +#include <array> #include <cassert> #include <cstdint> #include <cstring> @@ -69,35 +70,27 @@ struct StringHeader { static constexpr size_t kPrefixSize = 4; static constexpr size_t kInlineSize = 12; - StringHeader() { - static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes"); - ; - memset(this, 0, sizeof(StringHeader)); - } + StringHeader() = default; - explicit StringHeader(uint32_t size) : size_(size) { - memset(prefix_, 0, kPrefixSize); - value_.data = nullptr; + static StringHeader makeInline(uint32_t size, char** data) { + assert(size <= kInlineSize); + StringHeader s; + s.size_ = size; + *data = const_cast<char*>(s.data()); + return s; } - StringHeader(const char* data, size_t len) : size_(len) { + StringHeader(const char* data, size_t len) : size_(static_cast<uint32_t>(len)) { + if (size_ == 0) return; + // TODO: better option than assert? - assert(data || size_ == 0); + assert(data); if (IsInline()) { - // Zero the inline part. - // this makes sure that inline strings can be compared for equality with 2 - // int64 compares. - memset(prefix_, 0, kPrefixSize); - if (size_ == 0) { - return; - } - // small string: inlined. Zero the last 8 bytes first to allow for whole - // word comparison. - value_.data = nullptr; - memcpy(prefix_, data, size_); + // small string: inlined. Bytes beyond size_ are already 0 + memcpy(prefix_.data(), data, size_); } else { // large string: store pointer - memcpy(prefix_, data, kPrefixSize); + memcpy(prefix_.data(), data, kPrefixSize); value_.data = data; } } @@ -112,19 +105,20 @@ struct StringHeader { // StringHeader bh = "literal"; // std::optional<BytesView> obh = "literal"; // - /* implicit */ StringHeader(const char* data) : StringHeader(data, strlen(data)) {} + // NOLINTNEXTLINE runtime/explicit + StringHeader(const char* data) : StringHeader(data, strlen(data)) {} explicit StringHeader(const std::string& value) : StringHeader(value.data(), value.size()) {} - explicit StringHeader(const std::string_view& value) + explicit StringHeader(std::string_view value) : StringHeader(value.data(), value.size()) {} bool IsInline() const { return IsInline(size_); } static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; } - const char* data() const { return IsInline() ? prefix_ : value_.data; } + const char* data() const { return IsInline() ? prefix_.data() : value_.data; } size_t size() const { return size_; } @@ -160,7 +154,7 @@ struct StringHeader { if (PrefixAsInt() != other.PrefixAsInt()) { // The result is decided on prefix. The shorter will be less // because the prefix is padded with zeros. - return memcmp(prefix_, other.prefix_, kPrefixSize); + return memcmp(prefix_.data(), other.prefix_.data(), kPrefixSize); } int32_t size = std::min(size_, other.size_) - kPrefixSize; if (size <= 0) { @@ -168,7 +162,7 @@ struct StringHeader { return size_ - other.size_; } if (static_cast<uint32_t>(size) <= kInlineSize && IsInline() && other.IsInline()) { - int32_t result = memcmp(value_.inlined, other.value_.inlined, size); + int32_t result = memcmp(value_.inlined.data(), other.value_.inlined.data(), size); return (result != 0) ? result : size_ - other.size_; } int32_t result = memcmp(data() + kPrefixSize, other.data() + kPrefixSize, size); @@ -183,9 +177,7 @@ struct StringHeader { bool operator>=(const StringHeader& other) const { return Compare(other) >= 0; } - operator std::string() const { return std::string(data(), size()); } - - std::string GetString() const { return *this; } + std::string GetString() const { return std::string(data(), size()); } explicit operator std::string_view() const { return std::string_view(data(), size()); } @@ -208,12 +200,14 @@ struct StringHeader { // We rely on all members being laid out top to bottom . C++ // guarantees this. - uint32_t size_; - char prefix_[4]; + uint32_t size_ = 0; + std::array<char, 4> prefix_ = {0}; union { - char inlined[8]; + std::array<char, 8> inlined = {0}; const char* data; } value_; }; +static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes"); + } // namespace arrow
