This is an automated email from the ASF dual-hosted git repository. bkietz pushed a commit to branch feature/format-string-view in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 7801b484864619a1ef0c7a4ebd8b0d27bfe231d3 Author: Benjamin Kietzman <[email protected]> AuthorDate: Tue Nov 29 16:02:29 2022 -0500 add cast to/from string_view --- cpp/src/arrow/array/data.cc | 13 +- cpp/src/arrow/array/data.h | 21 +- cpp/src/arrow/array/validate.cc | 62 +++-- cpp/src/arrow/compute/exec.cc | 3 + .../arrow/compute/kernels/scalar_cast_internal.cc | 12 +- .../arrow/compute/kernels/scalar_cast_internal.h | 3 - .../arrow/compute/kernels/scalar_cast_numeric.cc | 4 +- .../arrow/compute/kernels/scalar_cast_string.cc | 277 ++++++++++++++------- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 66 +++-- 9 files changed, 300 insertions(+), 161 deletions(-) diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 0cfa9fcd2e..eb8249077d 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -145,7 +145,7 @@ void ArraySpan::SetMembers(const ArrayData& data) { } this->offset = data.offset; - for (int i = 0; i < static_cast<int>(data.buffers.size()); ++i) { + for (int i = 0; i < std::min(static_cast<int>(data.buffers.size()), 3); ++i) { const std::shared_ptr<Buffer>& buffer = data.buffers[i]; // It is the invoker-of-kernels's responsibility to ensure that // const buffers are not written to accidentally. @@ -291,6 +291,17 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } this->buffers[2].data = const_cast<uint8_t*>(data_buffer); this->buffers[2].size = data_size; + } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) { + const auto& scalar = checked_cast<const BaseBinaryScalar&>(value); + this->buffers[1].data = reinterpret_cast<uint8_t*>(this->scratch_space); + if (scalar.is_valid) { + *reinterpret_cast<StringHeader*>(this->buffers[1].data) = {scalar.value->data(), + scalar.value->size()}; + this->buffers[2].data = const_cast<uint8_t*>(scalar.value->data()); + this->buffers[2].size = scalar.value->size(); + } else { + *reinterpret_cast<StringHeader*>(this->buffers[1].data) = {}; + } } else if (type_id == Type::FIXED_SIZE_BINARY) { const auto& scalar = checked_cast<const BaseBinaryScalar&>(value); this->buffers[1].data = const_cast<uint8_t*>(scalar.value->data()); diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index e024483f66..b9991bd959 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -356,10 +356,10 @@ struct ARROW_EXPORT ArraySpan { void SetSlice(int64_t offset, int64_t length) { this->offset = offset; this->length = length; - if (this->type->id() != Type::NA) { - this->null_count = kUnknownNullCount; - } else { + if (this->type->id() == Type::NA) { this->null_count = this->length; + } else if (this->MayHaveNulls()) { + this->null_count = kUnknownNullCount; } } @@ -375,6 +375,21 @@ struct ARROW_EXPORT ArraySpan { namespace internal { +template <typename F> +Status VisitSlices(ArraySpan input, int64_t slice_size, const F& f) { + int64_t num_slices = input.length / slice_size; + int64_t trailing_slice_size = input.length % slice_size; + int64_t offset = input.offset; + + for (int64_t i = 0; i < num_slices; ++i) { + input.SetSlice(offset, slice_size); + ARROW_RETURN_NOT_OK(f(input)); + offset += slice_size; + } + input.SetSlice(offset, trailing_slice_size); + return f(input); +} + void FillZeroLengthArray(const DataType* type, ArraySpan* span); /// Construct a zero-copy view of this ArrayData with the given type. diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 2b83c84b28..4b10951724 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -171,39 +171,29 @@ namespace { struct UTF8DataValidator { const ArrayData& data; - Status Visit(const DataType&) { Unreachable("utf-8 validation of non string type"); } - - Status Visit(const StringViewType&) { - util::InitializeUTF8(); - - const auto* values = data.GetValues<StringHeader>(1); - for (int64_t i = 0; i < data.length; ++i) { - if (ARROW_PREDICT_FALSE(!util::ValidateUTF8( - reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size()))) { - return Status::Invalid("Invalid UTF8 sequence at string index ", i); - } + template <typename T> + Status Visit(const T&) { + if constexpr (std::is_same_v<T, StringType> || std::is_same_v<T, LargeStringType> || + std::is_same_v<T, StringViewType>) { + util::InitializeUTF8(); + + int64_t i = 0; + return VisitArraySpanInline<T>( + data, + [&](std::string_view v) { + if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { + return Status::Invalid("Invalid UTF8 sequence at string index ", i); + } + ++i; + return Status::OK(); + }, + [&]() { + ++i; + return Status::OK(); + }); + } else { + Unreachable("utf-8 validation of non string type"); } - return Status::OK(); - } - - template <typename StringType> - enable_if_string<StringType, Status> Visit(const StringType&) { - util::InitializeUTF8(); - - int64_t i = 0; - return VisitArraySpanInline<StringType>( - data, - [&](std::string_view v) { - if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { - return Status::Invalid("Invalid UTF8 sequence at string index ", i); - } - ++i; - return Status::OK(); - }, - [&]() { - ++i; - return Status::OK(); - }); } }; @@ -304,6 +294,14 @@ struct ValidateArrayImpl { return Status::OK(); } + Status Visit(const StringViewType& type) { + RETURN_NOT_OK(ValidateBinaryView(type)); + if (full_validation) { + RETURN_NOT_OK(ValidateUTF8(data)); + } + return Status::OK(); + } + Status Visit(const Date64Type& type) { RETURN_NOT_OK(ValidateFixedWidthBuffers()); diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 8d3dcf0f2c..3b868cc716 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -232,6 +232,9 @@ void ComputeDataPreallocate(const DataType& type, case Type::LARGE_LIST: widths->emplace_back(64, /*added_length=*/1); return; + case Type::BINARY_VIEW: + case Type::STRING_VIEW: + widths->emplace_back(8 * sizeof(StringHeader)); default: break; } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc index 27a86135a6..8f5467ee84 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc @@ -170,12 +170,6 @@ Status CastFromNull(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) return Status::OK(); } -Result<TypeHolder> ResolveOutputFromOptions(KernelContext* ctx, - const std::vector<TypeHolder>&) { - const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options; - return options.to_type; -} - /// You will see some of kernels with /// /// kOutputTargetType @@ -184,8 +178,10 @@ Result<TypeHolder> ResolveOutputFromOptions(KernelContext* ctx, /// easiest initial way to get the requested cast type including the TimeUnit /// to the kernel (which is needed to compute the output) was through /// CastOptions - -OutputType kOutputTargetType(ResolveOutputFromOptions); +OutputType kOutputTargetType([](KernelContext* ctx, + const std::vector<TypeHolder>&) -> Result<TypeHolder> { + return CastState::Get(ctx).to_type; +}); Status ZeroCopyCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // TODO(wesm): alternative strategy for zero copy casts after ARROW-16576 diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h index 4d9afab199..bd8f41ea9f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h @@ -71,9 +71,6 @@ void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_ty CastFunction* func); // OutputType::Resolver that returns a type the type from CastOptions -Result<TypeHolder> ResolveOutputFromOptions(KernelContext* ctx, - const std::vector<TypeHolder>& args); - ARROW_EXPORT extern OutputType kOutputTargetType; // Add generic casts to out_ty from: diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index 00c7cacf9c..e68b08c804 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -726,7 +726,7 @@ std::shared_ptr<CastFunction> GetCastToFloating(std::string name) { } std::shared_ptr<CastFunction> GetCastToDecimal128() { - OutputType sig_out_ty(ResolveOutputFromOptions); + OutputType sig_out_ty = kOutputTargetType; auto func = std::make_shared<CastFunction>("cast_decimal", Type::DECIMAL128); AddCommonCasts(Type::DECIMAL128, sig_out_ty, func.get()); @@ -761,7 +761,7 @@ std::shared_ptr<CastFunction> GetCastToDecimal128() { } std::shared_ptr<CastFunction> GetCastToDecimal256() { - OutputType sig_out_ty(ResolveOutputFromOptions); + OutputType sig_out_ty = kOutputTargetType; auto func = std::make_shared<CastFunction>("cast_decimal256", Type::DECIMAL256); AddCommonCasts(Type::DECIMAL256, sig_out_ty, func.get()); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 44e233f98c..68ba6268ae 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -25,8 +25,10 @@ #include "arrow/compute/kernels/scalar_cast_internal.h" #include "arrow/compute/kernels/temporal_internal.h" #include "arrow/result.h" +#include "arrow/util/cpu_info.h" #include "arrow/util/formatting.h" #include "arrow/util/int_util.h" +#include "arrow/util/unreachable.h" #include "arrow/util/utf8_internal.h" #include "arrow/visit_data_inline.h" @@ -284,107 +286,192 @@ Status CastBinaryToBinaryOffsets<int64_t, int32_t>(KernelContext* ctx, } template <typename O, typename I> -enable_if_base_binary<I, Status> BinaryToBinaryCastExec(KernelContext* ctx, - const ExecSpan& batch, - ExecResult* out) { - const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options; +Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { const ArraySpan& input = batch[0].array; - if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) { + // This presupposes that one was created in the invocation layer + ArrayData* output = out->array_data().get(); + output->SetNullCount(input.null_count); + + const auto& options = CastState::Get(ctx); + bool check_utf8 = !I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8; + if (check_utf8) { InitializeUTF8(); - ArraySpanVisitor<I> visitor; - Utf8Validator validator; - RETURN_NOT_OK(visitor.Visit(input, &validator)); } - // Start with a zero-copy cast, but change indices to expected size - RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out)); - return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>( - ctx, input, out->array_data().get()); -} + auto SimpleUtf8Validation = [&] { + if (check_utf8) { + Utf8Validator validator; + return ArraySpanVisitor<I>::Visit(input, &validator); + } + return Status::OK(); + }; -template <typename O, typename I> -enable_if_t<std::is_same<I, FixedSizeBinaryType>::value && - !std::is_same<O, FixedSizeBinaryType>::value, - Status> -BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options; - const ArraySpan& input = batch[0].array; + constexpr bool kInputOffsets = + std::is_base_of_v<BinaryType, I> || std::is_base_of_v<LargeBinaryType, I>; - if (O::is_utf8 && !options.allow_invalid_utf8) { - InitializeUTF8(); - ArraySpanVisitor<I> visitor; - Utf8Validator validator; - RETURN_NOT_OK(visitor.Visit(input, &validator)); + constexpr bool kInputViews = std::is_base_of_v<BinaryViewType, I>; + + constexpr bool kInputFixed = std::is_same_v<FixedSizeBinaryType, I>; + + constexpr bool kOutputOffsets = + std::is_base_of_v<BinaryType, O> || std::is_base_of_v<LargeBinaryType, O>; + + constexpr bool kOutputViews = std::is_base_of_v<BinaryViewType, O>; + + constexpr bool kOutputFixed = std::is_same_v<FixedSizeBinaryType, O>; + + if constexpr (kInputOffsets && kOutputOffsets) { + // FIXME(bkietz) this discards preallocated storage. It seems preferable to me to + // allocate a new null bitmap if necessary than to always allocate new offsets. + // Start with a zero-copy cast, but change indices to expected size + RETURN_NOT_OK(SimpleUtf8Validation()); + RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out)); + return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>( + ctx, input, out->array_data().get()); } - // Check for overflow - using output_offset_type = typename O::offset_type; - constexpr output_offset_type kMaxOffset = - std::numeric_limits<output_offset_type>::max(); - const int32_t width = input.type->byte_width(); - const int64_t max_offset = width * input.length; - if (max_offset > kMaxOffset) { - return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", - out->type()->ToString(), ": input array too large"); + if constexpr (kInputViews && kOutputViews) { + return SimpleUtf8Validation() & ZeroCopyCastExec(ctx, batch, out); } - // This presupposes that one was created in the invocation layer - ArrayData* output = out->array_data().get(); + if constexpr (kInputViews && kOutputOffsets) { + // FIXME(bkietz) this discards preallocated offset storage + typename TypeTraits<O>::BuilderType builder{ctx->memory_pool()}; - // Copy buffers over, then generate indices - output->length = input.length; - output->SetNullCount(input.null_count); - if (input.offset == output->offset) { - output->buffers[0] = input.GetBuffer(0); - } else { - ARROW_ASSIGN_OR_RAISE( - output->buffers[0], - arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, - input.offset, input.length)); - } + RETURN_NOT_OK(builder.Reserve(input.length)); + // TODO(bkietz) if ArraySpan::buffers were a SmallVector, we could have access to all + // the character data buffers here and reserve character data accordingly. + + // sweep through L1-sized chunks to reduce the frequency of allocation + int64_t chunk_size = ctx->exec_context()->cpu_info()->CacheSize( + ::arrow::internal::CpuInfo::CacheLevel::L1) / + sizeof(StringHeader) / 4; + + RETURN_NOT_OK(::arrow::internal::VisitSlices( + input, chunk_size, [&](const ArraySpan& input_slice) { + int64_t num_chars = builder.value_data_length(), num_appended_chars = 0; + VisitArraySpanInline<I>( + input_slice, + [&](std::string_view v) { + num_appended_chars += static_cast<int64_t>(v.size()); + }, + [] {}); + + RETURN_NOT_OK(builder.ReserveData(num_appended_chars)); + + VisitArraySpanInline<I>( + input_slice, [&](std::string_view v) { builder.UnsafeAppend(v); }, + [&] { builder.UnsafeAppendNull(); }); + + if (check_utf8) { + if (ARROW_PREDICT_FALSE(!ValidateUTF8Inline(builder.value_data() + num_chars, + num_appended_chars))) { + return Status::Invalid("Invalid UTF8 sequence"); + } + } + return Status::OK(); + })); - // This buffer is preallocated - output_offset_type* offsets = output->GetMutableValues<output_offset_type>(1); - offsets[0] = static_cast<output_offset_type>(input.offset * width); - for (int64_t i = 0; i < input.length; i++) { - offsets[i + 1] = offsets[i] + width; + return builder.FinishInternal(std::get_if<std::shared_ptr<ArrayData>>(&out->value)); } - // Data buffer (index 1) for FWBinary becomes data buffer for VarBinary - // (index 2). After ARROW-16757, we need to copy this memory instead of - // zero-copy it because a Scalar value promoted to an ArraySpan may be - // referencing a temporary buffer whose scope does not extend beyond the - // kernel execution. In that scenario, the validity bitmap above can be - // zero-copied because it points to static memory (either a byte with a 1 or - // a 0 depending on whether the value is null or not). - std::shared_ptr<Buffer> input_data = input.GetBuffer(1); - if (input_data != nullptr) { - ARROW_ASSIGN_OR_RAISE(output->buffers[2], input_data->CopySlice(0, input_data->size(), - ctx->memory_pool())); - } else { - // TODO(wesm): it should already be nullptr, so we may be able to remove - // this - output->buffers[2] = nullptr; + if constexpr ((kInputOffsets || kInputFixed) && kOutputViews) { + // we can reuse the data buffer here and just add views which reference it + if (input.MayHaveNulls()) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } + // FIXME(bkietz) segfault due to null buffer owner + // output->buffers[2] = input.GetBuffer(kInputFixed ? 1 : 2); + + auto* headers = output->buffers[1]->mutable_data_as<StringHeader>(); + if (check_utf8) { + Utf8Validator validator; + return VisitArraySpanInline<I>( + input, + [&](std::string_view v) { + *headers++ = StringHeader{v}; + return validator.VisitValue(v); + }, + [&] { + *headers++ = StringHeader{}; + return Status::OK(); + }); + } else { + VisitArraySpanInline<I>( + input, [&](std::string_view v) { *headers++ = StringHeader{v}; }, + [&] { *headers++ = StringHeader{}; }); + return Status::OK(); + } } - return Status::OK(); -} + if constexpr (kInputFixed && kOutputOffsets) { + RETURN_NOT_OK(SimpleUtf8Validation()); -template <typename O, typename I> -enable_if_t<std::is_same<I, FixedSizeBinaryType>::value && - std::is_same<O, FixedSizeBinaryType>::value, - Status> -BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options; - const int32_t in_width = batch[0].type()->byte_width(); - const int32_t out_width = - checked_cast<const FixedSizeBinaryType&>(*options.to_type).byte_width(); - if (in_width != out_width) { - return Status::Invalid("Failed casting from ", batch[0].type()->ToString(), " to ", - options.to_type.ToString(), ": widths must match"); + using output_offset_type = typename O::offset_type; + + int32_t width = input.type->byte_width(); + + if constexpr (std::is_same_v<output_offset_type, int32_t>) { + // Check for overflow + if (width * input.length > std::numeric_limits<int32_t>::max()) { + return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", + out->type()->ToString(), ": input array too large"); + } + } + + // Copy buffers over, then generate indices + output->length = input.length; + output->SetNullCount(input.null_count); + if (input.offset == output->offset) { + output->buffers[0] = input.GetBuffer(0); + } else { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } + + // This buffer is preallocated + auto* offsets = output->buffers[1]->mutable_data_as<output_offset_type>(); + offsets[0] = static_cast<output_offset_type>(input.offset * width); + for (int64_t i = 0; i < input.length; i++) { + offsets[i + 1] = offsets[i] + width; + } + + // Data buffer (index 1) for FWBinary becomes data buffer for VarBinary + // (index 2). After ARROW-16757, we need to copy this memory instead of + // zero-copy it because a Scalar value promoted to an ArraySpan may be + // referencing a temporary buffer whose scope does not extend beyond the + // kernel execution. In that scenario, the validity bitmap above can be + // zero-copied because it points to static memory (either a byte with a 1 or + // a 0 depending on whether the value is null or not). + if (std::shared_ptr<Buffer> input_data = input.GetBuffer(1)) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[2], + input_data->CopySlice(0, input_data->size(), ctx->memory_pool())); + } else { + // TODO(wesm): it should already be nullptr, so we may be able to remove + // this + output->buffers[2] = nullptr; + } + + return Status::OK(); } - return ZeroCopyCastExec(ctx, batch, out); + + if constexpr (kInputFixed && kOutputFixed) { + if (input.type->byte_width() != output->type->byte_width()) { + return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", + output->type->ToString(), ": widths must match"); + } + return ZeroCopyCastExec(ctx, batch, out); + } + + Unreachable(); } #if defined(_MSC_VER) @@ -447,6 +534,8 @@ template <typename OutType> void AddBinaryToBinaryCast(CastFunction* func) { AddBinaryToBinaryCast<OutType, StringType>(func); AddBinaryToBinaryCast<OutType, BinaryType>(func); + AddBinaryToBinaryCast<OutType, StringViewType>(func); + AddBinaryToBinaryCast<OutType, BinaryViewType>(func); AddBinaryToBinaryCast<OutType, LargeStringType>(func); AddBinaryToBinaryCast<OutType, LargeBinaryType>(func); AddBinaryToBinaryCast<OutType, FixedSizeBinaryType>(func); @@ -459,6 +548,11 @@ std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() { AddCommonCasts(Type::BINARY, binary(), cast_binary.get()); AddBinaryToBinaryCast<BinaryType>(cast_binary.get()); + auto cast_binary_view = + std::make_shared<CastFunction>("cast_binary_view", Type::BINARY_VIEW); + AddCommonCasts(Type::BINARY_VIEW, binary_view(), cast_binary_view.get()); + AddBinaryToBinaryCast<BinaryViewType>(cast_binary_view.get()); + auto cast_large_binary = std::make_shared<CastFunction>("cast_large_binary", Type::LARGE_BINARY); AddCommonCasts(Type::LARGE_BINARY, large_binary(), cast_large_binary.get()); @@ -471,6 +565,14 @@ std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() { AddTemporalToStringCasts<StringType>(cast_string.get()); AddBinaryToBinaryCast<StringType>(cast_string.get()); + auto cast_string_view = + std::make_shared<CastFunction>("cast_string_view", Type::STRING_VIEW); + AddCommonCasts(Type::STRING_VIEW, utf8_view(), cast_string_view.get()); + AddNumberToStringCasts<StringViewType>(cast_string_view.get()); + AddDecimalToStringCasts<StringViewType>(cast_string_view.get()); + AddTemporalToStringCasts<StringViewType>(cast_string_view.get()); + AddBinaryToBinaryCast<StringViewType>(cast_string_view.get()); + auto cast_large_string = std::make_shared<CastFunction>("cast_large_string", Type::LARGE_STRING); AddCommonCasts(Type::LARGE_STRING, large_utf8(), cast_large_string.get()); @@ -481,15 +583,16 @@ std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() { auto cast_fsb = std::make_shared<CastFunction>("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY); - AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions), - cast_fsb.get()); + AddCommonCasts(Type::FIXED_SIZE_BINARY, kOutputTargetType, cast_fsb.get()); DCHECK_OK(cast_fsb->AddKernel( - Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)}, - OutputType(FirstType), + Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)}, kOutputTargetType, BinaryToBinaryCastExec<FixedSizeBinaryType, FixedSizeBinaryType>, NullHandling::COMPUTED_NO_PREALLOCATE)); - return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb}; + return { + cast_binary, cast_binary_view, cast_large_binary, cast_string, + cast_string_view, cast_large_string, cast_fsb, + }; } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 85da81357b..9c10b85b3c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -145,7 +145,7 @@ static std::shared_ptr<Array> MaskArrayWithNullsAt(std::shared_ptr<Array> input, using arrow::internal::Bitmap; Bitmap is_valid(masked->buffers[0], 0, input->length()); - if (auto original = input->null_bitmap()) { + if (const auto& original = input->null_bitmap()) { is_valid.CopyFrom(Bitmap(original, input->offset(), input->length())); } else { is_valid.SetBitsTo(true); @@ -154,7 +154,7 @@ static std::shared_ptr<Array> MaskArrayWithNullsAt(std::shared_ptr<Array> input, for (int i : indices_to_mask) { is_valid.SetBitTo(i, false); } - return MakeArray(masked); + return MakeArray(std::move(masked)); } TEST(Cast, CanCast) { @@ -167,6 +167,9 @@ TEST(Cast, CanCast) { } }; + ExpectCanCast(boolean(), {utf8()}); + return; + auto ExpectCannotCast = [ExpectCanCast](std::shared_ptr<DataType> from, std::vector<std::shared_ptr<DataType>> to_set) { ExpectCanCast(from, to_set, /*expected=*/false); @@ -198,17 +201,21 @@ TEST(Cast, CanCast) { ExpectCannotCast(from_numeric, {null()}); } - for (auto from_base_binary : kBaseBinaryTypes) { - ExpectCanCast(from_base_binary, {boolean()}); - ExpectCanCast(from_base_binary, kNumericTypes); - ExpectCanCast(from_base_binary, kBaseBinaryTypes); - ExpectCanCast(dictionary(int64(), from_base_binary), {from_base_binary}); + auto base_binary_and_view_types = kBaseBinaryTypes; + base_binary_and_view_types.push_back(binary_view()); + base_binary_and_view_types.push_back(utf8_view()); + + for (auto from : base_binary_and_view_types) { + ExpectCanCast(from, {boolean()}); + ExpectCanCast(from, kNumericTypes); + ExpectCanCast(from, base_binary_and_view_types); + ExpectCanCast(dictionary(int64(), from), {from}); // any cast which is valid for the dictionary is valid for the DictionaryArray - ExpectCanCast(dictionary(uint32(), from_base_binary), kBaseBinaryTypes); - ExpectCanCast(dictionary(int16(), from_base_binary), kNumericTypes); + ExpectCanCast(dictionary(uint32(), from), kBaseBinaryTypes); + ExpectCanCast(dictionary(int16(), from), kNumericTypes); - ExpectCannotCast(from_base_binary, {null()}); + ExpectCannotCast(from, {null()}); } ExpectCanCast(utf8(), {timestamp(TimeUnit::MILLI)}); @@ -1029,7 +1036,7 @@ TEST(Cast, DecimalToFloating) { } TEST(Cast, DecimalToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { for (auto decimal_type : {decimal128(5, 2), decimal256(5, 2)}) { CheckCast(ArrayFromJSON(decimal_type, R"(["0.00", null, "123.45", "999.99"])"), ArrayFromJSON(string_type, R"(["0.00", null, "123.45", "999.99"])")); @@ -1540,7 +1547,7 @@ TEST(Cast, TimeZeroCopy) { } TEST(Cast, DateToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(date32(), "[0, null]"), ArrayFromJSON(string_type, R"(["1970-01-01", null])")); CheckCast(ArrayFromJSON(date64(), "[86400000, null]"), @@ -1549,7 +1556,7 @@ TEST(Cast, DateToString) { } TEST(Cast, TimeToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(time32(TimeUnit::SECOND), "[1, 62]"), ArrayFromJSON(string_type, R"(["00:00:01", "00:01:02"])")); CheckCast( @@ -1559,7 +1566,7 @@ TEST(Cast, TimeToString) { } TEST(Cast, TimestampToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast( ArrayFromJSON(timestamp(TimeUnit::SECOND), "[-30610224000, -5364662400]"), ArrayFromJSON(string_type, R"(["1000-01-01 00:00:00", "1800-01-01 00:00:00"])")); @@ -1585,7 +1592,7 @@ TEST(Cast, TimestampToString) { } TEST_F(CastTimezone, TimestampWithZoneToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast( ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), "[-30610224000, -5364662400]"), ArrayFromJSON(string_type, @@ -1771,7 +1778,7 @@ TEST(Cast, DurationToDurationMultiplyOverflow) { } TEST(Cast, DurationToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { for (auto unit : TimeUnit::values()) { CheckCast(ArrayFromJSON(duration(unit), "[0, null, 1234567, 2000]"), ArrayFromJSON(string_type, R"(["0", null, "1234567", "2000"])")); @@ -2008,6 +2015,10 @@ TEST(Cast, StringToTimestamp) { } static void AssertBinaryZeroCopy(std::shared_ptr<Array> lhs, std::shared_ptr<Array> rhs) { + for (auto id : {lhs->type_id(), rhs->type_id()}) { + // views cannot be zero copied + if (id == Type::BINARY_VIEW || id == Type::STRING_VIEW) return; + } // null bitmap and data buffers are always zero-copied AssertBufferSame(*lhs, *rhs, 0); AssertBufferSame(*lhs, *rhs, 2); @@ -2031,8 +2042,9 @@ static void AssertBinaryZeroCopy(std::shared_ptr<Array> lhs, std::shared_ptr<Arr } TEST(Cast, BinaryToString) { - for (auto bin_type : {binary(), large_binary()}) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto bin_type : {binary(), large_binary(), binary_view()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { + ARROW_SCOPED_TRACE(*bin_type, " to ", *string_type); // empty -> empty always works CheckCast(ArrayFromJSON(bin_type, "[]"), ArrayFromJSON(string_type, "[]")); @@ -2050,13 +2062,14 @@ TEST(Cast, BinaryToString) { options.allow_invalid_utf8 = true; ASSERT_OK_AND_ASSIGN(auto strings, Cast(*invalid_utf8, string_type, options)); ASSERT_RAISES(Invalid, strings->ValidateFull()); + AssertBinaryZeroCopy(invalid_utf8, strings); } } auto from_type = fixed_size_binary(3); auto invalid_utf8 = FixedSizeInvalidUtf8(from_type); - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(from_type, "[]"), ArrayFromJSON(string_type, "[]")); // invalid utf-8 masked by a null bit is not an error @@ -2075,9 +2088,12 @@ TEST(Cast, BinaryToString) { // N.B. null buffer is not always the same if input sliced AssertBufferSame(*invalid_utf8, *strings, 0); - // ARROW-16757: we no longer zero copy, but the contents are equal - ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get()); - ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2])); + if (string_type->id() != Type::STRING_VIEW) { + // ARROW-16757: we no longer zero copy, but the contents are equal + ASSERT_NE(invalid_utf8->data()->buffers[1].get(), + strings->data()->buffers[2].get()); + ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2])); + } } } @@ -2146,7 +2162,7 @@ TEST(Cast, StringToString) { } TEST(Cast, IntToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"), ArrayFromJSON(string_type, R"(["0", "1", "127", "-128", null])")); @@ -2178,7 +2194,7 @@ TEST(Cast, IntToString) { } TEST(Cast, FloatingToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast( ArrayFromJSON(float32(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"), ArrayFromJSON(string_type, R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])")); @@ -2190,7 +2206,7 @@ TEST(Cast, FloatingToString) { } TEST(Cast, BooleanToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(boolean(), "[true, true, false, null]"), ArrayFromJSON(string_type, R"(["true", "true", "false", null])")); }
