bkietz commented on a change in pull request #10511: URL: https://github.com/apache/arrow/pull/10511#discussion_r659950211
########## File path: cpp/src/arrow/array/builder_base.cc ########## @@ -92,6 +95,119 @@ Status ArrayBuilder::Advance(int64_t elements) { return null_bitmap_builder_.Advance(elements); } +struct AppendScalarImpl { + template <typename T, typename AppendScalar, + typename BuilderType = typename TypeTraits<T>::BuilderType, + typename ScalarType = typename TypeTraits<T>::ScalarType> + Status UseBuilder(const AppendScalar& append) { + for (const auto scalar : scalars_) { + if (scalar->is_valid) { + RETURN_NOT_OK(append(internal::checked_cast<const ScalarType&>(*scalar), + static_cast<BuilderType*>(builder_))); + } else { + RETURN_NOT_OK(builder_->AppendNull()); + } + } + return Status::OK(); + } + + struct AppendValue { + template <typename BuilderType, typename ScalarType> + Status operator()(const ScalarType& s, BuilderType* builder) const { + return builder->Append(s.value); + } + }; + + struct AppendBuffer { + template <typename BuilderType, typename ScalarType> + Status operator()(const ScalarType& s, BuilderType* builder) const { + const Buffer& buffer = *s.value; + return builder->Append(util::string_view{buffer}); + } + }; + + struct AppendList { + template <typename BuilderType, typename ScalarType> + Status operator()(const ScalarType& s, BuilderType* builder) const { + RETURN_NOT_OK(builder->Append()); + const Array& list = *s.value; + for (int64_t i = 0; i < list.length(); i++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); + RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); + } + return Status::OK(); + } + }; + + template <typename T> + enable_if_has_c_type<T, Status> Visit(const T&) { + return UseBuilder<T>(AppendValue{}); + } + + template <typename T> + enable_if_has_string_view<T, Status> Visit(const T&) { + return UseBuilder<T>(AppendBuffer{}); + } + + template <typename T> + enable_if_decimal<T, Status> Visit(const T&) { + return UseBuilder<T>(AppendValue{}); + } + + template <typename T> + enable_if_list_like<T, Status> Visit(const T&) { + return UseBuilder<T>(AppendList{}); + } + + Status Visit(const StructType& type) { + auto* builder = static_cast<StructBuilder*>(builder_); + for (const auto s : scalars_) { + const auto& scalar = internal::checked_cast<const StructScalar&>(*s); + for (int field_index = 0; field_index < type.num_fields(); ++field_index) { + if (!scalar.is_valid || !scalar.value[field_index]) { + RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull()); + } else { + RETURN_NOT_OK(builder->field_builder(field_index) + ->AppendScalar(*scalar.value[field_index])); + } + } + RETURN_NOT_OK(builder->Append(scalar.is_valid)); + } + return Status::OK(); + } + + Status Visit(const DataType& type) { + return Status::NotImplemented("AppendScalar for type ", type); + } + + Status Convert() { return VisitTypeInline(*scalars_[0]->type, this); } + + std::vector<const Scalar*> scalars_; + ArrayBuilder* builder_; +}; + +Status ArrayBuilder::AppendScalar(const Scalar& scalar) { + if (!scalar.type->Equals(type())) { + return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), + " to builder for type ", type()->ToString()); + } + return AppendScalarImpl{{&scalar}, this}.Convert(); +} + +Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) { + if (scalars.empty()) return Status::OK(); + std::vector<const Scalar*> refs; + refs.reserve(scalars.size()); + for (const auto& scalar : scalars) { + if (!scalar->type->Equals(type())) { Review comment: `ArrayBuilder::type` is virtual, so it should be kept out of loops when possible ```suggestion auto builder_type = type(); for (const auto& scalar : scalars) { if (!scalar->type->Equals(*builder_type)) { ``` ########## File path: cpp/src/arrow/array/builder_base.cc ########## @@ -92,6 +95,119 @@ Status ArrayBuilder::Advance(int64_t elements) { return null_bitmap_builder_.Advance(elements); } +struct AppendScalarImpl { + template <typename T, typename AppendScalar, + typename BuilderType = typename TypeTraits<T>::BuilderType, + typename ScalarType = typename TypeTraits<T>::ScalarType> + Status UseBuilder(const AppendScalar& append) { + for (const auto scalar : scalars_) { + if (scalar->is_valid) { + RETURN_NOT_OK(append(internal::checked_cast<const ScalarType&>(*scalar), + static_cast<BuilderType*>(builder_))); + } else { + RETURN_NOT_OK(builder_->AppendNull()); + } + } + return Status::OK(); + } + + struct AppendValue { + template <typename BuilderType, typename ScalarType> + Status operator()(const ScalarType& s, BuilderType* builder) const { + return builder->Append(s.value); + } + }; + + struct AppendBuffer { + template <typename BuilderType, typename ScalarType> + Status operator()(const ScalarType& s, BuilderType* builder) const { + const Buffer& buffer = *s.value; + return builder->Append(util::string_view{buffer}); + } + }; + + struct AppendList { + template <typename BuilderType, typename ScalarType> + Status operator()(const ScalarType& s, BuilderType* builder) const { + RETURN_NOT_OK(builder->Append()); + const Array& list = *s.value; + for (int64_t i = 0; i < list.length(); i++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); + RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); + } + return Status::OK(); + } + }; + + template <typename T> + enable_if_has_c_type<T, Status> Visit(const T&) { + return UseBuilder<T>(AppendValue{}); + } + + template <typename T> + enable_if_has_string_view<T, Status> Visit(const T&) { + return UseBuilder<T>(AppendBuffer{}); + } + + template <typename T> + enable_if_decimal<T, Status> Visit(const T&) { + return UseBuilder<T>(AppendValue{}); + } + + template <typename T> + enable_if_list_like<T, Status> Visit(const T&) { + return UseBuilder<T>(AppendList{}); + } + + Status Visit(const StructType& type) { + auto* builder = static_cast<StructBuilder*>(builder_); + for (const auto s : scalars_) { + const auto& scalar = internal::checked_cast<const StructScalar&>(*s); + for (int field_index = 0; field_index < type.num_fields(); ++field_index) { + if (!scalar.is_valid || !scalar.value[field_index]) { + RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull()); + } else { + RETURN_NOT_OK(builder->field_builder(field_index) + ->AppendScalar(*scalar.value[field_index])); + } + } + RETURN_NOT_OK(builder->Append(scalar.is_valid)); + } + return Status::OK(); + } + + Status Visit(const DataType& type) { + return Status::NotImplemented("AppendScalar for type ", type); + } + + Status Convert() { return VisitTypeInline(*scalars_[0]->type, this); } + + std::vector<const Scalar*> scalars_; Review comment: Instead, let's ensure the ScalarVector case doesn't incur allocation or other O(N) costs: ```suggestion const std::shared_ptr<Scalar>* scalars_begin_; const std::shared_ptr<Scalar>* scalars_end_; ``` We can accommodate `ArrayBuilder::AppendScalar(const Scalar& scalar)` by declaring a shared pointer with an empty destructor: ```c++ Status ArrayBuilder::AppendScalar(const Scalar& scalar) { // ... std::shared_ptr<Scalar> shared{&scalar, [](Scalar*) {}}; return AppendScalarImpl{&shared, &shared + 1, this}.Convert(); } ``` ########## File path: cpp/src/arrow/compute/util_internal.h ########## @@ -17,6 +17,9 @@ #pragma once +#include <iosfwd> +#include <vector> + Review comment: ```suggestion ``` These headers should be included directly by the files which need them ########## File path: cpp/src/arrow/compute/function.h ########## @@ -39,12 +40,49 @@ namespace compute { /// /// @{ +/// \brief Extension point for defining options outside libarrow (but +/// still within this project). +class ARROW_EXPORT FunctionOptionsType { + public: + virtual ~FunctionOptionsType() = default; + + virtual const char* type_name() const = 0; + virtual std::string Stringify(const FunctionOptions&) const = 0; + virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0; + /// \brief Convert this options struct into a Struct scalar. + virtual Status ToStructScalar(const FunctionOptions&, + std::vector<std::string>* field_names, + std::vector<std::shared_ptr<Scalar>>* values) const; + /// \brief Deserialize an options struct from a Struct scalar. + virtual Result<std::unique_ptr<FunctionOptions>> FromStructScalar( + const StructScalar& scalar) const; +}; + /// \brief Base class for specifying options configuring a function's behavior, /// such as error handling. -struct ARROW_EXPORT FunctionOptions { +class ARROW_EXPORT FunctionOptions : public util::EqualityComparable<FunctionOptions> { + public: virtual ~FunctionOptions() = default; + + const FunctionOptionsType* options_type() const { return options_type_; } Review comment: ```suggestion const FunctionOptionsType* options_type() const { return options_type_; } const char* type_name() const { return options_type()->type_name(); } ``` ########## File path: cpp/src/arrow/compute/api_scalar.h ########## @@ -178,23 +206,22 @@ enum CompareOperator : int8_t { LESS_EQUAL, }; -struct CompareOptions : public FunctionOptions { - explicit CompareOptions(CompareOperator op) : op(op) {} +class ARROW_EXPORT CompareOptions : public FunctionOptions { Review comment: CompareOptions are not actually used by any compute::Function and should probably not inherit `FunctionOptions` at all ########## File path: cpp/src/arrow/compute/exec/expression.cc ########## @@ -994,86 +936,14 @@ Result<Expression> SimplifyWithGuarantee(Expression expr, namespace { -Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar( - const Expression::Call& call) { - if (call.options == nullptr) { - return nullptr; - } - - if (auto options = GetSetLookupOptions(call)) { - if (!options->value_set.is_array()) { - return Status::NotImplemented("chunked value_set"); - } - return StructScalar::Make( - { - std::make_shared<ListScalar>(options->value_set.make_array()), - MakeScalar(options->skip_nulls), - }, - {"value_set", "skip_nulls"}); - } - - if (auto options = GetCastOptions(call)) { - return StructScalar::Make( - { - MakeNullScalar(options->to_type), - MakeScalar(options->allow_int_overflow), - MakeScalar(options->allow_time_truncate), - MakeScalar(options->allow_time_overflow), - MakeScalar(options->allow_decimal_truncate), - MakeScalar(options->allow_float_truncate), - MakeScalar(options->allow_invalid_utf8), - }, - { - "to_type_holder", - "allow_int_overflow", - "allow_time_truncate", - "allow_time_overflow", - "allow_decimal_truncate", - "allow_float_truncate", - "allow_invalid_utf8", - }); - } - - return Status::NotImplemented("conversion of options for ", call.function_name); -} - Status FunctionOptionsFromStructScalar(const StructScalar* repr, Expression::Call* call) { Review comment: This can just be inlined now ########## File path: cpp/src/arrow/compute/function_internal.h ########## @@ -0,0 +1,565 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <sstream> +#include <string> +#include <vector> + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_nested.h" +#include "arrow/compute/function.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/reflection_internal.h" +#include "arrow/util/string.h" +#include "arrow/util/visibility.h" + +namespace arrow { +struct Scalar; +struct StructScalar; +using ::arrow::internal::checked_cast; +namespace compute { +namespace internal { +ARROW_EXPORT +Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar( + const FunctionOptions&); +ARROW_EXPORT +Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar( + const StructScalar&); + +template <typename T> +struct PropertyToString { + static std::string ToString(const T& value) { + std::stringstream ss; + ss << value; + return ss.str(); + } +}; Review comment: Instead of being trait structures, these should probably be free functions like ```suggestion template <typename T> static std::string GenericToString(const T& value) { std::stringstream ss; ss << value; return ss.str(); } ``` which will be less boilerplate ########## File path: cpp/src/arrow/compute/function_internal.h ########## @@ -0,0 +1,565 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <sstream> +#include <string> +#include <vector> + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_nested.h" +#include "arrow/compute/function.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/reflection_internal.h" +#include "arrow/util/string.h" +#include "arrow/util/visibility.h" + +namespace arrow { +struct Scalar; +struct StructScalar; +using ::arrow::internal::checked_cast; +namespace compute { +namespace internal { +ARROW_EXPORT +Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar( + const FunctionOptions&); +ARROW_EXPORT +Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar( + const StructScalar&); + +template <typename T> +struct PropertyToString { + static std::string ToString(const T& value) { + std::stringstream ss; + ss << value; + return ss.str(); + } +}; +template <> +struct PropertyToString<bool> { + static std::string ToString(bool value) { return value ? "true" : "false"; } +}; +template <> +struct PropertyToString<std::string> { + static std::string ToString(const std::string& value) { + std::stringstream ss; + ss << '"' << value << '"'; + return ss.str(); + } +}; +template <typename T> +struct PropertyToString<std::shared_ptr<T>> { + static std::string ToString(const std::shared_ptr<T>& value) { + std::stringstream ss; + return value ? value->ToString() : "<NULLPTR>"; + } +}; +template <> +struct PropertyToString<std::shared_ptr<Scalar>> { + static std::string ToString(const std::shared_ptr<Scalar>& value) { + std::stringstream ss; + ss << value->type->ToString() << ":" << value->ToString(); + return ss.str(); + } +}; +template <> +struct PropertyToString<std::shared_ptr<const KeyValueMetadata>> { + static std::string ToString(const std::shared_ptr<const KeyValueMetadata>& value) { + std::stringstream ss; + ss << "KeyValueMetadata{"; + if (value) { + bool first = true; + for (const auto& pair : value->sorted_pairs()) { + if (!first) ss << ", "; + first = false; + ss << pair.first << ':' << pair.second; + } + } + ss << '}'; + return ss.str(); + } +}; +template <> +struct PropertyToString<Datum> { + static std::string ToString(const Datum& value) { + switch (value.kind()) { + case Datum::NONE: + return "<NULL DATUM>"; + case Datum::SCALAR: + return PropertyToString<std::shared_ptr<Scalar>>::ToString(value.scalar()); + case Datum::ARRAY: { + std::stringstream ss; + ss << value.type()->ToString() << ':' << value.make_array()->ToString(); + return ss.str(); + } + case Datum::CHUNKED_ARRAY: + case Datum::RECORD_BATCH: + case Datum::TABLE: + case Datum::COLLECTION: + return value.ToString(); + } + return value.ToString(); + } +}; +template <typename T> +struct PropertyToString<std::vector<T>> { + static std::string ToString(const std::vector<T>& value) { + std::stringstream ss; + ss << "["; + bool first = true; + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis + for (auto it = value.begin(); it != value.end(); it++) { + if (!first) ss << ", "; + first = false; + ss << PropertyToString<T>::ToString(*it); + } + ss << ']'; + return ss.str(); + } +}; + +template <typename T> +struct PropertyEquals { + static bool Equals(const T& left, const T& right) { return left == right; } +}; +template <typename T> +struct PropertyEquals<std::shared_ptr<T>> { + static bool Equals(const std::shared_ptr<T>& left, const std::shared_ptr<T>& right) { + return (!left && !right) || (left && right && left->Equals(*right)); + } +}; +template <> +struct PropertyEquals<std::shared_ptr<const KeyValueMetadata>> { + static bool Equals(const std::shared_ptr<const KeyValueMetadata>& left, + const std::shared_ptr<const KeyValueMetadata>& right) { + // Special case since we serialize a null metadata into an empty one Review comment: ```suggestion // Special case since null metadata is considered equivalent to empty ``` ########## File path: cpp/src/arrow/compute/function_internal.h ########## @@ -0,0 +1,565 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <sstream> +#include <string> +#include <vector> + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_nested.h" +#include "arrow/compute/function.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/reflection_internal.h" +#include "arrow/util/string.h" +#include "arrow/util/visibility.h" + +namespace arrow { +struct Scalar; +struct StructScalar; +using ::arrow::internal::checked_cast; +namespace compute { +namespace internal { +ARROW_EXPORT +Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar( + const FunctionOptions&); +ARROW_EXPORT +Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar( + const StructScalar&); + +template <typename T> +struct PropertyToString { + static std::string ToString(const T& value) { + std::stringstream ss; + ss << value; + return ss.str(); + } +}; +template <> +struct PropertyToString<bool> { + static std::string ToString(bool value) { return value ? "true" : "false"; } +}; +template <> +struct PropertyToString<std::string> { + static std::string ToString(const std::string& value) { + std::stringstream ss; + ss << '"' << value << '"'; + return ss.str(); + } +}; +template <typename T> +struct PropertyToString<std::shared_ptr<T>> { + static std::string ToString(const std::shared_ptr<T>& value) { + std::stringstream ss; + return value ? value->ToString() : "<NULLPTR>"; + } +}; +template <> +struct PropertyToString<std::shared_ptr<Scalar>> { + static std::string ToString(const std::shared_ptr<Scalar>& value) { + std::stringstream ss; + ss << value->type->ToString() << ":" << value->ToString(); + return ss.str(); + } +}; +template <> +struct PropertyToString<std::shared_ptr<const KeyValueMetadata>> { + static std::string ToString(const std::shared_ptr<const KeyValueMetadata>& value) { + std::stringstream ss; + ss << "KeyValueMetadata{"; + if (value) { + bool first = true; + for (const auto& pair : value->sorted_pairs()) { + if (!first) ss << ", "; + first = false; + ss << pair.first << ':' << pair.second; + } + } + ss << '}'; + return ss.str(); + } +}; +template <> +struct PropertyToString<Datum> { + static std::string ToString(const Datum& value) { + switch (value.kind()) { + case Datum::NONE: + return "<NULL DATUM>"; + case Datum::SCALAR: + return PropertyToString<std::shared_ptr<Scalar>>::ToString(value.scalar()); + case Datum::ARRAY: { + std::stringstream ss; + ss << value.type()->ToString() << ':' << value.make_array()->ToString(); + return ss.str(); + } + case Datum::CHUNKED_ARRAY: + case Datum::RECORD_BATCH: + case Datum::TABLE: + case Datum::COLLECTION: + return value.ToString(); + } + return value.ToString(); + } +}; +template <typename T> +struct PropertyToString<std::vector<T>> { + static std::string ToString(const std::vector<T>& value) { + std::stringstream ss; + ss << "["; + bool first = true; + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis + for (auto it = value.begin(); it != value.end(); it++) { + if (!first) ss << ", "; + first = false; + ss << PropertyToString<T>::ToString(*it); + } + ss << ']'; + return ss.str(); + } +}; + +template <typename T> +struct PropertyEquals { + static bool Equals(const T& left, const T& right) { return left == right; } +}; +template <typename T> +struct PropertyEquals<std::shared_ptr<T>> { + static bool Equals(const std::shared_ptr<T>& left, const std::shared_ptr<T>& right) { + return (!left && !right) || (left && right && left->Equals(*right)); + } +}; +template <> +struct PropertyEquals<std::shared_ptr<const KeyValueMetadata>> { + static bool Equals(const std::shared_ptr<const KeyValueMetadata>& left, + const std::shared_ptr<const KeyValueMetadata>& right) { + // Special case since we serialize a null metadata into an empty one + return (!left && !right) || (left && right && left->Equals(*right)) || + (!left && right && right->size() == 0) || + (left && !right && left->size() == 0); + } +}; +template <typename T> +struct PropertyEquals<std::vector<T>> { + static bool Equals(const std::vector<T>& left, const std::vector<T>& right) { + if (left.size() != right.size()) return false; + for (size_t i = 0; i < left.size(); i++) { + if (!PropertyEquals<T>::Equals(left[i], right[i])) return false; + } + return true; + } +}; + +template <typename T, typename Enable = void> +struct PropertyToScalar { + static Result<std::shared_ptr<Scalar>> ToScalar(const T& value) { + return Status::NotImplemented("Cannot serialize field"); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; +template <typename T> +struct PropertyToScalar< + T, arrow::internal::void_t<decltype(MakeScalar(std::declval<T>()))>> { + static Result<std::shared_ptr<Scalar>> ToScalar(const T& value) { + return MakeScalar(value); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton(); + } +}; +template <typename T> +struct PropertyToScalar<std::vector<T>> { + static Result<std::shared_ptr<Scalar>> ToScalar(const std::vector<T>& value) { + std::shared_ptr<DataType> type; + if (value.empty()) { + ARROW_ASSIGN_OR_RAISE(type, PropertyToScalar<T>::GetTypeSingleton()); + } + std::vector<std::shared_ptr<Scalar>> scalars; + scalars.reserve(value.size()); + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis + for (auto it = value.begin(); it != value.end(); it++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, PropertyToScalar<T>::ToScalar(*it)); + scalars.push_back(std::move(scalar)); + } + std::unique_ptr<ArrayBuilder> builder; + RETURN_NOT_OK( + MakeBuilder(default_memory_pool(), type ? type : scalars[0]->type, &builder)); + RETURN_NOT_OK(builder->AppendScalars(scalars)); + std::shared_ptr<Array> out; + RETURN_NOT_OK(builder->Finish(&out)); + return std::make_shared<ListScalar>(std::move(out)); + } +}; +template <> +struct PropertyToScalar<std::shared_ptr<const KeyValueMetadata>> { + static Result<std::shared_ptr<Scalar>> ToScalar( + const std::shared_ptr<const KeyValueMetadata>& value) { + ARROW_ASSIGN_OR_RAISE(auto ty, GetTypeSingleton()); + std::unique_ptr<ArrayBuilder> builder; + RETURN_NOT_OK(MakeBuilder(default_memory_pool(), ty, &builder)); + auto* map_builder = checked_cast<MapBuilder*>(builder.get()); + auto* key_builder = checked_cast<BinaryBuilder*>(map_builder->key_builder()); + auto* item_builder = checked_cast<BinaryBuilder*>(map_builder->item_builder()); + RETURN_NOT_OK(map_builder->Append()); + if (value) { + RETURN_NOT_OK(key_builder->AppendValues(value->keys())); + RETURN_NOT_OK(item_builder->AppendValues(value->values())); + } + std::shared_ptr<Array> arr; + RETURN_NOT_OK(map_builder->Finish(&arr)); + return arr->GetScalar(0); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return map(binary(), binary()); + } +}; +template <> +struct PropertyToScalar<std::shared_ptr<DataType>> { + static Result<std::shared_ptr<Scalar>> ToScalar( + const std::shared_ptr<DataType>& value) { + if (!value) { + return Status::Invalid("shared_ptr<DataType> is nullptr"); + } + return MakeNullScalar(value); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; +template <> +struct PropertyToScalar<std::shared_ptr<Scalar>> { + static Result<std::shared_ptr<Scalar>> ToScalar(const std::shared_ptr<Scalar>& value) { + return value; + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; +template <> +struct PropertyToScalar<std::shared_ptr<Array>> { + static Result<std::shared_ptr<Scalar>> ToScalar(const std::shared_ptr<Array>& value) { + return std::make_shared<ListScalar>(value); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; +template <> +struct PropertyToScalar<Datum> { + static Result<std::shared_ptr<Scalar>> ToScalar(const Datum& value) { + // TODO(ARROW-9434): store in a union instead. + switch (value.kind()) { + case Datum::ARRAY: + return PropertyToScalar<std::shared_ptr<Array>>::ToScalar(value.make_array()); + break; + default: + return Status::NotImplemented("Cannot serialize Datum kind ", value.kind()); + } + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; + +template <typename T, typename Enable = void> +struct PropertyFromScalar { + static Result<T> FromScalar(const std::shared_ptr<Scalar>& value) { + return Status::NotImplemented("Cannot deserialize field"); + } +}; +template <typename T> +struct PropertyFromScalar<T, + enable_if_primitive_ctype<typename CTypeTraits<T>::ArrowType>> { + using ArrowType = typename CTypeTraits<T>::ArrowType; + using ScalarType = typename TypeTraits<ArrowType>::ScalarType; + static Result<T> FromScalar(const std::shared_ptr<Scalar>& value) { + if (value->type->id() != ArrowType::type_id) { + return Status::Invalid("Expected type ", ArrowType::type_id, " but got ", + value->type->ToString()); + } + const auto& holder = checked_cast<const ScalarType&>(*value); + if (!holder.is_valid) return Status::Invalid("Got null scalar"); + return holder.value; + } +}; +template <> +struct PropertyFromScalar<std::string> { + static Result<std::string> FromScalar(const std::shared_ptr<Scalar>& value) { + if (!is_base_binary_like(value->type->id())) { + return Status::Invalid("Expected binary-like type but got ", + value->type->ToString()); + } + const auto& holder = checked_cast<const BaseBinaryScalar&>(*value); + if (!holder.is_valid) return Status::Invalid("Got null scalar"); + return holder.value->ToString(); + } +}; +template <> +struct PropertyFromScalar<std::shared_ptr<DataType>> { + static Result<std::shared_ptr<DataType>> FromScalar( + const std::shared_ptr<Scalar>& value) { + return value->type; + } +}; +template <> +struct PropertyFromScalar<std::shared_ptr<Scalar>> { + static Result<std::shared_ptr<Scalar>> FromScalar( + const std::shared_ptr<Scalar>& value) { + return value; + } +}; +template <> +struct PropertyFromScalar<std::shared_ptr<const KeyValueMetadata>> { + static Result<std::shared_ptr<const KeyValueMetadata>> FromScalar( + const std::shared_ptr<Scalar>& value) { + ARROW_ASSIGN_OR_RAISE( + const auto ty, + PropertyToScalar<std::shared_ptr<const KeyValueMetadata>>::GetTypeSingleton()); + if (!value->type->Equals(ty)) { + return Status::Invalid("Expected ", ty->ToString(), " but got ", + value->type->ToString()); + } + const auto& holder = checked_cast<const MapScalar&>(*value); + std::vector<std::string> keys; + std::vector<std::string> values; + const auto& list = checked_cast<const StructArray&>(*holder.value); + const auto& key_arr = checked_cast<const BinaryArray&>(*list.field(0)); + const auto& value_arr = checked_cast<const BinaryArray&>(*list.field(1)); + for (int64_t i = 0; i < list.length(); i++) { + keys.push_back(key_arr.GetString(i)); + values.push_back(value_arr.GetString(i)); + } + return key_value_metadata(std::move(keys), std::move(values)); + } +}; +template <> +struct PropertyFromScalar<Datum> { + static Result<Datum> FromScalar(const std::shared_ptr<Scalar>& value) { + if (value->type->id() == Type::LIST) { + const auto& holder = checked_cast<const BaseListScalar&>(*value); + return holder.value; + } + // TODO(ARROW-9434): handle other possible datum kinds by looking for a union + return Status::Invalid("Cannot deserialize Datum from ", value->ToString()); + } +}; + +template <typename T> +struct PropertyFromScalar<std::vector<T>> { + static Result<std::vector<T>> FromScalar(const std::shared_ptr<Scalar>& value) { + if (value->type->id() != Type::LIST) { + return Status::Invalid("Expected type LIST but got ", value->type->ToString()); + } + const auto& holder = checked_cast<const BaseListScalar&>(*value); + if (!holder.is_valid) return Status::Invalid("Got null scalar"); + std::vector<T> result; + for (int i = 0; i < holder.value->length(); i++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, holder.value->GetScalar(i)); + ARROW_ASSIGN_OR_RAISE(auto v, PropertyFromScalar<T>::FromScalar(scalar)); + result.push_back(std::move(v)); + } + return result; + } +}; + +// Helper to quickly define serialization for a C-style enum +#define MAKE_ENUM_PROPERTY(ENUM, TYPE, MIN, MAX) \ Review comment: Instead, let's add some enum utilities to `reflection_internal.h`: ```c++ // reflection_internal.h template <typename Enum> struct EnumTraits; template <typename Enum, typename CType = typename std::underlying_type<Enum>::type> Status ValidateEnumValue(CType raw) { for (auto valid : EnumTraits<Enum>::kValues) { if (raw == static_cast<CType>(valid)) return Status::OK(); } return Status::Invalid("Invalid value for ", EnumTraits<Enum>::kName, ": ", raw); } template <typename Enum, Enum... E> struct BasicEnumTraits { static const std::array<Enum, sizeof...(E)> kValues = {E...}; }; template <typename Enum, typename CType = typename std::underlying_type<Enum>::type, typename Type = typename CTypeTraits<T>::ArrowType> static std::shared_ptr<DataType> UnderlyingType() { return TypeTraits<Type>::type_singleton(); } template <> struct EnumTraits<JoinOptions::NullHandlingBehavior> : BasicEnumTraits<JoinOptions::NullHandlingBehavior, JoinOptions::NullHandlingBehavior::EMIT_NULL, JoinOptions::NullHandlingBehavior::SKIP, JoinOptions::NullHandlingBehavior::REPLACE> { static constexpr char kName[] = "JoinOptions::NullHandlingBehavior"; }; ``` ########## File path: cpp/src/arrow/compute/api_aggregate.cc ########## @@ -17,11 +17,112 @@ #include "arrow/compute/api_aggregate.h" +#include <sstream> Review comment: ```suggestion ``` ########## File path: cpp/src/arrow/compute/function_internal.h ########## @@ -0,0 +1,565 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <sstream> +#include <string> +#include <vector> + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_nested.h" +#include "arrow/compute/function.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/reflection_internal.h" +#include "arrow/util/string.h" +#include "arrow/util/visibility.h" + +namespace arrow { +struct Scalar; +struct StructScalar; +using ::arrow::internal::checked_cast; +namespace compute { +namespace internal { +ARROW_EXPORT +Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar( + const FunctionOptions&); +ARROW_EXPORT +Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar( + const StructScalar&); + +template <typename T> +struct PropertyToString { + static std::string ToString(const T& value) { + std::stringstream ss; + ss << value; + return ss.str(); + } +}; +template <> +struct PropertyToString<bool> { + static std::string ToString(bool value) { return value ? "true" : "false"; } +}; +template <> +struct PropertyToString<std::string> { + static std::string ToString(const std::string& value) { + std::stringstream ss; + ss << '"' << value << '"'; + return ss.str(); + } +}; +template <typename T> +struct PropertyToString<std::shared_ptr<T>> { + static std::string ToString(const std::shared_ptr<T>& value) { + std::stringstream ss; + return value ? value->ToString() : "<NULLPTR>"; + } +}; +template <> +struct PropertyToString<std::shared_ptr<Scalar>> { + static std::string ToString(const std::shared_ptr<Scalar>& value) { + std::stringstream ss; + ss << value->type->ToString() << ":" << value->ToString(); + return ss.str(); + } +}; +template <> +struct PropertyToString<std::shared_ptr<const KeyValueMetadata>> { + static std::string ToString(const std::shared_ptr<const KeyValueMetadata>& value) { + std::stringstream ss; + ss << "KeyValueMetadata{"; + if (value) { + bool first = true; + for (const auto& pair : value->sorted_pairs()) { + if (!first) ss << ", "; + first = false; + ss << pair.first << ':' << pair.second; + } + } + ss << '}'; + return ss.str(); + } +}; +template <> +struct PropertyToString<Datum> { + static std::string ToString(const Datum& value) { + switch (value.kind()) { + case Datum::NONE: + return "<NULL DATUM>"; + case Datum::SCALAR: + return PropertyToString<std::shared_ptr<Scalar>>::ToString(value.scalar()); + case Datum::ARRAY: { + std::stringstream ss; + ss << value.type()->ToString() << ':' << value.make_array()->ToString(); + return ss.str(); + } + case Datum::CHUNKED_ARRAY: + case Datum::RECORD_BATCH: + case Datum::TABLE: + case Datum::COLLECTION: + return value.ToString(); + } + return value.ToString(); + } +}; +template <typename T> +struct PropertyToString<std::vector<T>> { + static std::string ToString(const std::vector<T>& value) { + std::stringstream ss; + ss << "["; + bool first = true; + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis + for (auto it = value.begin(); it != value.end(); it++) { + if (!first) ss << ", "; + first = false; + ss << PropertyToString<T>::ToString(*it); + } + ss << ']'; + return ss.str(); + } +}; + +template <typename T> +struct PropertyEquals { + static bool Equals(const T& left, const T& right) { return left == right; } +}; +template <typename T> +struct PropertyEquals<std::shared_ptr<T>> { + static bool Equals(const std::shared_ptr<T>& left, const std::shared_ptr<T>& right) { + return (!left && !right) || (left && right && left->Equals(*right)); Review comment: ```suggestion if (left && right) { return left->Equals(*right); } return left == right; ``` ########## File path: cpp/src/arrow/compute/function_test.cc ########## @@ -21,16 +21,110 @@ #include <gtest/gtest.h> +#include "arrow/compute/api_aggregate.h" +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/cast.h" #include "arrow/compute/function.h" #include "arrow/compute/kernel.h" #include "arrow/datum.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" +#include "arrow/util/key_value_metadata.h" namespace arrow { namespace compute { +TEST(FunctionOptions, Equality) { + std::vector<std::shared_ptr<FunctionOptions>> options; + options.emplace_back(new ScalarAggregateOptions()); + options.emplace_back(new ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/1)); + options.emplace_back(new ModeOptions()); + options.emplace_back(new ModeOptions(/*n=*/2)); + options.emplace_back(new VarianceOptions()); + options.emplace_back(new VarianceOptions(/*ddof=*/2)); + options.emplace_back(new QuantileOptions()); + options.emplace_back( + new QuantileOptions(/*q=*/0.75, QuantileOptions::Interpolation::MIDPOINT)); + options.emplace_back(new TDigestOptions()); + options.emplace_back( + new TDigestOptions(/*q=*/0.75, /*delta=*/50, /*buffer_size=*/1024)); + options.emplace_back(new IndexOptions(ScalarFromJSON(int64(), "16"))); + options.emplace_back(new IndexOptions(ScalarFromJSON(boolean(), "true"))); + options.emplace_back(new IndexOptions(ScalarFromJSON(boolean(), "null"))); + options.emplace_back(new ArithmeticOptions()); + options.emplace_back(new ArithmeticOptions(/*check_overflow=*/true)); + options.emplace_back(new ElementWiseAggregateOptions()); + options.emplace_back(new ElementWiseAggregateOptions(/*skip_nulls=*/false)); + options.emplace_back(new JoinOptions()); + options.emplace_back(new JoinOptions(JoinOptions::REPLACE, "replacement")); + options.emplace_back(new MatchSubstringOptions("pattern")); + options.emplace_back(new MatchSubstringOptions("pattern", /*ignore_case=*/true)); + options.emplace_back(new SplitOptions()); + options.emplace_back(new SplitOptions(/*max_splits=*/2, /*reverse=*/true)); + options.emplace_back(new SplitPatternOptions("pattern")); + options.emplace_back( + new SplitPatternOptions("pattern", /*max_splits=*/2, /*reverse=*/true)); + options.emplace_back(new ReplaceSubstringOptions("pattern", "replacement")); + options.emplace_back( + new ReplaceSubstringOptions("pattern", "replacement", /*max_replacements=*/2)); + options.emplace_back(new ReplaceSliceOptions(0, 1, "foo")); + options.emplace_back(new ReplaceSliceOptions(1, -1, "bar")); + options.emplace_back(new ExtractRegexOptions("pattern")); + options.emplace_back(new ExtractRegexOptions("pattern2")); + options.emplace_back(new SetLookupOptions(ArrayFromJSON(int64(), "[1, 2, 3, 4]"))); + options.emplace_back(new SetLookupOptions(ArrayFromJSON(boolean(), "[true, false]"))); + options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::MILLI)); + options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::NANO)); + options.emplace_back(new TrimOptions(" ")); + options.emplace_back(new TrimOptions("abc")); + options.emplace_back(new SliceOptions(/*start=*/1)); + options.emplace_back(new SliceOptions(/*start=*/1, /*stop=*/-5, /*step=*/-2)); + options.emplace_back(new CompareOptions(CompareOperator::EQUAL)); + options.emplace_back(new CompareOptions(CompareOperator::NOT_EQUAL)); + // N.B. we never actually use field_nullability or field_metadata in Arrow + options.emplace_back(new ProjectOptions({"col1"}, {true}, {})); + options.emplace_back(new ProjectOptions({"col1"}, {false}, {})); + options.emplace_back( + new ProjectOptions({"col1"}, {false}, {key_value_metadata({{"key", "val"}})})); + options.emplace_back(new CastOptions(CastOptions::Safe(boolean()))); + options.emplace_back(new CastOptions(CastOptions::Unsafe(int64()))); + options.emplace_back(new FilterOptions()); + options.emplace_back( + new FilterOptions(FilterOptions::NullSelectionBehavior::EMIT_NULL)); + options.emplace_back(new TakeOptions()); + options.emplace_back(new TakeOptions(/*boundscheck=*/false)); + options.emplace_back(new DictionaryEncodeOptions()); + options.emplace_back( + new DictionaryEncodeOptions(DictionaryEncodeOptions::NullEncodingBehavior::ENCODE)); + options.emplace_back(new ArraySortOptions()); + options.emplace_back(new ArraySortOptions(SortOrder::Descending)); + options.emplace_back(new SortOptions()); + options.emplace_back(new SortOptions({SortKey("key", SortOrder::Ascending)})); + options.emplace_back(new SortOptions({SortKey("key", SortOrder::Descending)})); + options.emplace_back(new PartitionNthOptions(/*pivot=*/0)); + options.emplace_back(new PartitionNthOptions(/*pivot=*/42)); + + for (size_t i = 0; i < options.size(); i++) { + const size_t prev_i = i == 0 ? options.size() - 1 : i - 1; + const FunctionOptions& cur = *options[i]; + const FunctionOptions& prev = *options[prev_i]; + SCOPED_TRACE(cur.options_type()->type_name()); + SCOPED_TRACE(cur.ToString()); + ASSERT_TRUE(cur == cur); Review comment: ```suggestion ASSERT_EQ(cur, cur); ``` ########## File path: cpp/src/arrow/compute/function.h ########## @@ -39,12 +40,49 @@ namespace compute { /// /// @{ +/// \brief Extension point for defining options outside libarrow (but +/// still within this project). +class ARROW_EXPORT FunctionOptionsType { + public: + virtual ~FunctionOptionsType() = default; + + virtual const char* type_name() const = 0; + virtual std::string Stringify(const FunctionOptions&) const = 0; + virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0; + /// \brief Convert this options struct into a Struct scalar. + virtual Status ToStructScalar(const FunctionOptions&, + std::vector<std::string>* field_names, + std::vector<std::shared_ptr<Scalar>>* values) const; Review comment: Again, the public methods should be serialize/deserialize and from/to struct scalar can be details of `GenericOptionsType` ########## File path: cpp/src/arrow/compute/function_internal.h ########## @@ -0,0 +1,565 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <sstream> +#include <string> +#include <vector> + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_nested.h" +#include "arrow/compute/function.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/reflection_internal.h" +#include "arrow/util/string.h" +#include "arrow/util/visibility.h" + +namespace arrow { +struct Scalar; +struct StructScalar; +using ::arrow::internal::checked_cast; +namespace compute { +namespace internal { +ARROW_EXPORT +Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar( + const FunctionOptions&); +ARROW_EXPORT +Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar( + const StructScalar&); + +template <typename T> +struct PropertyToString { + static std::string ToString(const T& value) { + std::stringstream ss; + ss << value; + return ss.str(); + } +}; +template <> +struct PropertyToString<bool> { + static std::string ToString(bool value) { return value ? "true" : "false"; } +}; +template <> +struct PropertyToString<std::string> { + static std::string ToString(const std::string& value) { + std::stringstream ss; + ss << '"' << value << '"'; + return ss.str(); + } +}; +template <typename T> +struct PropertyToString<std::shared_ptr<T>> { + static std::string ToString(const std::shared_ptr<T>& value) { + std::stringstream ss; + return value ? value->ToString() : "<NULLPTR>"; + } +}; +template <> +struct PropertyToString<std::shared_ptr<Scalar>> { + static std::string ToString(const std::shared_ptr<Scalar>& value) { + std::stringstream ss; + ss << value->type->ToString() << ":" << value->ToString(); + return ss.str(); + } +}; +template <> +struct PropertyToString<std::shared_ptr<const KeyValueMetadata>> { + static std::string ToString(const std::shared_ptr<const KeyValueMetadata>& value) { + std::stringstream ss; + ss << "KeyValueMetadata{"; + if (value) { + bool first = true; + for (const auto& pair : value->sorted_pairs()) { + if (!first) ss << ", "; + first = false; + ss << pair.first << ':' << pair.second; + } + } + ss << '}'; + return ss.str(); + } +}; +template <> +struct PropertyToString<Datum> { + static std::string ToString(const Datum& value) { + switch (value.kind()) { + case Datum::NONE: + return "<NULL DATUM>"; + case Datum::SCALAR: + return PropertyToString<std::shared_ptr<Scalar>>::ToString(value.scalar()); + case Datum::ARRAY: { + std::stringstream ss; + ss << value.type()->ToString() << ':' << value.make_array()->ToString(); + return ss.str(); + } + case Datum::CHUNKED_ARRAY: + case Datum::RECORD_BATCH: + case Datum::TABLE: + case Datum::COLLECTION: + return value.ToString(); + } + return value.ToString(); + } +}; +template <typename T> +struct PropertyToString<std::vector<T>> { + static std::string ToString(const std::vector<T>& value) { + std::stringstream ss; + ss << "["; + bool first = true; + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis + for (auto it = value.begin(); it != value.end(); it++) { + if (!first) ss << ", "; + first = false; + ss << PropertyToString<T>::ToString(*it); + } + ss << ']'; + return ss.str(); + } +}; + +template <typename T> +struct PropertyEquals { + static bool Equals(const T& left, const T& right) { return left == right; } +}; +template <typename T> +struct PropertyEquals<std::shared_ptr<T>> { + static bool Equals(const std::shared_ptr<T>& left, const std::shared_ptr<T>& right) { + return (!left && !right) || (left && right && left->Equals(*right)); + } +}; +template <> +struct PropertyEquals<std::shared_ptr<const KeyValueMetadata>> { + static bool Equals(const std::shared_ptr<const KeyValueMetadata>& left, + const std::shared_ptr<const KeyValueMetadata>& right) { + // Special case since we serialize a null metadata into an empty one + return (!left && !right) || (left && right && left->Equals(*right)) || + (!left && right && right->size() == 0) || + (left && !right && left->size() == 0); + } Review comment: ```suggestion if (IsEmpty(left) || IsEmpty(right)) { return IsEmpty(left) && IsEmpty(right); } return left->Equals(*right); } static bool IsEmpty(const std::shared_ptr<const KeyValueMetadata>& meta) { return meta == nullptr || meta->size() == 0; } ``` ########## File path: cpp/src/arrow/compute/function_internal.h ########## @@ -0,0 +1,565 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <sstream> +#include <string> +#include <vector> + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_nested.h" +#include "arrow/compute/function.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/reflection_internal.h" +#include "arrow/util/string.h" +#include "arrow/util/visibility.h" + +namespace arrow { +struct Scalar; +struct StructScalar; +using ::arrow::internal::checked_cast; +namespace compute { +namespace internal { +ARROW_EXPORT +Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar( + const FunctionOptions&); +ARROW_EXPORT +Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar( + const StructScalar&); + +template <typename T> +struct PropertyToString { + static std::string ToString(const T& value) { + std::stringstream ss; + ss << value; + return ss.str(); + } +}; +template <> +struct PropertyToString<bool> { + static std::string ToString(bool value) { return value ? "true" : "false"; } +}; +template <> +struct PropertyToString<std::string> { + static std::string ToString(const std::string& value) { + std::stringstream ss; + ss << '"' << value << '"'; + return ss.str(); + } +}; +template <typename T> +struct PropertyToString<std::shared_ptr<T>> { + static std::string ToString(const std::shared_ptr<T>& value) { + std::stringstream ss; + return value ? value->ToString() : "<NULLPTR>"; + } +}; +template <> +struct PropertyToString<std::shared_ptr<Scalar>> { + static std::string ToString(const std::shared_ptr<Scalar>& value) { + std::stringstream ss; + ss << value->type->ToString() << ":" << value->ToString(); + return ss.str(); + } +}; +template <> +struct PropertyToString<std::shared_ptr<const KeyValueMetadata>> { + static std::string ToString(const std::shared_ptr<const KeyValueMetadata>& value) { + std::stringstream ss; + ss << "KeyValueMetadata{"; + if (value) { + bool first = true; + for (const auto& pair : value->sorted_pairs()) { + if (!first) ss << ", "; + first = false; + ss << pair.first << ':' << pair.second; + } + } + ss << '}'; + return ss.str(); + } +}; +template <> +struct PropertyToString<Datum> { + static std::string ToString(const Datum& value) { + switch (value.kind()) { + case Datum::NONE: + return "<NULL DATUM>"; + case Datum::SCALAR: + return PropertyToString<std::shared_ptr<Scalar>>::ToString(value.scalar()); + case Datum::ARRAY: { + std::stringstream ss; + ss << value.type()->ToString() << ':' << value.make_array()->ToString(); + return ss.str(); + } + case Datum::CHUNKED_ARRAY: + case Datum::RECORD_BATCH: + case Datum::TABLE: + case Datum::COLLECTION: + return value.ToString(); + } + return value.ToString(); + } +}; +template <typename T> +struct PropertyToString<std::vector<T>> { + static std::string ToString(const std::vector<T>& value) { + std::stringstream ss; + ss << "["; + bool first = true; + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis Review comment: I'm not sure why this needed to be avoided; shouldn't it be alright to use `const auto&`? ########## File path: cpp/src/arrow/compute/function_internal.h ########## @@ -0,0 +1,565 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <sstream> +#include <string> +#include <vector> + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_nested.h" +#include "arrow/compute/function.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/reflection_internal.h" +#include "arrow/util/string.h" +#include "arrow/util/visibility.h" + +namespace arrow { +struct Scalar; +struct StructScalar; +using ::arrow::internal::checked_cast; +namespace compute { +namespace internal { +ARROW_EXPORT +Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar( + const FunctionOptions&); +ARROW_EXPORT +Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar( + const StructScalar&); + +template <typename T> +struct PropertyToString { + static std::string ToString(const T& value) { + std::stringstream ss; + ss << value; + return ss.str(); + } +}; +template <> +struct PropertyToString<bool> { + static std::string ToString(bool value) { return value ? "true" : "false"; } +}; +template <> +struct PropertyToString<std::string> { + static std::string ToString(const std::string& value) { + std::stringstream ss; + ss << '"' << value << '"'; + return ss.str(); + } +}; +template <typename T> +struct PropertyToString<std::shared_ptr<T>> { + static std::string ToString(const std::shared_ptr<T>& value) { + std::stringstream ss; + return value ? value->ToString() : "<NULLPTR>"; + } +}; +template <> +struct PropertyToString<std::shared_ptr<Scalar>> { + static std::string ToString(const std::shared_ptr<Scalar>& value) { + std::stringstream ss; + ss << value->type->ToString() << ":" << value->ToString(); + return ss.str(); + } +}; +template <> +struct PropertyToString<std::shared_ptr<const KeyValueMetadata>> { + static std::string ToString(const std::shared_ptr<const KeyValueMetadata>& value) { + std::stringstream ss; + ss << "KeyValueMetadata{"; + if (value) { + bool first = true; + for (const auto& pair : value->sorted_pairs()) { + if (!first) ss << ", "; + first = false; + ss << pair.first << ':' << pair.second; + } + } + ss << '}'; + return ss.str(); + } +}; +template <> +struct PropertyToString<Datum> { + static std::string ToString(const Datum& value) { + switch (value.kind()) { + case Datum::NONE: + return "<NULL DATUM>"; + case Datum::SCALAR: + return PropertyToString<std::shared_ptr<Scalar>>::ToString(value.scalar()); + case Datum::ARRAY: { + std::stringstream ss; + ss << value.type()->ToString() << ':' << value.make_array()->ToString(); + return ss.str(); + } + case Datum::CHUNKED_ARRAY: + case Datum::RECORD_BATCH: + case Datum::TABLE: + case Datum::COLLECTION: + return value.ToString(); + } + return value.ToString(); + } +}; +template <typename T> +struct PropertyToString<std::vector<T>> { + static std::string ToString(const std::vector<T>& value) { + std::stringstream ss; + ss << "["; + bool first = true; + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis + for (auto it = value.begin(); it != value.end(); it++) { + if (!first) ss << ", "; + first = false; + ss << PropertyToString<T>::ToString(*it); + } + ss << ']'; + return ss.str(); + } +}; + +template <typename T> +struct PropertyEquals { + static bool Equals(const T& left, const T& right) { return left == right; } +}; +template <typename T> +struct PropertyEquals<std::shared_ptr<T>> { + static bool Equals(const std::shared_ptr<T>& left, const std::shared_ptr<T>& right) { + return (!left && !right) || (left && right && left->Equals(*right)); + } +}; +template <> +struct PropertyEquals<std::shared_ptr<const KeyValueMetadata>> { + static bool Equals(const std::shared_ptr<const KeyValueMetadata>& left, + const std::shared_ptr<const KeyValueMetadata>& right) { + // Special case since we serialize a null metadata into an empty one + return (!left && !right) || (left && right && left->Equals(*right)) || + (!left && right && right->size() == 0) || + (left && !right && left->size() == 0); + } +}; +template <typename T> +struct PropertyEquals<std::vector<T>> { + static bool Equals(const std::vector<T>& left, const std::vector<T>& right) { + if (left.size() != right.size()) return false; + for (size_t i = 0; i < left.size(); i++) { + if (!PropertyEquals<T>::Equals(left[i], right[i])) return false; + } + return true; + } +}; + +template <typename T, typename Enable = void> +struct PropertyToScalar { + static Result<std::shared_ptr<Scalar>> ToScalar(const T& value) { + return Status::NotImplemented("Cannot serialize field"); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; +template <typename T> +struct PropertyToScalar< + T, arrow::internal::void_t<decltype(MakeScalar(std::declval<T>()))>> { + static Result<std::shared_ptr<Scalar>> ToScalar(const T& value) { + return MakeScalar(value); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton(); + } +}; +template <typename T> +struct PropertyToScalar<std::vector<T>> { + static Result<std::shared_ptr<Scalar>> ToScalar(const std::vector<T>& value) { + std::shared_ptr<DataType> type; + if (value.empty()) { + ARROW_ASSIGN_OR_RAISE(type, PropertyToScalar<T>::GetTypeSingleton()); + } + std::vector<std::shared_ptr<Scalar>> scalars; + scalars.reserve(value.size()); + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis + for (auto it = value.begin(); it != value.end(); it++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, PropertyToScalar<T>::ToScalar(*it)); + scalars.push_back(std::move(scalar)); + } + std::unique_ptr<ArrayBuilder> builder; + RETURN_NOT_OK( + MakeBuilder(default_memory_pool(), type ? type : scalars[0]->type, &builder)); + RETURN_NOT_OK(builder->AppendScalars(scalars)); + std::shared_ptr<Array> out; + RETURN_NOT_OK(builder->Finish(&out)); + return std::make_shared<ListScalar>(std::move(out)); + } +}; +template <> +struct PropertyToScalar<std::shared_ptr<const KeyValueMetadata>> { + static Result<std::shared_ptr<Scalar>> ToScalar( + const std::shared_ptr<const KeyValueMetadata>& value) { + ARROW_ASSIGN_OR_RAISE(auto ty, GetTypeSingleton()); + std::unique_ptr<ArrayBuilder> builder; + RETURN_NOT_OK(MakeBuilder(default_memory_pool(), ty, &builder)); + auto* map_builder = checked_cast<MapBuilder*>(builder.get()); + auto* key_builder = checked_cast<BinaryBuilder*>(map_builder->key_builder()); + auto* item_builder = checked_cast<BinaryBuilder*>(map_builder->item_builder()); + RETURN_NOT_OK(map_builder->Append()); + if (value) { + RETURN_NOT_OK(key_builder->AppendValues(value->keys())); + RETURN_NOT_OK(item_builder->AppendValues(value->values())); + } + std::shared_ptr<Array> arr; + RETURN_NOT_OK(map_builder->Finish(&arr)); + return arr->GetScalar(0); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return map(binary(), binary()); + } +}; +template <> +struct PropertyToScalar<std::shared_ptr<DataType>> { + static Result<std::shared_ptr<Scalar>> ToScalar( + const std::shared_ptr<DataType>& value) { + if (!value) { + return Status::Invalid("shared_ptr<DataType> is nullptr"); + } + return MakeNullScalar(value); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; +template <> +struct PropertyToScalar<std::shared_ptr<Scalar>> { + static Result<std::shared_ptr<Scalar>> ToScalar(const std::shared_ptr<Scalar>& value) { + return value; + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; +template <> +struct PropertyToScalar<std::shared_ptr<Array>> { + static Result<std::shared_ptr<Scalar>> ToScalar(const std::shared_ptr<Array>& value) { + return std::make_shared<ListScalar>(value); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; +template <> +struct PropertyToScalar<Datum> { + static Result<std::shared_ptr<Scalar>> ToScalar(const Datum& value) { + // TODO(ARROW-9434): store in a union instead. + switch (value.kind()) { + case Datum::ARRAY: + return PropertyToScalar<std::shared_ptr<Array>>::ToScalar(value.make_array()); + break; + default: + return Status::NotImplemented("Cannot serialize Datum kind ", value.kind()); + } + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; + +template <typename T, typename Enable = void> +struct PropertyFromScalar { + static Result<T> FromScalar(const std::shared_ptr<Scalar>& value) { + return Status::NotImplemented("Cannot deserialize field"); + } +}; +template <typename T> +struct PropertyFromScalar<T, + enable_if_primitive_ctype<typename CTypeTraits<T>::ArrowType>> { + using ArrowType = typename CTypeTraits<T>::ArrowType; + using ScalarType = typename TypeTraits<ArrowType>::ScalarType; + static Result<T> FromScalar(const std::shared_ptr<Scalar>& value) { + if (value->type->id() != ArrowType::type_id) { + return Status::Invalid("Expected type ", ArrowType::type_id, " but got ", + value->type->ToString()); + } + const auto& holder = checked_cast<const ScalarType&>(*value); + if (!holder.is_valid) return Status::Invalid("Got null scalar"); + return holder.value; + } +}; +template <> +struct PropertyFromScalar<std::string> { + static Result<std::string> FromScalar(const std::shared_ptr<Scalar>& value) { + if (!is_base_binary_like(value->type->id())) { + return Status::Invalid("Expected binary-like type but got ", + value->type->ToString()); + } + const auto& holder = checked_cast<const BaseBinaryScalar&>(*value); + if (!holder.is_valid) return Status::Invalid("Got null scalar"); + return holder.value->ToString(); + } +}; +template <> +struct PropertyFromScalar<std::shared_ptr<DataType>> { + static Result<std::shared_ptr<DataType>> FromScalar( + const std::shared_ptr<Scalar>& value) { + return value->type; + } +}; +template <> +struct PropertyFromScalar<std::shared_ptr<Scalar>> { + static Result<std::shared_ptr<Scalar>> FromScalar( + const std::shared_ptr<Scalar>& value) { + return value; + } +}; +template <> +struct PropertyFromScalar<std::shared_ptr<const KeyValueMetadata>> { + static Result<std::shared_ptr<const KeyValueMetadata>> FromScalar( + const std::shared_ptr<Scalar>& value) { + ARROW_ASSIGN_OR_RAISE( + const auto ty, + PropertyToScalar<std::shared_ptr<const KeyValueMetadata>>::GetTypeSingleton()); + if (!value->type->Equals(ty)) { + return Status::Invalid("Expected ", ty->ToString(), " but got ", + value->type->ToString()); + } + const auto& holder = checked_cast<const MapScalar&>(*value); + std::vector<std::string> keys; + std::vector<std::string> values; + const auto& list = checked_cast<const StructArray&>(*holder.value); + const auto& key_arr = checked_cast<const BinaryArray&>(*list.field(0)); + const auto& value_arr = checked_cast<const BinaryArray&>(*list.field(1)); + for (int64_t i = 0; i < list.length(); i++) { + keys.push_back(key_arr.GetString(i)); + values.push_back(value_arr.GetString(i)); + } + return key_value_metadata(std::move(keys), std::move(values)); + } +}; +template <> +struct PropertyFromScalar<Datum> { + static Result<Datum> FromScalar(const std::shared_ptr<Scalar>& value) { + if (value->type->id() == Type::LIST) { + const auto& holder = checked_cast<const BaseListScalar&>(*value); + return holder.value; + } + // TODO(ARROW-9434): handle other possible datum kinds by looking for a union + return Status::Invalid("Cannot deserialize Datum from ", value->ToString()); + } +}; + +template <typename T> +struct PropertyFromScalar<std::vector<T>> { + static Result<std::vector<T>> FromScalar(const std::shared_ptr<Scalar>& value) { + if (value->type->id() != Type::LIST) { + return Status::Invalid("Expected type LIST but got ", value->type->ToString()); + } + const auto& holder = checked_cast<const BaseListScalar&>(*value); + if (!holder.is_valid) return Status::Invalid("Got null scalar"); + std::vector<T> result; + for (int i = 0; i < holder.value->length(); i++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, holder.value->GetScalar(i)); + ARROW_ASSIGN_OR_RAISE(auto v, PropertyFromScalar<T>::FromScalar(scalar)); + result.push_back(std::move(v)); + } + return result; + } +}; + +// Helper to quickly define serialization for a C-style enum +#define MAKE_ENUM_PROPERTY(ENUM, TYPE, MIN, MAX) \ + template <> \ + struct PropertyToScalar<ENUM> { \ + static Result<std::shared_ptr<Scalar>> ToScalar(const ENUM value) { \ + return PropertyToScalar<TYPE>::ToScalar(static_cast<TYPE>(value)); \ + } \ + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { \ + return PropertyToScalar<TYPE>::GetTypeSingleton(); \ + } \ + }; \ + template <> \ + struct PropertyFromScalar<ENUM> { \ + static Result<ENUM> FromScalar(const std::shared_ptr<Scalar>& value) { \ + ARROW_ASSIGN_OR_RAISE(auto raw_val, PropertyFromScalar<TYPE>::FromScalar(value)); \ + if (raw_val < static_cast<TYPE>(MIN) || raw_val > static_cast<TYPE>(MAX)) { \ + return Status::Invalid("Got invalid value for ", ARROW_STRINGIFY(ENUM), ": ", \ + raw_val); \ + } \ + return static_cast<ENUM>(raw_val); \ + } \ + }; + +#define MAKE_OPTIONS_TYPE(OPTIONS, CLASS, GETTER, PROPERTIES) \ + using CLASS = decltype(PROPERTIES)::Apply<GenericOptionsType, OPTIONS>::type; \ + const FunctionOptionsType* GETTER() { \ + static const CLASS instance(PROPERTIES); \ + return &instance; \ + } + Review comment: ```suggestion template <typename Options, typename... Properties> const FunctionOptionsType* GetFunctionOptionsType(const Properties&... properties) { static const struct : GenericOptionsType<Options, Properties...> { using GenericOptionsType<Options, Properties>::GenericOptionsType; } instance(arrow::internal::MakeProperties(properties...)); return &instance; } ``` (It'd probably be fine to inline `GenericOptionsType` there, actually.) Then this can be used later like so: ```c++ static auto kCastOptionsType = GetFunctionOptionsType<CastOptions>( arrow::internal::DataMember("to_type", &CastOptions::to_type), arrow::internal::DataMember("allow_int_overflow", &CastOptions::allow_int_overflow), /*...*/); } // namespace void RegisterScalarCast(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::make_shared<CastMetaFunction>())); DCHECK_OK(registry->AddFunctionOptionsType(kCastOptionsType)); } } // namespace internal CastOptions::CastOptions(bool safe) : FunctionOptions(kCastOptionsType), allow_int_overflow(!safe), //... ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org