lidavidm commented on a change in pull request #10511: URL: https://github.com/apache/arrow/pull/10511#discussion_r660071017
########## File path: cpp/src/arrow/compute/function_internal.h ########## @@ -0,0 +1,565 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <sstream> +#include <string> +#include <vector> + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_nested.h" +#include "arrow/compute/function.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/reflection_internal.h" +#include "arrow/util/string.h" +#include "arrow/util/visibility.h" + +namespace arrow { +struct Scalar; +struct StructScalar; +using ::arrow::internal::checked_cast; +namespace compute { +namespace internal { +ARROW_EXPORT +Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar( + const FunctionOptions&); +ARROW_EXPORT +Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar( + const StructScalar&); + +template <typename T> +struct PropertyToString { + static std::string ToString(const T& value) { + std::stringstream ss; + ss << value; + return ss.str(); + } +}; +template <> +struct PropertyToString<bool> { + static std::string ToString(bool value) { return value ? "true" : "false"; } +}; +template <> +struct PropertyToString<std::string> { + static std::string ToString(const std::string& value) { + std::stringstream ss; + ss << '"' << value << '"'; + return ss.str(); + } +}; +template <typename T> +struct PropertyToString<std::shared_ptr<T>> { + static std::string ToString(const std::shared_ptr<T>& value) { + std::stringstream ss; + return value ? value->ToString() : "<NULLPTR>"; + } +}; +template <> +struct PropertyToString<std::shared_ptr<Scalar>> { + static std::string ToString(const std::shared_ptr<Scalar>& value) { + std::stringstream ss; + ss << value->type->ToString() << ":" << value->ToString(); + return ss.str(); + } +}; +template <> +struct PropertyToString<std::shared_ptr<const KeyValueMetadata>> { + static std::string ToString(const std::shared_ptr<const KeyValueMetadata>& value) { + std::stringstream ss; + ss << "KeyValueMetadata{"; + if (value) { + bool first = true; + for (const auto& pair : value->sorted_pairs()) { + if (!first) ss << ", "; + first = false; + ss << pair.first << ':' << pair.second; + } + } + ss << '}'; + return ss.str(); + } +}; +template <> +struct PropertyToString<Datum> { + static std::string ToString(const Datum& value) { + switch (value.kind()) { + case Datum::NONE: + return "<NULL DATUM>"; + case Datum::SCALAR: + return PropertyToString<std::shared_ptr<Scalar>>::ToString(value.scalar()); + case Datum::ARRAY: { + std::stringstream ss; + ss << value.type()->ToString() << ':' << value.make_array()->ToString(); + return ss.str(); + } + case Datum::CHUNKED_ARRAY: + case Datum::RECORD_BATCH: + case Datum::TABLE: + case Datum::COLLECTION: + return value.ToString(); + } + return value.ToString(); + } +}; +template <typename T> +struct PropertyToString<std::vector<T>> { + static std::string ToString(const std::vector<T>& value) { + std::stringstream ss; + ss << "["; + bool first = true; + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis Review comment: Clang complains when that gets instantiated with `std::vector<bool>` because, well vector<bool> is special :upside_down_face: ########## File path: cpp/src/arrow/compute/util_internal.h ########## @@ -17,6 +17,9 @@ #pragma once +#include <iosfwd> +#include <vector> + Review comment: Sorry, these were left over from a change I didn't back out fully ########## File path: cpp/src/arrow/compute/function.h ########## @@ -39,12 +40,49 @@ namespace compute { /// /// @{ +/// \brief Extension point for defining options outside libarrow (but +/// still within this project). +class ARROW_EXPORT FunctionOptionsType { + public: + virtual ~FunctionOptionsType() = default; + + virtual const char* type_name() const = 0; + virtual std::string Stringify(const FunctionOptions&) const = 0; + virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0; + /// \brief Convert this options struct into a Struct scalar. + virtual Status ToStructScalar(const FunctionOptions&, + std::vector<std::string>* field_names, + std::vector<std::shared_ptr<Scalar>>* values) const; Review comment: Ah, sorry, I misunderstood before. This raises a question though: to know the type_name to deserialize, we need to make some assumption about the serialized data. (Or, as before, we need to register a mapping between function and options type.) ########## File path: cpp/src/arrow/compute/function_internal.h ########## @@ -0,0 +1,565 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <sstream> +#include <string> +#include <vector> + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_nested.h" +#include "arrow/compute/function.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/reflection_internal.h" +#include "arrow/util/string.h" +#include "arrow/util/visibility.h" + +namespace arrow { +struct Scalar; +struct StructScalar; +using ::arrow::internal::checked_cast; +namespace compute { +namespace internal { +ARROW_EXPORT +Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar( + const FunctionOptions&); +ARROW_EXPORT +Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar( + const StructScalar&); + +template <typename T> +struct PropertyToString { + static std::string ToString(const T& value) { + std::stringstream ss; + ss << value; + return ss.str(); + } +}; +template <> +struct PropertyToString<bool> { + static std::string ToString(bool value) { return value ? "true" : "false"; } +}; +template <> +struct PropertyToString<std::string> { + static std::string ToString(const std::string& value) { + std::stringstream ss; + ss << '"' << value << '"'; + return ss.str(); + } +}; +template <typename T> +struct PropertyToString<std::shared_ptr<T>> { + static std::string ToString(const std::shared_ptr<T>& value) { + std::stringstream ss; + return value ? value->ToString() : "<NULLPTR>"; + } +}; +template <> +struct PropertyToString<std::shared_ptr<Scalar>> { + static std::string ToString(const std::shared_ptr<Scalar>& value) { + std::stringstream ss; + ss << value->type->ToString() << ":" << value->ToString(); + return ss.str(); + } +}; +template <> +struct PropertyToString<std::shared_ptr<const KeyValueMetadata>> { + static std::string ToString(const std::shared_ptr<const KeyValueMetadata>& value) { + std::stringstream ss; + ss << "KeyValueMetadata{"; + if (value) { + bool first = true; + for (const auto& pair : value->sorted_pairs()) { + if (!first) ss << ", "; + first = false; + ss << pair.first << ':' << pair.second; + } + } + ss << '}'; + return ss.str(); + } +}; +template <> +struct PropertyToString<Datum> { + static std::string ToString(const Datum& value) { + switch (value.kind()) { + case Datum::NONE: + return "<NULL DATUM>"; + case Datum::SCALAR: + return PropertyToString<std::shared_ptr<Scalar>>::ToString(value.scalar()); + case Datum::ARRAY: { + std::stringstream ss; + ss << value.type()->ToString() << ':' << value.make_array()->ToString(); + return ss.str(); + } + case Datum::CHUNKED_ARRAY: + case Datum::RECORD_BATCH: + case Datum::TABLE: + case Datum::COLLECTION: + return value.ToString(); + } + return value.ToString(); + } +}; +template <typename T> +struct PropertyToString<std::vector<T>> { + static std::string ToString(const std::vector<T>& value) { + std::stringstream ss; + ss << "["; + bool first = true; + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis + for (auto it = value.begin(); it != value.end(); it++) { + if (!first) ss << ", "; + first = false; + ss << PropertyToString<T>::ToString(*it); + } + ss << ']'; + return ss.str(); + } +}; + +template <typename T> +struct PropertyEquals { + static bool Equals(const T& left, const T& right) { return left == right; } +}; +template <typename T> +struct PropertyEquals<std::shared_ptr<T>> { + static bool Equals(const std::shared_ptr<T>& left, const std::shared_ptr<T>& right) { + return (!left && !right) || (left && right && left->Equals(*right)); + } +}; +template <> +struct PropertyEquals<std::shared_ptr<const KeyValueMetadata>> { + static bool Equals(const std::shared_ptr<const KeyValueMetadata>& left, + const std::shared_ptr<const KeyValueMetadata>& right) { + // Special case since we serialize a null metadata into an empty one + return (!left && !right) || (left && right && left->Equals(*right)) || + (!left && right && right->size() == 0) || + (left && !right && left->size() == 0); + } +}; +template <typename T> +struct PropertyEquals<std::vector<T>> { + static bool Equals(const std::vector<T>& left, const std::vector<T>& right) { + if (left.size() != right.size()) return false; + for (size_t i = 0; i < left.size(); i++) { + if (!PropertyEquals<T>::Equals(left[i], right[i])) return false; + } + return true; + } +}; + +template <typename T, typename Enable = void> +struct PropertyToScalar { + static Result<std::shared_ptr<Scalar>> ToScalar(const T& value) { + return Status::NotImplemented("Cannot serialize field"); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; +template <typename T> +struct PropertyToScalar< + T, arrow::internal::void_t<decltype(MakeScalar(std::declval<T>()))>> { + static Result<std::shared_ptr<Scalar>> ToScalar(const T& value) { + return MakeScalar(value); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton(); + } +}; +template <typename T> +struct PropertyToScalar<std::vector<T>> { + static Result<std::shared_ptr<Scalar>> ToScalar(const std::vector<T>& value) { + std::shared_ptr<DataType> type; + if (value.empty()) { + ARROW_ASSIGN_OR_RAISE(type, PropertyToScalar<T>::GetTypeSingleton()); + } + std::vector<std::shared_ptr<Scalar>> scalars; + scalars.reserve(value.size()); + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis + for (auto it = value.begin(); it != value.end(); it++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, PropertyToScalar<T>::ToScalar(*it)); + scalars.push_back(std::move(scalar)); + } + std::unique_ptr<ArrayBuilder> builder; + RETURN_NOT_OK( + MakeBuilder(default_memory_pool(), type ? type : scalars[0]->type, &builder)); + RETURN_NOT_OK(builder->AppendScalars(scalars)); + std::shared_ptr<Array> out; + RETURN_NOT_OK(builder->Finish(&out)); + return std::make_shared<ListScalar>(std::move(out)); + } +}; +template <> +struct PropertyToScalar<std::shared_ptr<const KeyValueMetadata>> { + static Result<std::shared_ptr<Scalar>> ToScalar( + const std::shared_ptr<const KeyValueMetadata>& value) { + ARROW_ASSIGN_OR_RAISE(auto ty, GetTypeSingleton()); + std::unique_ptr<ArrayBuilder> builder; + RETURN_NOT_OK(MakeBuilder(default_memory_pool(), ty, &builder)); + auto* map_builder = checked_cast<MapBuilder*>(builder.get()); + auto* key_builder = checked_cast<BinaryBuilder*>(map_builder->key_builder()); + auto* item_builder = checked_cast<BinaryBuilder*>(map_builder->item_builder()); + RETURN_NOT_OK(map_builder->Append()); + if (value) { + RETURN_NOT_OK(key_builder->AppendValues(value->keys())); + RETURN_NOT_OK(item_builder->AppendValues(value->values())); + } + std::shared_ptr<Array> arr; + RETURN_NOT_OK(map_builder->Finish(&arr)); + return arr->GetScalar(0); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return map(binary(), binary()); + } +}; +template <> +struct PropertyToScalar<std::shared_ptr<DataType>> { + static Result<std::shared_ptr<Scalar>> ToScalar( + const std::shared_ptr<DataType>& value) { + if (!value) { + return Status::Invalid("shared_ptr<DataType> is nullptr"); + } + return MakeNullScalar(value); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; +template <> +struct PropertyToScalar<std::shared_ptr<Scalar>> { + static Result<std::shared_ptr<Scalar>> ToScalar(const std::shared_ptr<Scalar>& value) { + return value; + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; +template <> +struct PropertyToScalar<std::shared_ptr<Array>> { + static Result<std::shared_ptr<Scalar>> ToScalar(const std::shared_ptr<Array>& value) { + return std::make_shared<ListScalar>(value); + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; +template <> +struct PropertyToScalar<Datum> { + static Result<std::shared_ptr<Scalar>> ToScalar(const Datum& value) { + // TODO(ARROW-9434): store in a union instead. + switch (value.kind()) { + case Datum::ARRAY: + return PropertyToScalar<std::shared_ptr<Array>>::ToScalar(value.make_array()); + break; + default: + return Status::NotImplemented("Cannot serialize Datum kind ", value.kind()); + } + } + + static Result<std::shared_ptr<DataType>> GetTypeSingleton() { + return Status::NotImplemented("Cannot get type for field"); + } +}; + +template <typename T, typename Enable = void> +struct PropertyFromScalar { + static Result<T> FromScalar(const std::shared_ptr<Scalar>& value) { + return Status::NotImplemented("Cannot deserialize field"); + } +}; +template <typename T> +struct PropertyFromScalar<T, + enable_if_primitive_ctype<typename CTypeTraits<T>::ArrowType>> { + using ArrowType = typename CTypeTraits<T>::ArrowType; + using ScalarType = typename TypeTraits<ArrowType>::ScalarType; + static Result<T> FromScalar(const std::shared_ptr<Scalar>& value) { + if (value->type->id() != ArrowType::type_id) { + return Status::Invalid("Expected type ", ArrowType::type_id, " but got ", + value->type->ToString()); + } + const auto& holder = checked_cast<const ScalarType&>(*value); + if (!holder.is_valid) return Status::Invalid("Got null scalar"); + return holder.value; + } +}; +template <> +struct PropertyFromScalar<std::string> { + static Result<std::string> FromScalar(const std::shared_ptr<Scalar>& value) { + if (!is_base_binary_like(value->type->id())) { + return Status::Invalid("Expected binary-like type but got ", + value->type->ToString()); + } + const auto& holder = checked_cast<const BaseBinaryScalar&>(*value); + if (!holder.is_valid) return Status::Invalid("Got null scalar"); + return holder.value->ToString(); + } +}; +template <> +struct PropertyFromScalar<std::shared_ptr<DataType>> { + static Result<std::shared_ptr<DataType>> FromScalar( + const std::shared_ptr<Scalar>& value) { + return value->type; + } +}; +template <> +struct PropertyFromScalar<std::shared_ptr<Scalar>> { + static Result<std::shared_ptr<Scalar>> FromScalar( + const std::shared_ptr<Scalar>& value) { + return value; + } +}; +template <> +struct PropertyFromScalar<std::shared_ptr<const KeyValueMetadata>> { + static Result<std::shared_ptr<const KeyValueMetadata>> FromScalar( + const std::shared_ptr<Scalar>& value) { + ARROW_ASSIGN_OR_RAISE( + const auto ty, + PropertyToScalar<std::shared_ptr<const KeyValueMetadata>>::GetTypeSingleton()); + if (!value->type->Equals(ty)) { + return Status::Invalid("Expected ", ty->ToString(), " but got ", + value->type->ToString()); + } + const auto& holder = checked_cast<const MapScalar&>(*value); + std::vector<std::string> keys; + std::vector<std::string> values; + const auto& list = checked_cast<const StructArray&>(*holder.value); + const auto& key_arr = checked_cast<const BinaryArray&>(*list.field(0)); + const auto& value_arr = checked_cast<const BinaryArray&>(*list.field(1)); + for (int64_t i = 0; i < list.length(); i++) { + keys.push_back(key_arr.GetString(i)); + values.push_back(value_arr.GetString(i)); + } + return key_value_metadata(std::move(keys), std::move(values)); + } +}; +template <> +struct PropertyFromScalar<Datum> { + static Result<Datum> FromScalar(const std::shared_ptr<Scalar>& value) { + if (value->type->id() == Type::LIST) { + const auto& holder = checked_cast<const BaseListScalar&>(*value); + return holder.value; + } + // TODO(ARROW-9434): handle other possible datum kinds by looking for a union + return Status::Invalid("Cannot deserialize Datum from ", value->ToString()); + } +}; + +template <typename T> +struct PropertyFromScalar<std::vector<T>> { + static Result<std::vector<T>> FromScalar(const std::shared_ptr<Scalar>& value) { + if (value->type->id() != Type::LIST) { + return Status::Invalid("Expected type LIST but got ", value->type->ToString()); + } + const auto& holder = checked_cast<const BaseListScalar&>(*value); + if (!holder.is_valid) return Status::Invalid("Got null scalar"); + std::vector<T> result; + for (int i = 0; i < holder.value->length(); i++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, holder.value->GetScalar(i)); + ARROW_ASSIGN_OR_RAISE(auto v, PropertyFromScalar<T>::FromScalar(scalar)); + result.push_back(std::move(v)); + } + return result; + } +}; + +// Helper to quickly define serialization for a C-style enum +#define MAKE_ENUM_PROPERTY(ENUM, TYPE, MIN, MAX) \ Review comment: This leads to some more boilerplate because 1) kValues needs to be constexpr to be initialized like so and 2) then kValues and kName need separate declarations outside the struct or the linker will complain about a missing symbol. (So every specialization needs a `static constexpr std::array<...> BasicEnumTraits<..., ..., ...>::kValues;`). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org