bkietz commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r492217469
########## File path: cpp/src/arrow/util/converter.h ########## @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <string> +#include <utility> +#include <vector> + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace internal { + +template <typename BaseConverter, template <typename...> class ConverterTrait> +static Result<std::shared_ptr<BaseConverter>> MakeConverter( + std::shared_ptr<DataType> type, typename BaseConverter::OptionsType options, + MemoryPool* pool); + +template <typename Input, typename Options> +class Converter { + public: + using Self = Converter<Input, Options>; + using InputType = Input; + using OptionsType = Options; + + virtual ~Converter() = default; + + Status Construct(std::shared_ptr<DataType> type, OptionsType options, + MemoryPool* pool) { + type_ = std::move(type); + options_ = std::move(options); + return Init(pool); + } + + virtual Status Append(InputType value) { + return Status::NotImplemented("Converter not implemented for type ", + 
type()->ToString()); + } + + const std::shared_ptr<ArrayBuilder>& builder() const { return builder_; } + + const std::shared_ptr<DataType>& type() const { return type_; } + + OptionsType options() const { return options_; } + + const std::vector<std::shared_ptr<Self>>& children() const { return children_; } + + Status Reserve(int64_t additional_capacity) { + return builder_->Reserve(additional_capacity); + } + + Status AppendNull() { return builder_->AppendNull(); } + + virtual Result<std::shared_ptr<Array>> ToArray() { return builder_->Finish(); } + + virtual Result<std::shared_ptr<Array>> ToArray(int64_t length) { + ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray()); + return arr->Slice(0, length); + } + + protected: + virtual Status Init(MemoryPool* pool) { return Status::OK(); } + + std::shared_ptr<DataType> type_; + std::shared_ptr<ArrayBuilder> builder_; + std::vector<std::shared_ptr<Self>> children_; + OptionsType options_; +}; + +template <typename T, typename BaseConverter> +class PrimitiveConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits<T>::BuilderType; + + protected: + Status Init(MemoryPool* pool) override { + this->builder_ = std::make_shared<BuilderType>(this->type_, pool); + this->primitive_type_ = checked_cast<const T*>(this->type_.get()); + this->primitive_builder_ = checked_cast<BuilderType*>(this->builder_.get()); + return Status::OK(); + } + + const T* primitive_type_; + BuilderType* primitive_builder_; +}; + +template <typename T, typename BaseConverter, template <typename...> class ConverterTrait> +class ListConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits<T>::BuilderType; + using ConverterType = typename ConverterTrait<T>::type; + + protected: + Status Init(MemoryPool* pool) override { + list_type_ = checked_cast<const T*>(this->type_.get()); + ARROW_ASSIGN_OR_RAISE(value_converter_, + (MakeConverter<BaseConverter, ConverterTrait>( + list_type_->value_type(), 
this->options_, pool))); + this->builder_ = + std::make_shared<BuilderType>(pool, value_converter_->builder(), this->type_); + this->children_ = {value_converter_}; + list_builder_ = checked_cast<BuilderType*>(this->builder_.get()); + return Status::OK(); + } + + const T* list_type_; + BuilderType* list_builder_; + std::shared_ptr<BaseConverter> value_converter_; +}; + +template <typename BaseConverter, template <typename...> class ConverterTrait> +class StructConverter : public BaseConverter { + public: + using ConverterType = typename ConverterTrait<StructType>::type; + + protected: + Status Init(MemoryPool* pool) override { + std::shared_ptr<BaseConverter> child_converter; + std::vector<std::shared_ptr<ArrayBuilder>> child_builders; + + struct_type_ = checked_cast<const StructType*>(this->type_.get()); + for (const auto& field : struct_type_->fields()) { + ARROW_ASSIGN_OR_RAISE(child_converter, + (MakeConverter<BaseConverter, ConverterTrait>( + field->type(), this->options_, pool))); + child_builders.push_back(child_converter->builder()); + this->children_.push_back(std::move(child_converter)); + } + + this->builder_ = + std::make_shared<StructBuilder>(this->type_, pool, std::move(child_builders)); + struct_builder_ = checked_cast<StructBuilder*>(this->builder_.get()); + + return Status::OK(); + } + + const StructType* struct_type_; + StructBuilder* struct_builder_; +}; + +template <typename U, typename BaseConverter> +class DictionaryConverter : public BaseConverter { + public: + using BuilderType = DictionaryBuilder<U>; + + protected: + Status Init(MemoryPool* pool) override { + std::unique_ptr<ArrayBuilder> builder; + ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, this->type_, NULLPTR, &builder)); + this->builder_ = std::move(builder); + dict_type_ = checked_cast<const DictionaryType*>(this->type_.get()); + value_type_ = checked_cast<const U*>(dict_type_->value_type().get()); + value_builder_ = checked_cast<BuilderType*>(this->builder_.get()); + return 
Status::OK(); + } + + const DictionaryType* dict_type_; + const U* value_type_; + BuilderType* value_builder_; +}; + +template <typename BaseConverter, template <typename...> class ConverterTrait> +struct MakeConverterImpl { + template <typename T, typename ConverterType = typename ConverterTrait<T>::type> + Status Visit(const T&) { + out.reset(new ConverterType()); + return out->Construct(std::move(type), std::move(options), pool); + } + + Status Visit(const DictionaryType& t) { + switch (t.value_type()->id()) { +#define DICTIONARY_CASE(TYPE) \ + case TYPE::type_id: \ + out = std::make_shared< \ + typename ConverterTrait<DictionaryType>::template type<TYPE>>(); \ + break; + DICTIONARY_CASE(BooleanType); + DICTIONARY_CASE(Int8Type); + DICTIONARY_CASE(Int16Type); + DICTIONARY_CASE(Int32Type); + DICTIONARY_CASE(Int64Type); + DICTIONARY_CASE(UInt8Type); + DICTIONARY_CASE(UInt16Type); + DICTIONARY_CASE(UInt32Type); + DICTIONARY_CASE(UInt64Type); + DICTIONARY_CASE(FloatType); + DICTIONARY_CASE(DoubleType); + DICTIONARY_CASE(BinaryType); + DICTIONARY_CASE(StringType); + DICTIONARY_CASE(FixedSizeBinaryType); + default: + return Status::NotImplemented("DictionaryArray converter for type ", t.ToString(), + " not implemented"); + } + return out->Construct(std::move(type), std::move(options), pool); + } + + Status Visit(const DataType& t) { return Status::NotImplemented(t.name()); } + + std::shared_ptr<DataType> type; + typename BaseConverter::OptionsType options; + MemoryPool* pool; + std::shared_ptr<BaseConverter> out; +}; + +template <typename BaseConverter, template <typename...> class ConverterTrait> +static Result<std::shared_ptr<BaseConverter>> MakeConverter( + std::shared_ptr<DataType> type, typename BaseConverter::OptionsType options, + MemoryPool* pool) { + MakeConverterImpl<BaseConverter, ConverterTrait> visitor{ + std::move(type), std::move(options), pool, nullptr}; Review comment: ```suggestion std::move(type), std::move(options), pool, NULLPTR}; ``` ########## 
File path: cpp/src/arrow/python/python_to_arrow.cc ########## @@ -329,985 +302,602 @@ struct ValueConverter<DurationType> { default: return Status::UnknownError("Invalid time unit"); } + } else if (PyArray_CheckAnyScalarExact(obj)) { + // validate that the numpy scalar has np.datetime64 dtype + std::shared_ptr<DataType> numpy_type; + RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); + if (!numpy_type->Equals(*type)) { + return Status::NotImplemented("Expected np.timedelta64 but got: ", + numpy_type->ToString()); + } + return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval; } else { RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); } return value; } - static inline Result<int64_t> FromNumpy(PyObject* obj, TimeUnit::type unit) { - // validate that the numpy scalar has np.timedelta64 dtype - std::shared_ptr<DataType> type; - RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); - if (type->id() != DurationType::type_id) { - // TODO(kszucs): the message should highlight the received numpy dtype - return Status::Invalid("Expected np.timedelta64 but got: ", type->ToString()); - } - // validate that the time units are matching - if (unit != checked_cast<const DurationType&>(*type).unit()) { - return Status::NotImplemented( - "Cannot convert NumPy np.timedelta64 objects with differing unit"); - } - // convert the numpy value - return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval; - } -}; - -template <typename Type> -struct ValueConverter<Type, enable_if_any_binary<Type>> { - static inline Result<PyBytesView> FromPython(PyObject* obj) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - return std::move(view); - } -}; + // The binary-like intermediate representation is PyBytesView because it keeps temporary + // python objects alive (non-contiguous memoryview) and stores whether the original + // object was unicode encoded or not, which is used for unicode -> bytes coersion if + // there is a 
non-unicode object observed. -template <typename Type> -struct ValueConverter<Type, enable_if_string_like<Type>> { - static inline Result<PyBytesView> FromPython(PyObject* obj) { - // strict conversion, force output to be unicode / utf8 and validate that - // any binary values are utf8 - bool is_utf8 = false; - PyBytesView view; - - RETURN_NOT_OK(view.FromString(obj, &is_utf8)); - if (!is_utf8) { - return internal::InvalidValue(obj, "was not a utf8 string"); - } - return std::move(view); + static Result<PyBytesView> Convert(const BaseBinaryType*, const O&, I obj) { + return PyBytesView::FromString(obj); } - static inline Result<PyBytesView> FromPython(PyObject* obj, bool* is_utf8) { - PyBytesView view; - - // Non-strict conversion; keep track of whether values are unicode or bytes - if (PyUnicode_Check(obj)) { - *is_utf8 = true; - RETURN_NOT_OK(view.FromUnicode(obj)); + static Result<PyBytesView> Convert(const FixedSizeBinaryType* type, const O&, I obj) { + ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); + if (ARROW_PREDICT_TRUE(view.size == type->byte_width())) { + return std::move(view); } else { - // If not unicode or bytes, FromBinary will error - *is_utf8 = false; - RETURN_NOT_OK(view.FromBinary(obj)); - } - return std::move(view); - } -}; - -template <typename Type> -struct ValueConverter<Type, enable_if_fixed_size_binary<Type>> { - static inline Result<PyBytesView> FromPython(PyObject* obj, int32_t byte_width) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - if (ARROW_PREDICT_FALSE(view.size != byte_width)) { std::stringstream ss; - ss << "expected to be length " << byte_width << " was " << view.size; + ss << "expected to be length " << type->byte_width() << " was " << view.size; return internal::InvalidValue(obj, ss.str()); - } else { - return std::move(view); } } -}; - -// ---------------------------------------------------------------------- -// Sequence converter base and CRTP "middle" subclasses - -class SeqConverter; - -// 
Forward-declare converter factory -Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas, - bool strict_conversions, bool ignore_timezone, - std::unique_ptr<SeqConverter>* out); - -// Marshal Python sequence (list, tuple, etc.) to Arrow array -class SeqConverter { - public: - virtual ~SeqConverter() = default; - - // Initialize the sequence converter with an ArrayBuilder created - // externally. The reason for this interface is that we have - // arrow::MakeBuilder which also creates child builders for nested types, so - // we have to pass in the child builders to child SeqConverter in the case of - // converting Python objects to Arrow nested types - virtual Status Init(ArrayBuilder* builder) = 0; - - // Append a single null value to the builder - virtual Status AppendNull() = 0; - - // Append a valid value - virtual Status AppendValue(PyObject* seq) = 0; - - // Append a single python object handling Null values - virtual Status Append(PyObject* seq) = 0; - // Append the contents of a Python sequence to the underlying builder, - // virtual version - virtual Status Extend(PyObject* seq, int64_t size) = 0; - - // Append the contents of a Python sequence to the underlying builder, - // virtual version - virtual Status ExtendMasked(PyObject* seq, PyObject* mask, int64_t size) = 0; - - virtual Status Close() { - if (chunks_.size() == 0 || builder_->length() > 0) { - std::shared_ptr<Array> last_chunk; - RETURN_NOT_OK(builder_->Finish(&last_chunk)); - chunks_.emplace_back(std::move(last_chunk)); + template <typename T> + static enable_if_string<T, Result<PyBytesView>> Convert(const T*, const O& options, + I obj) { + if (options.strict) { + // Strict conversion, force output to be unicode / utf8 and validate that + // any binary values are utf8 + ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj, true)); + if (!view.is_utf8) { + return internal::InvalidValue(obj, "was not a utf8 string"); + } + return std::move(view); + } else { + // 
Non-strict conversion; keep track of whether values are unicode or bytes + return PyBytesView::FromString(obj); } - return Status::OK(); } - virtual Status GetResult(std::shared_ptr<ChunkedArray>* out) { - // Still some accumulated data in the builder. If there are no chunks, we - // always call Finish to deal with the edge case where a size-0 sequence - // was converted with a specific output type, like array([], type=t) - RETURN_NOT_OK(Close()); - *out = std::make_shared<ChunkedArray>(this->chunks_, builder_->type()); - return Status::OK(); + static Result<bool> Convert(const DataType* type, const O&, I obj) { + return Status::NotImplemented("PyValue::Convert is not implemented for type ", type); } - - ArrayBuilder* builder() const { return builder_; } - - int num_chunks() const { return static_cast<int>(chunks_.size()); } - - protected: - ArrayBuilder* builder_; - bool unfinished_builder_; - std::vector<std::shared_ptr<Array>> chunks_; }; -template <typename Type, NullCoding null_coding = NullCoding::NONE_ONLY> -class TypedConverter : public SeqConverter { - public: - using BuilderType = typename TypeTraits<Type>::BuilderType; - - Status Init(ArrayBuilder* builder) override { - builder_ = builder; - DCHECK_NE(builder_, nullptr); - typed_builder_ = checked_cast<BuilderType*>(builder); - return Status::OK(); - } - - // Append a missing item (default implementation) - Status AppendNull() override { return this->typed_builder_->AppendNull(); } - - // Append null if the obj is None or pandas null otherwise the valid value - Status Append(PyObject* obj) override { - return NullChecker<null_coding>::Check(obj) ? 
AppendNull() : AppendValue(obj); - } - - Status Extend(PyObject* obj, int64_t size) override { - /// Ensure we've allocated enough space - RETURN_NOT_OK(typed_builder_->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequence( - obj, [this](PyObject* item, bool* /* unused */) { return this->Append(item); }); - } - - Status ExtendMasked(PyObject* obj, PyObject* mask, int64_t size) override { - /// Ensure we've allocated enough space - RETURN_NOT_OK(typed_builder_->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequenceMasked( - obj, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { - if (is_masked) { - return this->AppendNull(); - } else { - // This will also apply the null-checking convention in the event - // that the value is not masked - return this->Append(item); // perhaps use AppendValue instead? - } - }); - } +template <typename T> +Status Extend(T* converter, PyObject* values, int64_t size) { + /// Ensure we've allocated enough space + RETURN_NOT_OK(converter->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequence(values, [converter](PyObject* item, bool* /* unused */) { + return converter->Append(item); + }); +} - protected: - BuilderType* typed_builder_; -}; +// Convert and append a sequence of values masked with a numpy array +template <typename T> +Status ExtendMasked(T* converter, PyObject* values, PyObject* mask, int64_t size) { + /// Ensure we've allocated enough space + RETURN_NOT_OK(converter->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequenceMasked( + values, mask, [converter](PyObject* item, bool is_masked, bool* /* unused */) { + if (is_masked) { + return converter->AppendNull(); + } else { + // This will also apply the null-checking convention in the event + // that the value is not masked + return converter->Append(item); // perhaps use AppendValue instead? 
+ } + }); +} -// ---------------------------------------------------------------------- -// Sequence converter for null type +// The base Converter class is a mixin with predefined behavior and constructors. +using PyConverter = Converter<PyObject*, PyConversionOptions>; -template <NullCoding null_coding> -class NullConverter : public TypedConverter<NullType, null_coding> { - public: - Status AppendValue(PyObject* obj) override { - return internal::InvalidValue(obj, "converting to null type"); - } -}; +template <typename T, typename Enable = void> +class PyPrimitiveConverter; -// ---------------------------------------------------------------------- -// Sequence converter template for primitive (integer and floating point bool) types +template <typename T> +class PyListConverter; -template <typename Type, NullCoding null_coding> -class PrimitiveConverter : public TypedConverter<Type, null_coding> { - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter<Type>::FromPython(obj)); - return this->typed_builder_->Append(value); - } -}; +template <typename U, typename Enable = void> +class PyDictionaryConverter; -// ---------------------------------------------------------------------- -// Sequence converters for temporal types +class PyStructConverter; -template <typename Type, NullCoding null_coding> -class TimeConverter : public TypedConverter<Type, null_coding> { - public: - explicit TimeConverter(TimeUnit::type unit, bool ignore_timezone) - : unit_(unit), ignore_timezone_(ignore_timezone) {} - - // TODO(kszucs): support numpy values for date and time converters - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto value, - ValueConverter<Type>::FromPython(obj, unit_, ignore_timezone_)); - return this->typed_builder_->Append(value); - } +template <typename T, typename Enable = void> +struct PyConverterTrait; - protected: - TimeUnit::type unit_; - bool ignore_timezone_; +template <typename T> +struct 
PyConverterTrait< + T, enable_if_t<!is_nested_type<T>::value && !is_interval_type<T>::value && + !is_extension_type<T>::value>> { + using type = PyPrimitiveConverter<T>; }; -// TODO(kszucs): move it to the type_traits template <typename T> -struct NumpyType {}; +struct PyConverterTrait<T, enable_if_list_like<T>> { + using type = PyListConverter<T>; +}; template <> -struct NumpyType<TimestampType> { - static inline bool isnull(int64_t v) { - return internal::npy_traits<NPY_DATETIME>::isnull(v); - } +struct PyConverterTrait<StructType> { + using type = PyStructConverter; }; template <> -struct NumpyType<DurationType> { - static inline bool isnull(int64_t v) { - return internal::npy_traits<NPY_TIMEDELTA>::isnull(v); - } +struct PyConverterTrait<DictionaryType> { + template <typename T> + using type = PyDictionaryConverter<T>; }; -template <typename Type, NullCoding null_coding> -class TemporalConverter : public TimeConverter<Type, null_coding> { +template <typename T> +class PyPrimitiveConverter< + T, enable_if_t<is_null_type<T>::value || is_boolean_type<T>::value || + is_number_type<T>::value || is_decimal_type<T>::value || + is_date_type<T>::value || is_time_type<T>::value>> + : public PrimitiveConverter<T, PyConverter> { public: - using TimeConverter<Type, null_coding>::TimeConverter; - - Status AppendValue(PyObject* obj) override { - int64_t value; - if (PyArray_CheckAnyScalarExact(obj)) { - // convert np.datetime64 / np.timedelta64 depending on Type - ARROW_ASSIGN_OR_RAISE(value, ValueConverter<Type>::FromNumpy(obj, this->unit_)); - if (NumpyType<Type>::isnull(value)) { - // checks numpy NaT sentinel after conversion - return this->typed_builder_->AppendNull(); - } + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->primitive_builder_->AppendNull(); } else { ARROW_ASSIGN_OR_RAISE( - value, - ValueConverter<Type>::FromPython( - obj, this->unit_, TimeConverter<Type, null_coding>::ignore_timezone_)); + auto 
converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + return this->primitive_builder_->Append(converted); } - return this->typed_builder_->Append(value); } }; -// ---------------------------------------------------------------------- -// Sequence converters for Binary, FixedSizeBinary, String - -template <typename Type, NullCoding null_coding> -class BinaryLikeConverter : public TypedConverter<Type, null_coding> { +template <typename T> +class PyPrimitiveConverter< + T, enable_if_t<is_timestamp_type<T>::value || is_duration_type<T>::value>> + : public PrimitiveConverter<T, PyConverter> { public: - using BuilderType = typename TypeTraits<Type>::BuilderType; - - inline Status AutoChunk(Py_ssize_t size) { - // did we reach the builder size limit? - if (ARROW_PREDICT_FALSE(this->typed_builder_->value_data_length() + size > - BuilderType::memory_limit())) { - // builder would be full, so need to add a new chunk - std::shared_ptr<Array> chunk; - RETURN_NOT_OK(this->typed_builder_->Finish(&chunk)); - this->chunks_.emplace_back(std::move(chunk)); - } - return Status::OK(); - } - - Status AppendString(const PyBytesView& view) { - // check that the value fits in the datatype - if (view.size > BuilderType::memory_limit()) { - return Status::Invalid("string too large for datatype"); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->primitive_builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE( + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + // Numpy NaT sentinels can be checked after the conversion + if (PyArray_CheckAnyScalarExact(value) && + PyValue::IsNaT(this->primitive_type_, converted)) { + return this->primitive_builder_->AppendNull(); + } else { + return this->primitive_builder_->Append(converted); + } } - DCHECK_GE(view.size, 0); - - // create a new chunk if the value would overflow the builder - RETURN_NOT_OK(AutoChunk(view.size)); - - // now 
we can safely append the value to the builder - RETURN_NOT_OK( - this->typed_builder_->Append(::arrow::util::string_view(view.bytes, view.size))); - - return Status::OK(); } - - protected: - // Create a single instance of PyBytesView here to prevent unnecessary object - // creation/destruction - PyBytesView string_view_; }; -template <typename Type, NullCoding null_coding> -class BinaryConverter : public BinaryLikeConverter<Type, null_coding> { +template <typename T> +class PyPrimitiveConverter<T, enable_if_binary<T>> + : public PrimitiveConverter<T, PyConverter> { public: - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto view, ValueConverter<Type>::FromPython(obj)); - return this->AppendString(view); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->primitive_builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE( + auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->ValidateOverflow(view.size)); + return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + } } }; -template <NullCoding null_coding> -class FixedSizeBinaryConverter - : public BinaryLikeConverter<FixedSizeBinaryType, null_coding> { +template <typename T> +class PyPrimitiveConverter<T, enable_if_string_like<T>> + : public PrimitiveConverter<T, PyConverter> { public: - explicit FixedSizeBinaryConverter(int32_t byte_width) : byte_width_(byte_width) {} + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->primitive_builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE( + auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); + if (!view.is_utf8) { + // observed binary value + observed_binary_ = true; + } + ARROW_RETURN_NOT_OK(this->primitive_builder_->ValidateOverflow(view.size)); + return 
this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + } + } - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE( - this->string_view_, - ValueConverter<FixedSizeBinaryType>::FromPython(obj, byte_width_)); - return this->AppendString(this->string_view_); + Result<std::shared_ptr<Array>> ToArray() override { + ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter<T, PyConverter>::ToArray())); + if (observed_binary_) { + // if we saw any non-unicode, cast results to BinaryArray + auto binary_type = TypeTraits<typename T::PhysicalType>::type_singleton(); + return array->View(binary_type); + } else { + return array; + } } protected: - int32_t byte_width_; + bool observed_binary_ = false; }; -// For String/UTF8, if strict_conversions enabled, we reject any non-UTF8, -// otherwise we allow but return results as BinaryArray -template <typename Type, bool Strict, NullCoding null_coding> -class StringConverter : public BinaryLikeConverter<Type, null_coding> { +template <typename U> +class PyDictionaryConverter<U, enable_if_has_c_type<U>> + : public DictionaryConverter<U, PyConverter> { public: - StringConverter() : binary_count_(0) {} - - Status AppendValue(PyObject* obj) override { - if (Strict) { - // raise if the object is not unicode or not an utf-8 encoded bytes - ARROW_ASSIGN_OR_RAISE(this->string_view_, ValueConverter<Type>::FromPython(obj)); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->value_builder_->AppendNull(); } else { - // keep track of whether values are unicode or bytes; if any bytes are - // observe, the result will be bytes - bool is_utf8; - ARROW_ASSIGN_OR_RAISE(this->string_view_, - ValueConverter<Type>::FromPython(obj, &is_utf8)); - if (!is_utf8) { - ++binary_count_; - } + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->value_builder_->Append(converted); } - return 
this->AppendString(this->string_view_); } +}; - Status GetResult(std::shared_ptr<ChunkedArray>* out) override { - RETURN_NOT_OK(SeqConverter::GetResult(out)); - - // If we saw any non-unicode, cast results to BinaryArray - if (binary_count_) { - // We should have bailed out earlier - DCHECK(!Strict); - auto binary_type = TypeTraits<typename Type::PhysicalType>::type_singleton(); - return (*out)->View(binary_type).Value(out); +template <typename U> +class PyDictionaryConverter<U, enable_if_has_string_view<U>> + : public DictionaryConverter<U, PyConverter> { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->value_builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE(auto view, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->value_builder_->Append(util::string_view(view.bytes, view.size)); } - return Status::OK(); } - - protected: - int64_t binary_count_; }; -// ---------------------------------------------------------------------- -// Convert lists (NumPy arrays containing lists or ndarrays as values) - // If the value type does not match the expected NumPy dtype, then fall through // to a slower PySequence-based path -#define LIST_FAST_CASE(TYPE, NUMPY_TYPE, ArrowType) \ - case Type::TYPE: { \ - if (PyArray_DESCR(arr)->type_num != NUMPY_TYPE) { \ - return value_converter_->Extend(obj, value_length); \ - } \ - return AppendNdarrayTypedItem<NUMPY_TYPE, ArrowType>(arr); \ +#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ + case Type::TYPE_ID: { \ + if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ + return Extend(this->value_converter_.get(), value, size); \ + } \ + return AppendNdarrayTyped<TYPE, NUMPY_TYPE>(ndarray); \ } // Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise -#define LIST_SLOW_CASE(TYPE) \ - case Type::TYPE: { \ - return value_converter_->Extend(obj, value_length); \ +#define LIST_SLOW_CASE(TYPE_ID) \ + case Type::TYPE_ID: 
{ \ + return Extend(this->value_converter_.get(), value, size); \ } -// Base class for ListConverter and FixedSizeListConverter (to have both work with CRTP) -template <typename TypeClass, NullCoding null_coding> -class BaseListConverter : public TypedConverter<TypeClass, null_coding> { +template <typename T> +class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> { public: - using BuilderType = typename TypeTraits<TypeClass>::BuilderType; - - explicit BaseListConverter(bool from_pandas, bool strict_conversions, - bool ignore_timezone) - : from_pandas_(from_pandas), - strict_conversions_(strict_conversions), - ignore_timezone_(ignore_timezone) {} - - Status Init(ArrayBuilder* builder) override { - this->builder_ = builder; - this->typed_builder_ = checked_cast<BuilderType*>(builder); - - this->value_type_ = checked_cast<const TypeClass&>(*builder->type()).value_type(); - RETURN_NOT_OK(GetConverter(value_type_, from_pandas_, strict_conversions_, - ignore_timezone_, &value_converter_)); - return this->value_converter_->Init(this->typed_builder_->value_builder()); - } + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->list_builder_->AppendNull(); + } - template <int NUMPY_TYPE, typename Type> - Status AppendNdarrayTypedItem(PyArrayObject* arr) { - using traits = internal::npy_traits<NUMPY_TYPE>; - using T = typename traits::value_type; - using ValueBuilderType = typename TypeTraits<Type>::BuilderType; + RETURN_NOT_OK(this->list_builder_->Append()); + if (PyArray_Check(value)) { + RETURN_NOT_OK(AppendNdarray(value)); + } else if (PySequence_Check(value)) { + RETURN_NOT_OK(AppendSequence(value)); + } else { + return internal::InvalidType( + value, "was not a sequence or recognized null for conversion to list type"); + } - const bool null_sentinels_possible = - // Always treat Numpy's NaT as null - NUMPY_TYPE == NPY_DATETIME || NUMPY_TYPE == NPY_TIMEDELTA || - // Observing pandas's null 
sentinels - (from_pandas_ && traits::supports_nulls); + return ValidateBuilder(this->list_type_); + } - auto child_builder = checked_cast<ValueBuilderType*>(value_converter_->builder()); + protected: + Status ValidateOverflow(const MapType*, int64_t size) { return Status::OK(); } Review comment: Since MapArray has the same offset structure as ListArray, it should probably have non-empty overflow validation. ########## File path: cpp/src/arrow/python/python_to_arrow.cc ########## @@ -329,985 +302,602 @@ struct ValueConverter<DurationType> { default: return Status::UnknownError("Invalid time unit"); } + } else if (PyArray_CheckAnyScalarExact(obj)) { + // validate that the numpy scalar has np.datetime64 dtype + std::shared_ptr<DataType> numpy_type; + RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); + if (!numpy_type->Equals(*type)) { + return Status::NotImplemented("Expected np.timedelta64 but got: ", + numpy_type->ToString()); + } + return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval; } else { RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); } return value; } - static inline Result<int64_t> FromNumpy(PyObject* obj, TimeUnit::type unit) { - // validate that the numpy scalar has np.timedelta64 dtype - std::shared_ptr<DataType> type; - RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); - if (type->id() != DurationType::type_id) { - // TODO(kszucs): the message should highlight the received numpy dtype - return Status::Invalid("Expected np.timedelta64 but got: ", type->ToString()); - } - // validate that the time units are matching - if (unit != checked_cast<const DurationType&>(*type).unit()) { - return Status::NotImplemented( - "Cannot convert NumPy np.timedelta64 objects with differing unit"); - } - // convert the numpy value - return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval; - } -}; - -template <typename Type> -struct ValueConverter<Type, enable_if_any_binary<Type>> { - static inline 
Result<PyBytesView> FromPython(PyObject* obj) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - return std::move(view); - } -}; + // The binary-like intermediate representation is PyBytesView because it keeps temporary + // python objects alive (non-contiguous memoryview) and stores whether the original + // object was unicode encoded or not, which is used for unicode -> bytes coersion if + // there is a non-unicode object observed. -template <typename Type> -struct ValueConverter<Type, enable_if_string_like<Type>> { - static inline Result<PyBytesView> FromPython(PyObject* obj) { - // strict conversion, force output to be unicode / utf8 and validate that - // any binary values are utf8 - bool is_utf8 = false; - PyBytesView view; - - RETURN_NOT_OK(view.FromString(obj, &is_utf8)); - if (!is_utf8) { - return internal::InvalidValue(obj, "was not a utf8 string"); - } - return std::move(view); + static Result<PyBytesView> Convert(const BaseBinaryType*, const O&, I obj) { + return PyBytesView::FromString(obj); } - static inline Result<PyBytesView> FromPython(PyObject* obj, bool* is_utf8) { - PyBytesView view; - - // Non-strict conversion; keep track of whether values are unicode or bytes - if (PyUnicode_Check(obj)) { - *is_utf8 = true; - RETURN_NOT_OK(view.FromUnicode(obj)); + static Result<PyBytesView> Convert(const FixedSizeBinaryType* type, const O&, I obj) { + ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); + if (ARROW_PREDICT_TRUE(view.size == type->byte_width())) { + return std::move(view); } else { - // If not unicode or bytes, FromBinary will error - *is_utf8 = false; - RETURN_NOT_OK(view.FromBinary(obj)); - } - return std::move(view); - } -}; - -template <typename Type> -struct ValueConverter<Type, enable_if_fixed_size_binary<Type>> { - static inline Result<PyBytesView> FromPython(PyObject* obj, int32_t byte_width) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - if (ARROW_PREDICT_FALSE(view.size != byte_width)) { 
std::stringstream ss; - ss << "expected to be length " << byte_width << " was " << view.size; + ss << "expected to be length " << type->byte_width() << " was " << view.size; return internal::InvalidValue(obj, ss.str()); - } else { - return std::move(view); } } -}; - -// ---------------------------------------------------------------------- -// Sequence converter base and CRTP "middle" subclasses - -class SeqConverter; - -// Forward-declare converter factory -Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas, - bool strict_conversions, bool ignore_timezone, - std::unique_ptr<SeqConverter>* out); - -// Marshal Python sequence (list, tuple, etc.) to Arrow array -class SeqConverter { - public: - virtual ~SeqConverter() = default; - - // Initialize the sequence converter with an ArrayBuilder created - // externally. The reason for this interface is that we have - // arrow::MakeBuilder which also creates child builders for nested types, so - // we have to pass in the child builders to child SeqConverter in the case of - // converting Python objects to Arrow nested types - virtual Status Init(ArrayBuilder* builder) = 0; - - // Append a single null value to the builder - virtual Status AppendNull() = 0; - - // Append a valid value - virtual Status AppendValue(PyObject* seq) = 0; - - // Append a single python object handling Null values - virtual Status Append(PyObject* seq) = 0; - // Append the contents of a Python sequence to the underlying builder, - // virtual version - virtual Status Extend(PyObject* seq, int64_t size) = 0; - - // Append the contents of a Python sequence to the underlying builder, - // virtual version - virtual Status ExtendMasked(PyObject* seq, PyObject* mask, int64_t size) = 0; - - virtual Status Close() { - if (chunks_.size() == 0 || builder_->length() > 0) { - std::shared_ptr<Array> last_chunk; - RETURN_NOT_OK(builder_->Finish(&last_chunk)); - chunks_.emplace_back(std::move(last_chunk)); + template <typename T> + static 
enable_if_string<T, Result<PyBytesView>> Convert(const T*, const O& options, + I obj) { + if (options.strict) { + // Strict conversion, force output to be unicode / utf8 and validate that + // any binary values are utf8 + ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj, true)); + if (!view.is_utf8) { + return internal::InvalidValue(obj, "was not a utf8 string"); + } + return std::move(view); + } else { + // Non-strict conversion; keep track of whether values are unicode or bytes + return PyBytesView::FromString(obj); } - return Status::OK(); } - virtual Status GetResult(std::shared_ptr<ChunkedArray>* out) { - // Still some accumulated data in the builder. If there are no chunks, we - // always call Finish to deal with the edge case where a size-0 sequence - // was converted with a specific output type, like array([], type=t) - RETURN_NOT_OK(Close()); - *out = std::make_shared<ChunkedArray>(this->chunks_, builder_->type()); - return Status::OK(); + static Result<bool> Convert(const DataType* type, const O&, I obj) { + return Status::NotImplemented("PyValue::Convert is not implemented for type ", type); } - - ArrayBuilder* builder() const { return builder_; } - - int num_chunks() const { return static_cast<int>(chunks_.size()); } - - protected: - ArrayBuilder* builder_; - bool unfinished_builder_; - std::vector<std::shared_ptr<Array>> chunks_; }; -template <typename Type, NullCoding null_coding = NullCoding::NONE_ONLY> -class TypedConverter : public SeqConverter { - public: - using BuilderType = typename TypeTraits<Type>::BuilderType; - - Status Init(ArrayBuilder* builder) override { - builder_ = builder; - DCHECK_NE(builder_, nullptr); - typed_builder_ = checked_cast<BuilderType*>(builder); - return Status::OK(); - } - - // Append a missing item (default implementation) - Status AppendNull() override { return this->typed_builder_->AppendNull(); } - - // Append null if the obj is None or pandas null otherwise the valid value - Status Append(PyObject* 
obj) override { - return NullChecker<null_coding>::Check(obj) ? AppendNull() : AppendValue(obj); - } - - Status Extend(PyObject* obj, int64_t size) override { - /// Ensure we've allocated enough space - RETURN_NOT_OK(typed_builder_->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequence( - obj, [this](PyObject* item, bool* /* unused */) { return this->Append(item); }); - } - - Status ExtendMasked(PyObject* obj, PyObject* mask, int64_t size) override { - /// Ensure we've allocated enough space - RETURN_NOT_OK(typed_builder_->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequenceMasked( - obj, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { - if (is_masked) { - return this->AppendNull(); - } else { - // This will also apply the null-checking convention in the event - // that the value is not masked - return this->Append(item); // perhaps use AppendValue instead? - } - }); - } +template <typename T> +Status Extend(T* converter, PyObject* values, int64_t size) { + /// Ensure we've allocated enough space + RETURN_NOT_OK(converter->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequence(values, [converter](PyObject* item, bool* /* unused */) { + return converter->Append(item); + }); +} - protected: - BuilderType* typed_builder_; -}; +// Convert and append a sequence of values masked with a numpy array +template <typename T> +Status ExtendMasked(T* converter, PyObject* values, PyObject* mask, int64_t size) { + /// Ensure we've allocated enough space + RETURN_NOT_OK(converter->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequenceMasked( + values, mask, [converter](PyObject* item, bool is_masked, bool* /* unused */) { + if (is_masked) { + return converter->AppendNull(); + } else { + // This will also apply the null-checking convention in the event + // that the value is not masked + return 
converter->Append(item); // perhaps use AppendValue instead? + } + }); +} -// ---------------------------------------------------------------------- -// Sequence converter for null type +// The base Converter class is a mixin with predefined behavior and constructors. +using PyConverter = Converter<PyObject*, PyConversionOptions>; -template <NullCoding null_coding> -class NullConverter : public TypedConverter<NullType, null_coding> { - public: - Status AppendValue(PyObject* obj) override { - return internal::InvalidValue(obj, "converting to null type"); - } -}; +template <typename T, typename Enable = void> +class PyPrimitiveConverter; -// ---------------------------------------------------------------------- -// Sequence converter template for primitive (integer and floating point bool) types +template <typename T> +class PyListConverter; -template <typename Type, NullCoding null_coding> -class PrimitiveConverter : public TypedConverter<Type, null_coding> { - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter<Type>::FromPython(obj)); - return this->typed_builder_->Append(value); - } -}; +template <typename U, typename Enable = void> +class PyDictionaryConverter; -// ---------------------------------------------------------------------- -// Sequence converters for temporal types +class PyStructConverter; -template <typename Type, NullCoding null_coding> -class TimeConverter : public TypedConverter<Type, null_coding> { - public: - explicit TimeConverter(TimeUnit::type unit, bool ignore_timezone) - : unit_(unit), ignore_timezone_(ignore_timezone) {} - - // TODO(kszucs): support numpy values for date and time converters - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto value, - ValueConverter<Type>::FromPython(obj, unit_, ignore_timezone_)); - return this->typed_builder_->Append(value); - } +template <typename T, typename Enable = void> +struct PyConverterTrait; - protected: - TimeUnit::type unit_; 
- bool ignore_timezone_; +template <typename T> +struct PyConverterTrait< + T, enable_if_t<!is_nested_type<T>::value && !is_interval_type<T>::value && + !is_extension_type<T>::value>> { + using type = PyPrimitiveConverter<T>; }; -// TODO(kszucs): move it to the type_traits template <typename T> -struct NumpyType {}; +struct PyConverterTrait<T, enable_if_list_like<T>> { + using type = PyListConverter<T>; +}; template <> -struct NumpyType<TimestampType> { - static inline bool isnull(int64_t v) { - return internal::npy_traits<NPY_DATETIME>::isnull(v); - } +struct PyConverterTrait<StructType> { + using type = PyStructConverter; }; template <> -struct NumpyType<DurationType> { - static inline bool isnull(int64_t v) { - return internal::npy_traits<NPY_TIMEDELTA>::isnull(v); - } +struct PyConverterTrait<DictionaryType> { + template <typename T> + using type = PyDictionaryConverter<T>; }; -template <typename Type, NullCoding null_coding> -class TemporalConverter : public TimeConverter<Type, null_coding> { +template <typename T> +class PyPrimitiveConverter< + T, enable_if_t<is_null_type<T>::value || is_boolean_type<T>::value || + is_number_type<T>::value || is_decimal_type<T>::value || + is_date_type<T>::value || is_time_type<T>::value>> + : public PrimitiveConverter<T, PyConverter> { public: - using TimeConverter<Type, null_coding>::TimeConverter; - - Status AppendValue(PyObject* obj) override { - int64_t value; - if (PyArray_CheckAnyScalarExact(obj)) { - // convert np.datetime64 / np.timedelta64 depending on Type - ARROW_ASSIGN_OR_RAISE(value, ValueConverter<Type>::FromNumpy(obj, this->unit_)); - if (NumpyType<Type>::isnull(value)) { - // checks numpy NaT sentinel after conversion - return this->typed_builder_->AppendNull(); - } + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->primitive_builder_->AppendNull(); } else { ARROW_ASSIGN_OR_RAISE( - value, - ValueConverter<Type>::FromPython( - obj, this->unit_, 
TimeConverter<Type, null_coding>::ignore_timezone_)); + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + return this->primitive_builder_->Append(converted); } - return this->typed_builder_->Append(value); } }; -// ---------------------------------------------------------------------- -// Sequence converters for Binary, FixedSizeBinary, String - -template <typename Type, NullCoding null_coding> -class BinaryLikeConverter : public TypedConverter<Type, null_coding> { +template <typename T> +class PyPrimitiveConverter< + T, enable_if_t<is_timestamp_type<T>::value || is_duration_type<T>::value>> + : public PrimitiveConverter<T, PyConverter> { public: - using BuilderType = typename TypeTraits<Type>::BuilderType; - - inline Status AutoChunk(Py_ssize_t size) { - // did we reach the builder size limit? - if (ARROW_PREDICT_FALSE(this->typed_builder_->value_data_length() + size > - BuilderType::memory_limit())) { - // builder would be full, so need to add a new chunk - std::shared_ptr<Array> chunk; - RETURN_NOT_OK(this->typed_builder_->Finish(&chunk)); - this->chunks_.emplace_back(std::move(chunk)); - } - return Status::OK(); - } - - Status AppendString(const PyBytesView& view) { - // check that the value fits in the datatype - if (view.size > BuilderType::memory_limit()) { - return Status::Invalid("string too large for datatype"); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->primitive_builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE( + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + // Numpy NaT sentinels can be checked after the conversion + if (PyArray_CheckAnyScalarExact(value) && + PyValue::IsNaT(this->primitive_type_, converted)) { + return this->primitive_builder_->AppendNull(); + } else { + return this->primitive_builder_->Append(converted); + } } - DCHECK_GE(view.size, 0); - - // create a new chunk if the value would overflow 
the builder - RETURN_NOT_OK(AutoChunk(view.size)); - - // now we can safely append the value to the builder - RETURN_NOT_OK( - this->typed_builder_->Append(::arrow::util::string_view(view.bytes, view.size))); - - return Status::OK(); } - - protected: - // Create a single instance of PyBytesView here to prevent unnecessary object - // creation/destruction - PyBytesView string_view_; }; -template <typename Type, NullCoding null_coding> -class BinaryConverter : public BinaryLikeConverter<Type, null_coding> { +template <typename T> +class PyPrimitiveConverter<T, enable_if_binary<T>> + : public PrimitiveConverter<T, PyConverter> { public: - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto view, ValueConverter<Type>::FromPython(obj)); - return this->AppendString(view); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->primitive_builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE( + auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->ValidateOverflow(view.size)); + return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + } } }; -template <NullCoding null_coding> -class FixedSizeBinaryConverter - : public BinaryLikeConverter<FixedSizeBinaryType, null_coding> { +template <typename T> +class PyPrimitiveConverter<T, enable_if_string_like<T>> + : public PrimitiveConverter<T, PyConverter> { public: - explicit FixedSizeBinaryConverter(int32_t byte_width) : byte_width_(byte_width) {} + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->primitive_builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE( + auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); + if (!view.is_utf8) { + // observed binary value + observed_binary_ = true; + } + ARROW_RETURN_NOT_OK(this->primitive_builder_->ValidateOverflow(view.size)); + 
return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + } + } - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE( - this->string_view_, - ValueConverter<FixedSizeBinaryType>::FromPython(obj, byte_width_)); - return this->AppendString(this->string_view_); + Result<std::shared_ptr<Array>> ToArray() override { + ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter<T, PyConverter>::ToArray())); + if (observed_binary_) { + // if we saw any non-unicode, cast results to BinaryArray + auto binary_type = TypeTraits<typename T::PhysicalType>::type_singleton(); + return array->View(binary_type); + } else { + return array; + } } protected: - int32_t byte_width_; + bool observed_binary_ = false; }; -// For String/UTF8, if strict_conversions enabled, we reject any non-UTF8, -// otherwise we allow but return results as BinaryArray -template <typename Type, bool Strict, NullCoding null_coding> -class StringConverter : public BinaryLikeConverter<Type, null_coding> { +template <typename U> +class PyDictionaryConverter<U, enable_if_has_c_type<U>> + : public DictionaryConverter<U, PyConverter> { public: - StringConverter() : binary_count_(0) {} - - Status AppendValue(PyObject* obj) override { - if (Strict) { - // raise if the object is not unicode or not an utf-8 encoded bytes - ARROW_ASSIGN_OR_RAISE(this->string_view_, ValueConverter<Type>::FromPython(obj)); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->value_builder_->AppendNull(); } else { - // keep track of whether values are unicode or bytes; if any bytes are - // observe, the result will be bytes - bool is_utf8; - ARROW_ASSIGN_OR_RAISE(this->string_view_, - ValueConverter<Type>::FromPython(obj, &is_utf8)); - if (!is_utf8) { - ++binary_count_; - } + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->value_builder_->Append(converted); } - return 
this->AppendString(this->string_view_); } +}; - Status GetResult(std::shared_ptr<ChunkedArray>* out) override { - RETURN_NOT_OK(SeqConverter::GetResult(out)); - - // If we saw any non-unicode, cast results to BinaryArray - if (binary_count_) { - // We should have bailed out earlier - DCHECK(!Strict); - auto binary_type = TypeTraits<typename Type::PhysicalType>::type_singleton(); - return (*out)->View(binary_type).Value(out); +template <typename U> +class PyDictionaryConverter<U, enable_if_has_string_view<U>> + : public DictionaryConverter<U, PyConverter> { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->value_builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE(auto view, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->value_builder_->Append(util::string_view(view.bytes, view.size)); } - return Status::OK(); } - - protected: - int64_t binary_count_; }; -// ---------------------------------------------------------------------- -// Convert lists (NumPy arrays containing lists or ndarrays as values) - // If the value type does not match the expected NumPy dtype, then fall through // to a slower PySequence-based path -#define LIST_FAST_CASE(TYPE, NUMPY_TYPE, ArrowType) \ - case Type::TYPE: { \ - if (PyArray_DESCR(arr)->type_num != NUMPY_TYPE) { \ - return value_converter_->Extend(obj, value_length); \ - } \ - return AppendNdarrayTypedItem<NUMPY_TYPE, ArrowType>(arr); \ +#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ + case Type::TYPE_ID: { \ + if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ + return Extend(this->value_converter_.get(), value, size); \ + } \ + return AppendNdarrayTyped<TYPE, NUMPY_TYPE>(ndarray); \ } // Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise -#define LIST_SLOW_CASE(TYPE) \ - case Type::TYPE: { \ - return value_converter_->Extend(obj, value_length); \ +#define LIST_SLOW_CASE(TYPE_ID) \ + case Type::TYPE_ID: 
{ \ + return Extend(this->value_converter_.get(), value, size); \ } Review comment: Since these macros are only used in PyListConverter::AppendNdarray, please #define and #undef them inside that scope. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org
