Repository: arrow Updated Branches: refs/heads/master 6999dbd1e -> 2c5b412c2
http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/cpp/src/arrow/python/pandas_convert.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_convert.h b/cpp/src/arrow/python/pandas_convert.h deleted file mode 100644 index 45c8a1a..0000000 --- a/cpp/src/arrow/python/pandas_convert.h +++ /dev/null @@ -1,77 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Functions for converting between pandas's NumPy-based data representation -// and Arrow data structures - -#ifndef ARROW_PYTHON_ADAPTERS_PANDAS_H -#define ARROW_PYTHON_ADAPTERS_PANDAS_H - -#include "arrow/python/platform.h" - -#include <memory> -#include <string> - -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class Column; -class DataType; -class MemoryPool; -class Status; -class Table; - -namespace py { - -ARROW_EXPORT -Status ConvertArrayToPandas( - const std::shared_ptr<Array>& arr, PyObject* py_ref, PyObject** out); - -ARROW_EXPORT -Status ConvertColumnToPandas( - const std::shared_ptr<Column>& col, PyObject* py_ref, PyObject** out); - -struct PandasOptions { - bool strings_to_categorical; -}; - -// Convert a whole table as efficiently as possible to a pandas.DataFrame. -// -// The returned Python object is a list of tuples consisting of the exact 2D -// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x. -// -// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2]) -ARROW_EXPORT -Status ConvertTableToPandas( - const std::shared_ptr<Table>& table, int nthreads, PyObject** out); - -ARROW_EXPORT -Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out); - -/// Convert dtype=object arrays. If target data type is not known, pass a type -/// with nullptr -ARROW_EXPORT -Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out); - -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_ADAPTERS_PANDAS_H http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/cpp/src/arrow/python/pandas_to_arrow.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc new file mode 100644 index 0000000..1368c36 --- /dev/null +++ b/cpp/src/arrow/python/pandas_to_arrow.cc @@ -0,0 +1,1099 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for pandas conversion via NumPy + +#include "arrow/python/numpy_interop.h" + +#include "arrow/python/pandas_to_arrow.h" + +#include <cmath> +#include <cstdint> +#include <limits> +#include <memory> +#include <sstream> +#include <string> +#include <vector> + +#include "arrow/array.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" +#include "arrow/visitor_inline.h" + +#include "arrow/python/builtin_convert.h" +#include "arrow/python/common.h" +#include "arrow/python/config.h" +#include "arrow/python/helpers.h" +#include "arrow/python/numpy-internal.h" +#include "arrow/python/numpy_convert.h" +#include "arrow/python/type_traits.h" +#include "arrow/python/util/datetime.h" + +namespace arrow { +namespace py { + +// ---------------------------------------------------------------------- +// Conversion utilities + +static inline bool PyFloat_isnan(const PyObject* obj) { + if (PyFloat_Check(obj)) { + double val = PyFloat_AS_DOUBLE(obj); + return val != val; + } else { + return false; + } +} +static inline bool PandasObjectIsNull(const PyObject* obj) { + return obj == Py_None || obj == numpy_nan || PyFloat_isnan(obj); +} + +static inline bool PyObject_is_string(const PyObject* obj) { +#if PY_MAJOR_VERSION >= 3 + return PyUnicode_Check(obj) || PyBytes_Check(obj); +#else + return PyString_Check(obj) || PyUnicode_Check(obj); +#endif +} + +static inline bool PyObject_is_float(const PyObject* obj) { + return PyFloat_Check(obj); +} + +static inline bool PyObject_is_integer(const PyObject* obj) { + return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); +} + +template <int TYPE> +static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { + typedef npy_traits<TYPE> traits; + typedef typename traits::value_type T; + + int64_t null_count = 0; + + Ndarray1DIndexer<T> values(arr); + + // TODO(wesm): striding + for (int i = 0; i < values.size(); ++i) { + if (traits::isnull(values[i])) { + ++null_count; + } else { + BitUtil::SetBit(bitmap, i); + } + } + + return null_count; +} + +// Returns null count +static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { + int64_t null_count = 0; + + Ndarray1DIndexer<uint8_t> mask_values(mask); + for (int i = 0; i < length; ++i) { + if (mask_values[i]) { + ++null_count; + } else { + BitUtil::SetBit(bitmap, i); + } + } + return null_count; +} + +template <int TYPE> +static int64_t ValuesToValidBytes( + const void* data, int64_t length, uint8_t* valid_bytes) { + typedef npy_traits<TYPE> traits; + typedef typename traits::value_type T; + + int64_t null_count = 0; + const T* values = reinterpret_cast<const T*>(data); + + // TODO(wesm): striding + for (int i = 0; i < length; ++i) { + valid_bytes[i] = !traits::isnull(values[i]); + if (traits::isnull(values[i])) null_count++; + } + + return null_count; +} + +Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { + if (PyArray_NDIM(numpy_array) != 1) { + return Status::Invalid("only handle 1-dimensional arrays"); + } + + if (PyArray_DESCR(numpy_array)->type_num != np_type) { + return Status::Invalid("can only handle exact conversions"); + } + + npy_intp* astrides = PyArray_STRIDES(numpy_array); + if (astrides[0] != PyArray_DESCR(numpy_array)->elsize) { + return Status::Invalid("No support for strided arrays in lists yet"); + } + return Status::OK(); +} + +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max(); + +/// Append as many string objects from NumPy arrays to a `StringBuilder` as we +/// can fit +/// +/// \param[in] offset starting offset for appending +/// \param[out] values_consumed ending offset where we stopped appending. Will +/// be length of arr if fully consumed +/// \param[out] have_bytes true if we encountered any PyBytes object +static Status AppendObjectStrings(PyArrayObject* arr, PyArrayObject* mask, int64_t offset, + StringBuilder* builder, int64_t* end_offset, bool* have_bytes) { + PyObject* obj; + + Ndarray1DIndexer<PyObject*> objects(arr); + Ndarray1DIndexer<uint8_t> mask_values; + + bool have_mask = false; + if (mask != nullptr) { + mask_values.Init(mask); + have_mask = true; + } + + for (; offset < objects.size(); ++offset) { + OwnedRef tmp_obj; + obj = objects[offset]; + if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) { + RETURN_NOT_OK(builder->AppendNull()); + continue; + } else if (PyUnicode_Check(obj)) { + obj = PyUnicode_AsUTF8String(obj); + if (obj == NULL) { + PyErr_Clear(); + return Status::Invalid("failed converting unicode to UTF8"); + } + tmp_obj.reset(obj); + } else if (PyBytes_Check(obj)) { + *have_bytes = true; + } else { + std::stringstream ss; + ss << "Error converting to Python objects to String/UTF8: "; + RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss)); + return Status::Invalid(ss.str()); + } + + const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj)); + if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) { + break; + } + RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length)); + } + + // If we consumed the whole array, this will be the length of arr + *end_offset = offset; + return Status::OK(); +} + +static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mask, + int byte_width, int64_t offset, FixedSizeBinaryBuilder* builder, + int64_t* end_offset) { + PyObject* obj; + + Ndarray1DIndexer<PyObject*> objects(arr); + Ndarray1DIndexer<uint8_t> mask_values; + + bool have_mask = false; + if (mask != nullptr) { + mask_values.Init(mask); + have_mask = true; + } + + for (; offset < objects.size(); ++offset) { + OwnedRef tmp_obj; + obj = objects[offset]; + if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) { + RETURN_NOT_OK(builder->AppendNull()); + continue; + } else if (PyUnicode_Check(obj)) { + obj = PyUnicode_AsUTF8String(obj); + if (obj == NULL) { + PyErr_Clear(); + return Status::Invalid("failed converting unicode to UTF8"); + } + + tmp_obj.reset(obj); + } else if (!PyBytes_Check(obj)) { + std::stringstream ss; + ss << "Error converting to Python objects to FixedSizeBinary: "; + RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss)); + return Status::Invalid(ss.str()); + } + + RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width)); + if (ARROW_PREDICT_FALSE( + builder->value_data_length() + byte_width > kBinaryMemoryLimit)) { + break; + } + RETURN_NOT_OK( + builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)))); + } + + // If we consumed the whole array, this will be the length of arr + *end_offset = offset; + return Status::OK(); +} + +// ---------------------------------------------------------------------- +// Conversion from NumPy-in-Pandas to Arrow + +class PandasConverter { + public: + PandasConverter( + MemoryPool* pool, PyObject* ao, PyObject* mo, const std::shared_ptr<DataType>& type) + : pool_(pool), + type_(type), + arr_(reinterpret_cast<PyArrayObject*>(ao)), + mask_(nullptr) { + if (mo != nullptr && mo != Py_None) { mask_ = reinterpret_cast<PyArrayObject*>(mo); } + length_ = static_cast<int64_t>(PyArray_SIZE(arr_)); + } + + bool is_strided() const { + npy_intp* astrides = PyArray_STRIDES(arr_); + return astrides[0] != PyArray_DESCR(arr_)->elsize; + } + + Status InitNullBitmap() { + int64_t null_bytes = BitUtil::BytesForBits(length_); + + null_bitmap_ = std::make_shared<PoolBuffer>(pool_); + RETURN_NOT_OK(null_bitmap_->Resize(null_bytes)); + + null_bitmap_data_ = null_bitmap_->mutable_data(); + memset(null_bitmap_data_, 0, static_cast<size_t>(null_bytes)); + + return Status::OK(); + } + + // ---------------------------------------------------------------------- + // Traditional visitor conversion for non-object arrays + + template <typename ArrowType> + Status ConvertData(std::shared_ptr<Buffer>* data); + + template <typename T> + Status PushBuilderResult(T* builder) { + std::shared_ptr<Array> out; + RETURN_NOT_OK(builder->Finish(&out)); + out_arrays_.emplace_back(out); + return Status::OK(); + } + + Status PushArray(const std::shared_ptr<internal::ArrayData>& data) { + std::shared_ptr<Array> result; + RETURN_NOT_OK(internal::MakeArray(data, &result)); + out_arrays_.emplace_back(std::move(result)); + return Status::OK(); + } + + template <typename ArrowType> + Status VisitNative() { + using traits = arrow_traits<ArrowType::type_id>; + + if (mask_ != nullptr || traits::supports_nulls) { RETURN_NOT_OK(InitNullBitmap()); } + + std::shared_ptr<Buffer> data; + RETURN_NOT_OK(ConvertData<ArrowType>(&data)); + + int64_t null_count = 0; + if (mask_ != nullptr) { + null_count = MaskToBitmap(mask_, length_, null_bitmap_data_); + } else if (traits::supports_nulls) { + // TODO(wesm): this presumes the NumPy C type and arrow C type are the + // same + null_count = ValuesToBitmap<traits::npy_type>(arr_, null_bitmap_data_); + } + + BufferVector buffers = {null_bitmap_, data}; + return PushArray(std::make_shared<internal::ArrayData>( + type_, length_, std::move(buffers), null_count, 0)); + } + + template <typename T> + typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value || + std::is_same<BooleanType, T>::value, + Status>::type + Visit(const T& type) { + return VisitNative<T>(); + } + + Status Visit(const Date32Type& type) { return VisitNative<Int32Type>(); } + Status Visit(const Date64Type& type) { return VisitNative<Int64Type>(); } + Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); } + Status Visit(const Time32Type& type) { return VisitNative<Int32Type>(); } + Status Visit(const Time64Type& type) { return VisitNative<Int64Type>(); } + + Status TypeNotImplemented(std::string type_name) { + std::stringstream ss; + ss << "PandasConverter doesn't implement <" << type_name << "> conversion. "; + return Status::NotImplemented(ss.str()); + } + + Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); } + + Status Visit(const BinaryType& type) { return TypeNotImplemented(type.ToString()); } + + Status Visit(const FixedSizeBinaryType& type) { + return TypeNotImplemented(type.ToString()); + } + + Status Visit(const DecimalType& type) { return TypeNotImplemented(type.ToString()); } + + Status Visit(const DictionaryType& type) { return TypeNotImplemented(type.ToString()); } + + Status Visit(const NestedType& type) { return TypeNotImplemented(type.ToString()); } + + Status Convert() { + if (PyArray_NDIM(arr_) != 1) { + return Status::Invalid("only handle 1-dimensional arrays"); + } + + if (type_ == nullptr) { return Status::Invalid("Must pass data type"); } + + // Visit the type to perform conversion + return VisitTypeInline(*type_, this); + } + + const std::vector<std::shared_ptr<Array>>& result() const { return out_arrays_; } + + // ---------------------------------------------------------------------- + // Conversion logic for various object dtype arrays + + template <int ITEM_TYPE, typename ArrowType> + Status ConvertTypedLists( + const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list); + + template <typename ArrowType> + Status ConvertDates(); + + Status ConvertBooleans(); + Status ConvertObjectStrings(); + Status ConvertObjectFloats(); + Status ConvertObjectFixedWidthBytes(const std::shared_ptr<DataType>& type); + Status ConvertObjectIntegers(); + Status ConvertLists(const std::shared_ptr<DataType>& type); + Status ConvertLists( + const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list); + Status ConvertObjects(); + Status ConvertDecimals(); + Status ConvertTimes(); + + protected: + MemoryPool* pool_; + std::shared_ptr<DataType> type_; + PyArrayObject* arr_; + PyArrayObject* mask_; + int64_t length_; + + // Used in visitor pattern + std::vector<std::shared_ptr<Array>> out_arrays_; + + std::shared_ptr<ResizableBuffer> null_bitmap_; + uint8_t* null_bitmap_data_; +}; + +template <typename T> +void CopyStrided(T* input_data, int64_t length, int64_t stride, T* output_data) { + // Passing input_data as non-const is a concession to PyObject* + int64_t j = 0; + for (int64_t i = 0; i < length; ++i) { + output_data[i] = input_data[j]; + j += stride; + } +} + +template <> +void CopyStrided<PyObject*>( + PyObject** input_data, int64_t length, int64_t stride, PyObject** output_data) { + int64_t j = 0; + for (int64_t i = 0; i < length; ++i) { + output_data[i] = input_data[j]; + if (output_data[i] != nullptr) { Py_INCREF(output_data[i]); } + j += stride; + } +} + +template <typename ArrowType> +inline Status PandasConverter::ConvertData(std::shared_ptr<Buffer>* data) { + using traits = arrow_traits<ArrowType::type_id>; + using T = typename traits::T; + + // Handle LONGLONG->INT64 and other fun things + int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num); + + if (numpy_type_size(traits::npy_type) != numpy_type_size(type_num_compat)) { + return Status::NotImplemented("NumPy type casts not yet implemented"); + } + + if (is_strided()) { + // Strided, must copy into new contiguous memory + const int64_t stride = PyArray_STRIDES(arr_)[0]; + const int64_t stride_elements = stride / sizeof(T); + + auto new_buffer = std::make_shared<PoolBuffer>(pool_); + RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length_)); + CopyStrided(reinterpret_cast<T*>(PyArray_DATA(arr_)), length_, stride_elements, + reinterpret_cast<T*>(new_buffer->mutable_data())); + *data = new_buffer; + } else { + // Can zero-copy + *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_)); + } + return Status::OK(); +} + +template <> +inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* data) { + int64_t nbytes = BitUtil::BytesForBits(length_); + auto buffer = std::make_shared<PoolBuffer>(pool_); + RETURN_NOT_OK(buffer->Resize(nbytes)); + + Ndarray1DIndexer<uint8_t> values(arr_); + + uint8_t* bitmap = buffer->mutable_data(); + + memset(bitmap, 0, nbytes); + for (int i = 0; i < length_; ++i) { + if (values[i] > 0) { BitUtil::SetBit(bitmap, i); } + } + + *data = buffer; + return Status::OK(); +} + +template <typename T> +struct UnboxDate {}; + +template <> +struct UnboxDate<Date32Type> { + static int32_t Unbox(PyObject* obj) { + return PyDate_to_days(reinterpret_cast<PyDateTime_Date*>(obj)); + } +}; + +template <> +struct UnboxDate<Date64Type> { + static int64_t Unbox(PyObject* obj) { + return PyDate_to_ms(reinterpret_cast<PyDateTime_Date*>(obj)); + } +}; + +template <typename ArrowType> +Status PandasConverter::ConvertDates() { + PyAcquireGIL lock; + + using BuilderType = typename TypeTraits<ArrowType>::BuilderType; + + Ndarray1DIndexer<PyObject*> objects(arr_); + + if (mask_ != nullptr) { + return Status::NotImplemented("mask not supported in object conversions yet"); + } + + BuilderType builder(pool_); + RETURN_NOT_OK(builder.Resize(length_)); + + /// We have to run this in this compilation unit, since we cannot use the + /// datetime API otherwise + PyDateTime_IMPORT; + + PyObject* obj; + for (int64_t i = 0; i < length_; ++i) { + obj = objects[i]; + if (PyDate_CheckExact(obj)) { + RETURN_NOT_OK(builder.Append(UnboxDate<ArrowType>::Unbox(obj))); + } else if (PandasObjectIsNull(obj)) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + std::stringstream ss; + ss << "Error converting from Python objects to " << type_->ToString() << ": "; + RETURN_NOT_OK(InvalidConversion(obj, "datetime.date", &ss)); + return Status::Invalid(ss.str()); + } + } + + return PushBuilderResult(&builder); +} + +#define CONVERT_DECIMAL_CASE(bit_width, builder, object) \ + case bit_width: { \ + decimal::Decimal##bit_width d; \ + std::string string_out; \ + RETURN_NOT_OK(PythonDecimalToString((object), &string_out)); \ + RETURN_NOT_OK(FromString(string_out, &d)); \ + RETURN_NOT_OK((builder).Append(d)); \ + break; \ + } + +Status PandasConverter::ConvertDecimals() { + PyAcquireGIL lock; + + // Import the decimal module and Decimal class + OwnedRef decimal; + OwnedRef Decimal; + RETURN_NOT_OK(ImportModule("decimal", &decimal)); + RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); + + PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); + PyObject* object = objects[0]; + + int precision; + int scale; + + RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale)); + + type_ = std::make_shared<DecimalType>(precision, scale); + + const int bit_width = std::dynamic_pointer_cast<DecimalType>(type_)->bit_width(); + DecimalBuilder builder(pool_, type_); + RETURN_NOT_OK(builder.Resize(length_)); + + for (int64_t i = 0; i < length_; ++i) { + object = objects[i]; + if (PyObject_IsInstance(object, Decimal.obj())) { + switch (bit_width) { + CONVERT_DECIMAL_CASE(32, builder, object) + CONVERT_DECIMAL_CASE(64, builder, object) + CONVERT_DECIMAL_CASE(128, builder, object) + default: + break; + } + } else if (PandasObjectIsNull(object)) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + std::stringstream ss; + ss << "Error converting from Python objects to " << type_->ToString() << ": "; + RETURN_NOT_OK(InvalidConversion(object, "decimal.Decimal", &ss)); + return Status::Invalid(ss.str()); + } + } + return PushBuilderResult(&builder); +} + +Status PandasConverter::ConvertTimes() { + // Convert array of datetime.time objects to Arrow + PyAcquireGIL lock; + PyDateTime_IMPORT; + + PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); + + // datetime.time stores microsecond resolution + Time64Builder builder(pool_, ::arrow::time64(TimeUnit::MICRO)); + RETURN_NOT_OK(builder.Resize(length_)); + + PyObject* obj; + for (int64_t i = 0; i < length_; ++i) { + obj = objects[i]; + if (PyTime_Check(obj)) { + RETURN_NOT_OK(builder.Append(PyTime_to_us(obj))); + } else if (PandasObjectIsNull(obj)) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + std::stringstream ss; + ss << "Error converting from Python objects to " << type_->ToString() << ": "; + RETURN_NOT_OK(InvalidConversion(obj, "datetime.time", &ss)); + return Status::Invalid(ss.str()); + } + } + return PushBuilderResult(&builder); +} + +#undef CONVERT_DECIMAL_CASE + +Status PandasConverter::ConvertObjectStrings() { + PyAcquireGIL lock; + + // The output type at this point is inconclusive because there may be bytes + // and unicode mixed in the object array + StringBuilder builder(pool_); + RETURN_NOT_OK(builder.Resize(length_)); + + bool global_have_bytes = false; + int64_t offset = 0; + while (offset < length_) { + bool chunk_have_bytes = false; + RETURN_NOT_OK( + AppendObjectStrings(arr_, mask_, offset, &builder, &offset, &chunk_have_bytes)); + + global_have_bytes = global_have_bytes | chunk_have_bytes; + std::shared_ptr<Array> chunk; + RETURN_NOT_OK(builder.Finish(&chunk)); + out_arrays_.emplace_back(std::move(chunk)); + } + + // If we saw PyBytes, convert everything to BinaryArray + if (global_have_bytes) { + for (size_t i = 0; i < out_arrays_.size(); ++i) { + auto binary_data = out_arrays_[i]->data()->ShallowCopy(); + binary_data->type = ::arrow::binary(); + out_arrays_[i] = std::make_shared<BinaryArray>(binary_data); + } + } + return Status::OK(); +} + +Status PandasConverter::ConvertObjectFloats() { + PyAcquireGIL lock; + + Ndarray1DIndexer<PyObject*> objects(arr_); + Ndarray1DIndexer<uint8_t> mask_values; + + bool have_mask = false; + if (mask_ != nullptr) { + mask_values.Init(mask_); + have_mask = true; + } + + DoubleBuilder builder(pool_); + RETURN_NOT_OK(builder.Resize(length_)); + + PyObject* obj; + for (int64_t i = 0; i < objects.size(); ++i) { + obj = objects[i]; + if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) { + RETURN_NOT_OK(builder.AppendNull()); + } else if (PyFloat_Check(obj)) { + double val = PyFloat_AsDouble(obj); + RETURN_IF_PYERROR(); + RETURN_NOT_OK(builder.Append(val)); + } else { + std::stringstream ss; + ss << "Error converting from Python objects to " << type_->ToString() << ": "; + RETURN_NOT_OK(InvalidConversion(obj, "float", &ss)); + return Status::Invalid(ss.str()); + } + } + + return PushBuilderResult(&builder); +} + +Status PandasConverter::ConvertObjectIntegers() { + PyAcquireGIL lock; + + Int64Builder builder(pool_); + RETURN_NOT_OK(builder.Resize(length_)); + + Ndarray1DIndexer<PyObject*> objects(arr_); + Ndarray1DIndexer<uint8_t> mask_values; + + bool have_mask = false; + if (mask_ != nullptr) { + mask_values.Init(mask_); + have_mask = true; + } + + PyObject* obj; + for (int64_t i = 0; i < objects.size(); ++i) { + obj = objects[i]; + if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) { + RETURN_NOT_OK(builder.AppendNull()); + } else if (PyObject_is_integer(obj)) { + const int64_t val = static_cast<int64_t>(PyLong_AsLong(obj)); + RETURN_IF_PYERROR(); + RETURN_NOT_OK(builder.Append(val)); + } else { + std::stringstream ss; + ss << "Error converting from Python objects to " << type_->ToString() << ": "; + RETURN_NOT_OK(InvalidConversion(obj, "integer", &ss)); + return Status::Invalid(ss.str()); + } + } + + return PushBuilderResult(&builder); +} + +Status PandasConverter::ConvertObjectFixedWidthBytes( + const std::shared_ptr<DataType>& type) { + PyAcquireGIL lock; + + int32_t byte_width = static_cast<const FixedSizeBinaryType&>(*type).byte_width(); + + // The output type at this point is inconclusive because there may be bytes + // and unicode mixed in the object array + FixedSizeBinaryBuilder builder(pool_, type); + RETURN_NOT_OK(builder.Resize(length_)); + + int64_t offset = 0; + while (offset < length_) { + RETURN_NOT_OK( + AppendObjectFixedWidthBytes(arr_, mask_, byte_width, offset, &builder, &offset)); + + std::shared_ptr<Array> chunk; + RETURN_NOT_OK(builder.Finish(&chunk)); + out_arrays_.emplace_back(std::move(chunk)); + } + return Status::OK(); +} + +Status PandasConverter::ConvertBooleans() { + PyAcquireGIL lock; + + Ndarray1DIndexer<PyObject*> objects(arr_); + Ndarray1DIndexer<uint8_t> mask_values; + + bool have_mask = false; + if (mask_ != nullptr) { + mask_values.Init(mask_); + have_mask = true; + } + + int64_t nbytes = BitUtil::BytesForBits(length_); + auto data = std::make_shared<PoolBuffer>(pool_); + RETURN_NOT_OK(data->Resize(nbytes)); + uint8_t* bitmap = data->mutable_data(); + memset(bitmap, 0, nbytes); + + int64_t null_count = 0; + PyObject* obj; + for (int64_t i = 0; i < length_; ++i) { + obj = objects[i]; + if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) { + ++null_count; + } else if (obj == Py_True) { + BitUtil::SetBit(bitmap, i); + BitUtil::SetBit(null_bitmap_data_, i); + } else if (obj == Py_False) { + BitUtil::SetBit(null_bitmap_data_, i); + } else { + std::stringstream ss; + ss << "Error converting from Python objects to " << type_->ToString() << ": "; + RETURN_NOT_OK(InvalidConversion(obj, "bool", &ss)); + return Status::Invalid(ss.str()); + } + } + + out_arrays_.push_back( + std::make_shared<BooleanArray>(length_, data, null_bitmap_, null_count)); + return Status::OK(); +} + +Status PandasConverter::ConvertObjects() { + // Python object arrays are annoying, since we could have one of: + // + // * Strings + // * Booleans with nulls + // * decimal.Decimals + // * Mixed type (not supported at the moment by arrow format) + // + // Additionally, nulls may be encoded either as np.nan or None. So we have to + // do some type inference and conversion + + RETURN_NOT_OK(InitNullBitmap()); + + Ndarray1DIndexer<PyObject*> objects; + + PyAcquireGIL lock; + objects.Init(arr_); + PyDateTime_IMPORT; + lock.release(); + + // This means we received an explicit type from the user + if (type_) { + switch (type_->id()) { + case Type::STRING: + return ConvertObjectStrings(); + case Type::FIXED_SIZE_BINARY: + return ConvertObjectFixedWidthBytes(type_); + case Type::BOOL: + return ConvertBooleans(); + case Type::DATE32: + return ConvertDates<Date32Type>(); + case Type::DATE64: + return ConvertDates<Date64Type>(); + case Type::LIST: { + const auto& list_field = static_cast<const ListType&>(*type_); + return ConvertLists(list_field.value_field()->type()); + } + case Type::DECIMAL: + return ConvertDecimals(); + default: + return Status::TypeError("No known conversion to Arrow type"); + } + } else { + // Re-acquire GIL + lock.acquire(); + + OwnedRef decimal; + OwnedRef Decimal; + RETURN_NOT_OK(ImportModule("decimal", &decimal)); + RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); + + for (int64_t i = 0; i < length_; ++i) { + PyObject* obj = objects[i]; + if (PandasObjectIsNull(obj)) { + continue; + } else if (PyObject_is_string(obj)) { + return ConvertObjectStrings(); + } else if (PyObject_is_float(obj)) { + return ConvertObjectFloats(); + } else if (PyBool_Check(obj)) { + return ConvertBooleans(); + } else if (PyObject_is_integer(obj)) { + return ConvertObjectIntegers(); + } else if (PyDate_CheckExact(obj)) { + // We could choose Date32 or Date64 + return ConvertDates<Date32Type>(); + } else if (PyTime_Check(obj)) { + return ConvertTimes(); + } else if (PyObject_IsInstance(const_cast<PyObject*>(obj), Decimal.obj())) { + return ConvertDecimals(); + } else if (PyList_Check(obj) || PyArray_Check(obj)) { + std::shared_ptr<DataType> inferred_type; + RETURN_NOT_OK(InferArrowType(obj, &inferred_type)); + return ConvertLists(inferred_type); + } else { + const std::string supported_types = + "string, bool, float, int, date, time, decimal, list, array"; + std::stringstream ss; + ss << "Error inferring Arrow type for Python object array. "; + RETURN_NOT_OK(InvalidConversion(obj, supported_types, &ss)); + return Status::Invalid(ss.str()); + } + } + } + + out_arrays_.push_back(std::make_shared<NullArray>(length_)); + return Status::OK(); +} + +template <typename T> +Status LoopPySequence(PyObject* sequence, T func) { + if (PySequence_Check(sequence)) { + OwnedRef ref; + Py_ssize_t size = PySequence_Size(sequence); + if (PyArray_Check(sequence)) { + auto array = reinterpret_cast<PyArrayObject*>(sequence); + PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(array)); + for (int64_t i = 0; i < size; ++i) { + RETURN_NOT_OK(func(objects[i])); + } + } else { + for (int64_t i = 0; i < size; ++i) { + ref.reset(PySequence_GetItem(sequence, i)); + RETURN_NOT_OK(func(ref.obj())); + } + } + } else if (PyObject_HasAttrString(sequence, "__iter__")) { + OwnedRef iter = OwnedRef(PyObject_GetIter(sequence)); + PyObject* item; + while ((item = PyIter_Next(iter.obj()))) { + OwnedRef ref = OwnedRef(item); + RETURN_NOT_OK(func(ref.obj())); + } + } else { + return Status::TypeError("Object is not a sequence or iterable"); + } + + return Status::OK(); +} + +template <int ITEM_TYPE, typename ArrowType> +inline Status PandasConverter::ConvertTypedLists( + const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) { + typedef npy_traits<ITEM_TYPE> traits; + typedef typename traits::value_type T; + typedef typename traits::BuilderClass BuilderT; + + PyAcquireGIL lock; + + // TODO: mask not supported here + if (mask_ != nullptr) { + return Status::NotImplemented("mask not supported in object conversions yet"); + } + + if (is_strided()) { + return Status::NotImplemented("strided arrays not implemented for lists"); + } + + BuilderT* value_builder = static_cast<BuilderT*>(builder->value_builder()); + + auto foreach_item = [&](PyObject* object) { + if (PandasObjectIsNull(object)) { + return builder->AppendNull(); + } else if (PyArray_Check(object)) { + auto numpy_array = reinterpret_cast<PyArrayObject*>(object); + RETURN_NOT_OK(builder->Append(true)); + + // TODO(uwe): Support more complex numpy array structures + RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE)); + + int64_t size = PyArray_DIM(numpy_array, 0); + auto data = reinterpret_cast<const T*>(PyArray_DATA(numpy_array)); + if (traits::supports_nulls) { + RETURN_NOT_OK(null_bitmap_->Resize(size, false)); + // TODO(uwe): A bitmap would be more space-efficient but the Builder API doesn't + // currently support this. + // ValuesToBitmap<ITEM_TYPE>(data, size, null_bitmap_->mutable_data()); + ValuesToValidBytes<ITEM_TYPE>(data, size, null_bitmap_->mutable_data()); + return value_builder->Append(data, size, null_bitmap_->data()); + } else { + return value_builder->Append(data, size); + } + } else if (PyList_Check(object)) { + int64_t size; + std::shared_ptr<DataType> inferred_type; + RETURN_NOT_OK(builder->Append(true)); + RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type)); + if (inferred_type->id() != type->id()) { + std::stringstream ss; + ss << inferred_type->ToString() << " cannot be converted to " << type->ToString(); + return Status::TypeError(ss.str()); + } + return AppendPySequence(object, size, type, value_builder); + } else { + return Status::TypeError("Unsupported Python type for list items"); + } + }; + + return LoopPySequence(list, foreach_item); +} + +template <> +inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>( + const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) { + PyAcquireGIL lock; + // TODO: If there are bytes involed, convert to Binary representation + bool have_bytes = false; + + // TODO: mask not supported here + if (mask_ != nullptr) { + return Status::NotImplemented("mask not supported in object conversions yet"); + } + + if (is_strided()) { + return Status::NotImplemented("strided arrays not implemented for lists"); + } + + auto value_builder = static_cast<StringBuilder*>(builder->value_builder()); + + auto foreach_item = [&](PyObject* object) { + if (PandasObjectIsNull(object)) { + return builder->AppendNull(); + } else if (PyArray_Check(object)) { + auto numpy_array = reinterpret_cast<PyArrayObject*>(object); + RETURN_NOT_OK(builder->Append(true)); + + // TODO(uwe): Support more complex numpy array structures + RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT)); + + int64_t offset = 0; + RETURN_NOT_OK(AppendObjectStrings( + numpy_array, nullptr, 0, value_builder, &offset, &have_bytes)); + if (offset < PyArray_SIZE(numpy_array)) { + return Status::Invalid("Array cell value exceeded 2GB"); + } + return Status::OK(); + } else if (PyList_Check(object)) { + int64_t size; + std::shared_ptr<DataType> inferred_type; + RETURN_NOT_OK(builder->Append(true)); + RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type)); + if (inferred_type->id() != Type::STRING) { + std::stringstream ss; + ss << inferred_type->ToString() << " cannot be converted to STRING."; + return Status::TypeError(ss.str()); + } + return AppendPySequence(object, size, inferred_type, value_builder); + } else { + return Status::TypeError("Unsupported Python type for list items"); + } + }; + + return LoopPySequence(list, foreach_item); +} + +#define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType) \ + case Type::TYPE: { \ + return ConvertTypedLists<NUMPY_TYPE, ArrowType>(type, builder, list); \ + } + +Status PandasConverter::ConvertLists( + const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) { + switch (type->id()) { + LIST_CASE(UINT8, NPY_UINT8, UInt8Type) + LIST_CASE(INT8, NPY_INT8, Int8Type) + LIST_CASE(UINT16, NPY_UINT16, UInt16Type) + LIST_CASE(INT16, NPY_INT16, Int16Type) + LIST_CASE(UINT32, NPY_UINT32, UInt32Type) + LIST_CASE(INT32, NPY_INT32, Int32Type) + LIST_CASE(UINT64, NPY_UINT64, UInt64Type) + LIST_CASE(INT64, NPY_INT64, Int64Type) + LIST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType) + LIST_CASE(FLOAT, NPY_FLOAT, FloatType) + LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType) + LIST_CASE(STRING, NPY_OBJECT, StringType) + case Type::LIST: { + const ListType& list_type = static_cast<const ListType&>(*type); + auto value_builder = static_cast<ListBuilder*>(builder->value_builder()); + + auto foreach_item = [&](PyObject* object) { + if (PandasObjectIsNull(object)) { + return builder->AppendNull(); + } else { + RETURN_NOT_OK(builder->Append(true)); + return ConvertLists(list_type.value_type(), value_builder, object); + } + }; + + return LoopPySequence(list, foreach_item); + } + default: { + std::stringstream ss; + ss << "Unknown list item type: "; + ss << type->ToString(); + return Status::TypeError(ss.str()); + } + } +} + +Status PandasConverter::ConvertLists(const std::shared_ptr<DataType>& type) { + std::unique_ptr<ArrayBuilder> array_builder; + RETURN_NOT_OK(MakeBuilder(pool_, arrow::list(type), &array_builder)); + ListBuilder* list_builder = static_cast<ListBuilder*>(array_builder.get()); + RETURN_NOT_OK(ConvertLists(type, list_builder, reinterpret_cast<PyObject*>(arr_))); + return PushBuilderResult(list_builder); +} + +Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, + const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) { + PandasConverter converter(pool, ao, mo, type); + RETURN_NOT_OK(converter.Convert()); + *out = converter.result()[0]; + return Status::OK(); +} + +Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, + const std::shared_ptr<DataType>& type, std::shared_ptr<ChunkedArray>* out) { + PandasConverter converter(pool, ao, mo, type); + RETURN_NOT_OK(converter.ConvertObjects()); + *out = std::make_shared<ChunkedArray>(converter.result()); + return Status::OK(); +} + +} // namespace py +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/cpp/src/arrow/python/pandas_to_arrow.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_to_arrow.h b/cpp/src/arrow/python/pandas_to_arrow.h new file mode 100644 index 0000000..8f18624 --- /dev/null +++ b/cpp/src/arrow/python/pandas_to_arrow.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Converting from pandas memory representation to Arrow data structures + +#ifndef ARROW_PYTHON_PANDAS_TO_ARROW_H +#define ARROW_PYTHON_PANDAS_TO_ARROW_H + +#include "arrow/python/platform.h" + +#include <memory> + +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class ChunkedArray; +class DataType; +class MemoryPool; +class Status; + +namespace py { + +ARROW_EXPORT +Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, + const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out); + +/// Convert dtype=object arrays. If target data type is not known, pass a type +/// with nullptr +/// +/// \param[in] pool Memory pool for any memory allocations +/// \param[in] ao an ndarray with the array data +/// \param[in] mo an ndarray with a null mask (True is null), optional +/// \param[in] type +/// \param[out] out a ChunkedArray, to accommodate chunked output +ARROW_EXPORT +Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, + const std::shared_ptr<DataType>& type, std::shared_ptr<ChunkedArray>* out); + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_PANDAS_TO_ARROW_H http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/cpp/src/arrow/python/python-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index 592a5e6..c0e555d 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -26,10 +26,10 @@ #include "arrow/table.h" #include "arrow/test-util.h" +#include "arrow/python/arrow_to_pandas.h" #include "arrow/python/builtin_convert.h" #include "arrow/python/common.h" #include "arrow/python/helpers.h" -#include "arrow/python/pandas_convert.h" #include "arrow/util/decimal.h" http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/cpp/src/arrow/table.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 3ec1df9..7ada0e9 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -23,6 +23,7 @@ #include <string> #include <vector> +#include "arrow/array.h" #include "arrow/type.h" #include "arrow/util/visibility.h" @@ -58,6 +59,8 @@ class ARROW_EXPORT ChunkedArray { const ArrayVector& chunks() const { return chunks_; } + std::shared_ptr<DataType> type() const { return chunks_[0]->type(); } + bool Equals(const ChunkedArray& other) const; bool Equals(const std::shared_ptr<ChunkedArray>& other) const; http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/python/pyarrow/array.pxi ---------------------------------------------------------------------- diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index c7a4415..efbe36f 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -154,10 +154,12 @@ cdef class Array: Returns ------- - pyarrow.array.Array + array : pyarrow.Array or pyarrow.ChunkedArray (if object data + overflowed binary storage) """ cdef: shared_ptr[CArray] out + shared_ptr[CChunkedArray] chunked_out shared_ptr[CDataType] c_type CMemoryPool* pool @@ -178,7 +180,12 @@ cdef class Array: c_type = type.sp_type with nogil: check_status(PandasObjectsToArrow( - pool, values, mask, c_type, &out)) + pool, values, mask, c_type, &chunked_out)) + + if chunked_out.get().num_chunks() > 1: + return pyarrow_wrap_chunked_array(chunked_out) + else: + out = chunked_out.get().chunk(0) else: values, type = maybe_coerce_datetime64( values, obj.dtype, type, timestamps_to_ms=timestamps_to_ms) http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/python/pyarrow/includes/libarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e1fe0c0..edf50ad 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -315,6 +315,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int64_t null_count() int num_chunks() shared_ptr[CArray] chunk(int i) + shared_ptr[CDataType] type() cdef cppclass CColumn" arrow::Column": CColumn(const shared_ptr[CField]& field, @@ -711,7 +712,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CStatus PandasObjectsToArrow(CMemoryPool* pool, object ao, object mo, const shared_ptr[CDataType]& type, - shared_ptr[CArray]* out) + shared_ptr[CChunkedArray]* out) CStatus NdarrayToTensor(CMemoryPool* pool, object ao, shared_ptr[CTensor]* out); http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/python/pyarrow/parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 64cf330..fea7397 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -764,16 +764,18 @@ def write_table(table, where, row_group_size=None, version='1.0', Specify the compression codec, either on a general basis or per-column. """ row_group_size = kwargs.get('chunk_size', row_group_size) - writer = ParquetWriter(where, table.schema, - use_dictionary=use_dictionary, - compression=compression, - version=version, - use_deprecated_int96_timestamps=use_deprecated_int96_timestamps) + options = dict( + use_dictionary=use_dictionary, + compression=compression, + version=version, + use_deprecated_int96_timestamps=use_deprecated_int96_timestamps) + writer = ParquetWriter(where, table.schema, **options) writer.write_table(table, row_group_size=row_group_size) writer.close() -def write_metadata(schema, where, version='1.0', use_deprecated_int96_timestamps=False): +def write_metadata(schema, where, version='1.0', + use_deprecated_int96_timestamps=False): """ Write metadata-only Parquet file from schema @@ -784,6 +786,9 @@ def write_metadata(schema, where, version='1.0', use_deprecated_int96_timestamps version : {"1.0", "2.0"}, default "1.0" The Parquet format version, defaults to 1.0 """ - writer = ParquetWriter(where, schema, version=version, - use_deprecated_int96_timestamps=use_deprecated_int96_timestamps) + options = dict( + version=version, + use_deprecated_int96_timestamps=use_deprecated_int96_timestamps + ) + writer = ParquetWriter(where, schema, **options) writer.close() http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/python/pyarrow/public-api.pxi ---------------------------------------------------------------------- diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 8c2083d..28e07ff 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -148,6 +148,21 @@ cdef public api object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array): return arr +cdef public api object pyarrow_wrap_chunked_array( + const shared_ptr[CChunkedArray]& sp_array): + if sp_array.get() == NULL: + raise ValueError('ChunkedArray was NULL') + + cdef CDataType* data_type = sp_array.get().type().get() + + if data_type == NULL: + raise ValueError('ChunkedArray data type was NULL') + + cdef ChunkedArray arr = ChunkedArray() + arr.init(sp_array) + return arr + + cdef public api bint pyarrow_is_tensor(object tensor): return isinstance(tensor, Tensor) http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/python/pyarrow/table.pxi ---------------------------------------------------------------------- diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index d7a6060..6188e90 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -44,6 +44,11 @@ cdef class ChunkedArray: self.sp_chunked_array = chunked_array self.chunked_array = chunked_array.get() + property type: + + def __get__(self): + return pyarrow_wrap_data_type(self.sp_chunked_array.get().type()) + cdef int _check_nullptr(self) except -1: if self.chunked_array == NULL: raise ReferenceError( @@ -277,7 +282,6 @@ cdef shared_ptr[const CKeyValueMetadata] unbox_metadata(dict metadata): cdef int _schema_from_arrays( arrays, names, dict metadata, shared_ptr[CSchema]* schema) except -1: cdef: - Array arr Column col c_string c_name vector[shared_ptr[CField]] fields @@ -289,23 +293,25 @@ cdef int _schema_from_arrays( if len(arrays) == 0: raise ValueError('Must pass at least one array') - if isinstance(arrays[0], Array): - if names is None: - raise ValueError('Must pass names when constructing ' - 'from Array objects') - for i in range(K): - arr = arrays[i] - type_ = arr.type.sp_type - c_name = tobytes(names[i]) - fields[i].reset(new CField(c_name, type_, True)) - elif isinstance(arrays[0], Column): + if isinstance(arrays[0], Column): for i in range(K): col = arrays[i] type_ = col.sp_column.get().type() c_name = tobytes(col.name) fields[i].reset(new CField(c_name, type_, True)) else: - raise TypeError(type(arrays[0])) + if names is None: + raise ValueError('Must pass names when constructing ' + 'from Array objects') + for i in range(K): + val = arrays[i] + if isinstance(val, (Array, ChunkedArray)): + type_ = (<DataType> val.type).sp_type + else: + raise TypeError(type(val)) + + c_name = tobytes(names[i]) + fields[i].reset(new CField(c_name, type_, True)) schema.reset(new CSchema(fields, unbox_metadata(metadata))) return 0 @@ -323,7 +329,6 @@ cdef tuple _dataframe_to_arrays( list index_columns = [] list types = [] DataType type = None - Array array dict metadata Py_ssize_t i Py_ssize_t n @@ -782,6 +787,13 @@ cdef class Table: (<Array> arrays[i]).sp_array ) ) + elif isinstance(arrays[i], ChunkedArray): + columns.push_back( + make_shared[CColumn]( + schema.get().field(i), + (<ChunkedArray> arrays[i]).sp_chunked_array + ) + ) elif isinstance(arrays[i], Column): columns.push_back((<Column> arrays[i]).sp_column) else: http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/python/pyarrow/tests/conftest.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index da94da9..2aeeab7 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -18,11 +18,12 @@ from pytest import skip -groups = ['hdfs', 'parquet'] +groups = ['hdfs', 'parquet', 'large_memory'] defaults = { 'hdfs': False, - 'parquet': False + 'parquet': False, + 'large_memory': False } try: http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 5b84817..43e0bad 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -284,6 +284,20 @@ class TestPandasConversion(unittest.TestCase): expected = pd.DataFrame({'strings': values2}) self._check_pandas_roundtrip(df, expected) + @pytest.mark.large_memory + def test_bytes_exceed_2gb(self): + val = 'x' * (1 << 20) + df = pd.DataFrame({ + 'strings': np.array([val] * 4000, dtype=object) + }) + arr = pa.Array.from_pandas(df['strings']) + assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 2 + arr = None + + table = pa.Table.from_pandas(df) + assert table[0].data.num_chunks == 2 + def test_fixed_size_bytes(self): values = [b'foo', None, b'bar', None, None, b'hey'] df = pd.DataFrame({'strings': values})