Repository: arrow Updated Branches: refs/heads/master 754bcce68 -> 137aade40
ARROW-722: [Python] Support additional date/time types and metadata, conversion to/from NumPy and pandas.DataFrame Would appreciate a close look from @xhochy, @cpcloud. Also did some inline visitor cleaning for nicer code reuse Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #510 from wesm/ARROW-722 and squashes the following commits: 3e1fda3 [Wes McKinney] cpplint cb32a6b [Wes McKinney] Nicer error message. Run clang-format 854f470 [Wes McKinney] First cut refactor 06dce15 [Wes McKinney] Rebase conflicts d1dc342 [Wes McKinney] Bring Python bindings to date/time types up to spec. Handle zero-copy creation from same-size int32/64. Use inline visitor in PandasConverter Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/137aade4 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/137aade4 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/137aade4 Branch: refs/heads/master Commit: 137aade404cf53a7dbe0eaa31a868d1c376654b3 Parents: 754bcce Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Sun Apr 9 20:00:30 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Sun Apr 9 20:00:30 2017 -0400 ---------------------------------------------------------------------- cpp/CMakeLists.txt | 6 + cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/python/builtin_convert.cc | 29 +- cpp/src/arrow/python/builtin_convert.h | 4 + cpp/src/arrow/python/pandas_convert.cc | 345 +++++++++++------------ cpp/src/arrow/python/pandas_convert.h | 3 - cpp/src/arrow/python/type_traits.h | 63 +++-- cpp/src/arrow/python/util/datetime.h | 6 +- cpp/src/arrow/table-test.cc | 55 +--- cpp/src/arrow/table.cc | 10 +- cpp/src/arrow/table.h | 4 +- cpp/src/arrow/type.cc | 4 +- cpp/src/arrow/type.h | 4 +- cpp/src/arrow/util/stl.h | 4 +- python/pyarrow/scalar.pyx | 6 +- python/pyarrow/tests/test_convert_pandas.py | 182 +++++++----- 16 files changed, 399 insertions(+), 327 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5852fe5..b29cb7b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -386,6 +386,12 @@ enable_testing() set(Boost_DEBUG TRUE) set(Boost_USE_MULTITHREADED ON) +set(Boost_ADDITIONAL_VERSIONS + "1.63.0" "1.63" + "1.62.0" "1.61" + "1.61.0" "1.62" + "1.60.0" "1.60") + if (ARROW_BOOST_USE_SHARED) # Find shared Boost libraries. set(Boost_USE_STATIC_LIBS OFF) http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 8eaa76a..cb5282c 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -34,6 +34,7 @@ install(FILES type_traits.h test-util.h visitor.h + visitor_inline.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow") # pkg-config support http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/python/builtin_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 189ecee..a064a3d 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -43,6 +43,31 @@ static inline bool IsPyInteger(PyObject* obj) { #endif } +Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) { + OwnedRef type(PyObject_Type(obj)); + RETURN_IF_PYERROR(); + DCHECK_NE(type.obj(), nullptr); + + OwnedRef type_name(PyObject_GetAttrString(type.obj(), "__name__")); + RETURN_IF_PYERROR(); + DCHECK_NE(type_name.obj(), nullptr); + + PyObjectStringify bytestring(type_name.obj()); + RETURN_IF_PYERROR(); + + const char* bytes = bytestring.bytes; + DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null"; + + Py_ssize_t size = bytestring.size; + + std::string cpp_type_name(bytes, size); + + std::stringstream ss; + ss << "Python object of type " << cpp_type_name << " is not None and is not a " + << expected_type_name << " object"; + return Status::Invalid(ss.str()); +} + class ScalarVisitor { public: ScalarVisitor() @@ -397,7 +422,7 @@ class BytesConverter : public TypedConverter<BinaryBuilder> { } else if (PyBytes_Check(item)) { bytes_obj = item; } else { - return Status::Invalid("Value that cannot be converted to bytes was encountered"); + return InvalidConversion(item, "bytes"); } // No error checking length = PyBytes_GET_SIZE(bytes_obj); @@ -431,7 +456,7 @@ class FixedWidthBytesConverter : public TypedConverter<FixedSizeBinaryBuilder> { } else if (PyBytes_Check(item)) { bytes_obj = item; } else { - return Status::Invalid("Value that cannot be converted to bytes was encountered"); + return InvalidConversion(item, "bytes"); } // No error checking RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length)); http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/python/builtin_convert.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h index 3c2e350..2141c25 100644 --- a/cpp/src/arrow/python/builtin_convert.h +++ b/cpp/src/arrow/python/builtin_convert.h @@ -24,6 +24,7 @@ #include <Python.h> #include <memory> +#include <string> #include "arrow/type.h" @@ -60,6 +61,9 @@ ARROW_EXPORT Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out, const std::shared_ptr<DataType>& type, int64_t size); +ARROW_EXPORT +Status InvalidConversion(PyObject* obj, const std::string& expected_type_name); + ARROW_EXPORT Status CheckPythonBytesAreFixedLength( PyObject* obj, Py_ssize_t expected_length); http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/python/pandas_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index f6e627e..5bb8e45 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -44,6 +44,7 @@ #include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/visitor_inline.h" #include "arrow/python/builtin_convert.h" #include "arrow/python/common.h" @@ -271,7 +272,7 @@ static inline bool ListTypeSupported(const Type::type type_id) { // ---------------------------------------------------------------------- // Conversion from NumPy-in-Pandas to Arrow -class PandasConverter : public TypeVisitor { +class PandasConverter { public: PandasConverter( MemoryPool* pool, PyObject* ao, PyObject* mo, const std::shared_ptr<DataType>& type) @@ -332,23 +333,37 @@ class PandasConverter : public TypeVisitor { return LoadArray(type_, fields, {null_bitmap_, data}, &out_); } -#define VISIT_NATIVE(TYPE) \ - Status Visit(const TYPE& type) override { return VisitNative<TYPE>(); } + template <typename T> + typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value || + std::is_same<BooleanType, T>::value, + Status>::type + Visit(const T& type) { + return VisitNative<T>(); + } + + Status Visit(const Date32Type& type) { return VisitNative<Int32Type>(); } + Status Visit(const Date64Type& type) { return VisitNative<Int64Type>(); } + Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); } + Status Visit(const Time32Type& type) { return VisitNative<Int32Type>(); } + Status Visit(const Time64Type& type) { return VisitNative<Int64Type>(); } + + Status Visit(const NullType& type) { return Status::NotImplemented("null"); } + + Status Visit(const BinaryType& type) { return Status::NotImplemented(type.ToString()); } + + Status Visit(const FixedSizeBinaryType& type) { + return Status::NotImplemented(type.ToString()); + } - VISIT_NATIVE(BooleanType); - VISIT_NATIVE(Int8Type); - VISIT_NATIVE(Int16Type); - VISIT_NATIVE(Int32Type); - VISIT_NATIVE(Int64Type); - VISIT_NATIVE(UInt8Type); - VISIT_NATIVE(UInt16Type); - VISIT_NATIVE(UInt32Type); - VISIT_NATIVE(UInt64Type); - VISIT_NATIVE(FloatType); - VISIT_NATIVE(DoubleType); - VISIT_NATIVE(TimestampType); + Status Visit(const DecimalType& type) { + return Status::NotImplemented(type.ToString()); + } -#undef VISIT_NATIVE + Status Visit(const DictionaryType& type) { + return Status::NotImplemented(type.ToString()); + } + + Status Visit(const NestedType& type) { return Status::NotImplemented(type.ToString()); } Status Convert() { if (PyArray_NDIM(arr_) != 1) { @@ -358,9 +373,7 @@ class PandasConverter : public TypeVisitor { if (type_ == nullptr) { return Status::Invalid("Must pass data type"); } // Visit the type to perform conversion - RETURN_NOT_OK(type_->Accept(this)); - - return Status::OK(); + return VisitTypeInline(*type_, this); } std::shared_ptr<Array> result() const { return out_; } @@ -371,10 +384,12 @@ class PandasConverter : public TypeVisitor { template <int ITEM_TYPE, typename ArrowType> Status ConvertTypedLists(const std::shared_ptr<DataType>& type); + template <typename ArrowType> + Status ConvertDates(); + Status ConvertObjectStrings(); Status ConvertObjectFixedWidthBytes(const std::shared_ptr<DataType>& type); Status ConvertBooleans(); - Status ConvertDates(); Status ConvertLists(const std::shared_ptr<DataType>& type); Status ConvertObjects(); Status ConvertDecimals(); @@ -462,41 +477,36 @@ inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* return Status::OK(); } -Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) { - OwnedRef type(PyObject_Type(obj)); - RETURN_IF_PYERROR(); - DCHECK_NE(type.obj(), nullptr); - - OwnedRef type_name(PyObject_GetAttrString(type.obj(), "__name__")); - RETURN_IF_PYERROR(); - DCHECK_NE(type_name.obj(), nullptr); - - PyObjectStringify bytestring(type_name.obj()); - RETURN_IF_PYERROR(); - - const char* bytes = bytestring.bytes; - DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null"; - - Py_ssize_t size = bytestring.size; +template <typename T> +struct UnboxDate {}; - std::string cpp_type_name(bytes, size); +template <> +struct UnboxDate<Date32Type> { + static int64_t Unbox(PyObject* obj) { + return PyDate_to_days(reinterpret_cast<PyDateTime_Date*>(obj)); + } +}; - std::stringstream ss; - ss << "Python object of type " << cpp_type_name << " is not None and is not a " - << expected_type_name << " object"; - return Status::Invalid(ss.str()); -} +template <> +struct UnboxDate<Date64Type> { + static int64_t Unbox(PyObject* obj) { + return PyDate_to_ms(reinterpret_cast<PyDateTime_Date*>(obj)); + } +}; +template <typename ArrowType> Status PandasConverter::ConvertDates() { PyAcquireGIL lock; + using BuilderType = typename TypeTraits<ArrowType>::BuilderType; + Ndarray1DIndexer<PyObject*> objects(arr_); if (mask_ != nullptr) { return Status::NotImplemented("mask not supported in object conversions yet"); } - Date64Builder date_builder(pool_); + BuilderType date_builder(pool_); RETURN_NOT_OK(date_builder.Resize(length_)); /// We have to run this in this compilation unit, since we cannot use the @@ -508,8 +518,7 @@ Status PandasConverter::ConvertDates() { for (int64_t i = 0; i < length_; ++i) { obj = objects[i]; if (PyDate_CheckExact(obj)) { - PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(obj); - date_builder.Append(PyDate_to_ms(pydate)); + date_builder.Append(UnboxDate<ArrowType>::Unbox(obj)); } else if (PyObject_is_null(obj)) { date_builder.AppendNull(); } else { @@ -762,8 +771,10 @@ Status PandasConverter::ConvertObjects() { return ConvertObjectFixedWidthBytes(type_); case Type::BOOL: return ConvertBooleans(); + case Type::DATE32: + return ConvertDates<Date32Type>(); case Type::DATE64: - return ConvertDates(); + return ConvertDates<Date64Type>(); case Type::LIST: { const auto& list_field = static_cast<const ListType&>(*type_); return ConvertLists(list_field.value_field()->type); @@ -787,7 +798,8 @@ Status PandasConverter::ConvertObjects() { } else if (PyBool_Check(objects[i])) { return ConvertBooleans(); } else if (PyDate_CheckExact(objects[i])) { - return ConvertDates(); + // We could choose Date32 or Date64 + return ConvertDates<Date32Type>(); } else if (PyObject_IsInstance(const_cast<PyObject*>(objects[i]), Decimal.obj())) { return ConvertDecimals(); } else { @@ -955,34 +967,6 @@ Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, // ---------------------------------------------------------------------- // pandas 0.x DataFrame conversion internals -inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) { - if (type == NPY_DATETIME) { - PyArray_Descr* descr = PyArray_DESCR(out); - auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata); - if (datatype->type == Type::TIMESTAMP) { - auto timestamp_type = static_cast<TimestampType*>(datatype); - - switch (timestamp_type->unit) { - case TimestampType::Unit::SECOND: - date_dtype->meta.base = NPY_FR_s; - break; - case TimestampType::Unit::MILLI: - date_dtype->meta.base = NPY_FR_ms; - break; - case TimestampType::Unit::MICRO: - date_dtype->meta.base = NPY_FR_us; - break; - case TimestampType::Unit::NANO: - date_dtype->meta.base = NPY_FR_ns; - break; - } - } else { - // datatype->type == Type::DATE64 - date_dtype->meta.base = NPY_FR_D; - } - } -} - class PandasBlock { public: enum type { @@ -1148,8 +1132,9 @@ static void ConvertBooleanNoNulls(const ChunkedArray& data, uint8_t* out_values) } } -template <typename ArrayType> +template <typename Type> inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) { + using ArrayType = typename TypeTraits<Type>::ArrayType; PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { auto arr = static_cast<ArrayType*>(data.chunk(c).get()); @@ -1287,21 +1272,7 @@ inline void ConvertNumericNullableCast( } } -template <typename T> -inline void ConvertDates(const ChunkedArray& data, T na_value, T* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr<Array> arr = data.chunk(c); - auto prim_arr = static_cast<PrimitiveArray*>(arr.get()); - auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data()); - - for (int64_t i = 0; i < arr->length(); ++i) { - // There are 1000 * 60 * 60 * 24 = 86400000ms in a day - *out_values++ = arr->IsNull(i) ? na_value : in_values[i] / 86400000; - } - } -} - -template <typename InType, int SHIFT> +template <typename InType, int64_t SHIFT> inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const std::shared_ptr<Array> arr = data.chunk(c); @@ -1339,9 +1310,9 @@ class ObjectBlock : public PandasBlock { if (type == Type::BOOL) { RETURN_NOT_OK(ConvertBooleanWithNulls(data, out_buffer)); } else if (type == Type::BINARY) { - RETURN_NOT_OK(ConvertBinaryLike<BinaryArray>(data, out_buffer)); + RETURN_NOT_OK(ConvertBinaryLike<BinaryType>(data, out_buffer)); } else if (type == Type::STRING) { - RETURN_NOT_OK(ConvertBinaryLike<StringArray>(data, out_buffer)); + RETURN_NOT_OK(ConvertBinaryLike<StringType>(data, out_buffer)); } else if (type == Type::FIXED_SIZE_BINARY) { RETURN_NOT_OK(ConvertFixedSizeBinary(data, out_buffer)); } else if (type == Type::DECIMAL) { @@ -1532,7 +1503,11 @@ class DatetimeBlock : public PandasBlock { const ChunkedArray& data = *col.get()->data(); - if (type == Type::DATE64) { + if (type == Type::DATE32) { + // Date64Type is millisecond timestamp stored as int64_t + // TODO(wesm): Do we want to make sure to zero out the milliseconds? + ConvertDatetimeNanos<int32_t, kNanosecondsInDay>(data, out_buffer); + } else if (type == Type::DATE64) { // Date64Type is millisecond timestamp stored as int64_t // TODO(wesm): Do we want to make sure to zero out the milliseconds? ConvertDatetimeNanos<int64_t, 1000000L>(data, out_buffer); @@ -1779,6 +1754,9 @@ class DataFrameBlockCreator { case Type::FIXED_SIZE_BINARY: output_type = PandasBlock::OBJECT; break; + case Type::DATE32: + output_type = PandasBlock::DATETIME; + break; case Type::DATE64: output_type = PandasBlock::DATETIME; break; @@ -1960,6 +1938,34 @@ class DataFrameBlockCreator { BlockMap datetimetz_blocks_; }; +inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) { + if (type == NPY_DATETIME) { + PyArray_Descr* descr = PyArray_DESCR(out); + auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata); + if (datatype->type == Type::TIMESTAMP) { + auto timestamp_type = static_cast<TimestampType*>(datatype); + + switch (timestamp_type->unit) { + case TimestampType::Unit::SECOND: + date_dtype->meta.base = NPY_FR_s; + break; + case TimestampType::Unit::MILLI: + date_dtype->meta.base = NPY_FR_ms; + break; + case TimestampType::Unit::MICRO: + date_dtype->meta.base = NPY_FR_us; + break; + case TimestampType::Unit::NANO: + date_dtype->meta.base = NPY_FR_ns; + break; + } + } else { + // datatype->type == Type::DATE64 + date_dtype->meta.base = NPY_FR_D; + } + } +} + class ArrowDeserializer { public: ArrowDeserializer(const std::shared_ptr<Column>& col, PyObject* py_ref) @@ -2024,51 +2030,14 @@ class ArrowDeserializer { // Allocate new array and deserialize. Can do a zero copy conversion for some // types - Status Convert(PyObject** out) { -#define CONVERT_CASE(TYPE) \ - case Type::TYPE: { \ - RETURN_NOT_OK(ConvertValues<Type::TYPE>()); \ - } break; - - switch (col_->type()->type) { - CONVERT_CASE(BOOL); - CONVERT_CASE(INT8); - CONVERT_CASE(INT16); - CONVERT_CASE(INT32); - CONVERT_CASE(INT64); - CONVERT_CASE(UINT8); - CONVERT_CASE(UINT16); - CONVERT_CASE(UINT32); - CONVERT_CASE(UINT64); - CONVERT_CASE(FLOAT); - CONVERT_CASE(DOUBLE); - CONVERT_CASE(BINARY); - CONVERT_CASE(STRING); - CONVERT_CASE(FIXED_SIZE_BINARY); - CONVERT_CASE(DATE64); - CONVERT_CASE(TIMESTAMP); - CONVERT_CASE(DICTIONARY); - CONVERT_CASE(LIST); - CONVERT_CASE(DECIMAL); - default: { - std::stringstream ss; - ss << "Arrow type reading not implemented for " << col_->type()->ToString(); - return Status::NotImplemented(ss.str()); - } - } - -#undef CONVERT_CASE - - *out = result_; - return Status::OK(); - } + template <typename Type> + typename std::enable_if<std::is_base_of<FloatingPoint, Type>::value, Status>::type + Visit(const Type& type) { + constexpr int TYPE = Type::type_id; + using traits = arrow_traits<TYPE>; - template <int TYPE> - inline typename std::enable_if< - (TYPE != Type::DATE64) & arrow_traits<TYPE>::is_numeric_nullable, Status>::type - ConvertValues() { - typedef typename arrow_traits<TYPE>::T T; - int npy_type = arrow_traits<TYPE>::npy_type; + typedef typename traits::T T; + int npy_type = traits::npy_type; if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) { return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0)); @@ -2076,31 +2045,56 @@ class ArrowDeserializer { RETURN_NOT_OK(AllocateOutput(npy_type)); auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_)); - ConvertNumericNullable<T>(data_, arrow_traits<TYPE>::na_value, out_values); + ConvertNumericNullable<T>(data_, traits::na_value, out_values); return Status::OK(); } - template <int TYPE> - inline typename std::enable_if<TYPE == Type::DATE64, Status>::type ConvertValues() { - typedef typename arrow_traits<TYPE>::T T; + template <typename Type> + typename std::enable_if<std::is_base_of<DateType, Type>::value || + std::is_base_of<TimestampType, Type>::value, + Status>::type + Visit(const Type& type) { + constexpr int TYPE = Type::type_id; + using traits = arrow_traits<TYPE>; + + typedef typename traits::T T; - RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type)); + RETURN_NOT_OK(AllocateOutput(traits::npy_type)); auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_)); - ConvertDates<T>(data_, arrow_traits<TYPE>::na_value, out_values); + + constexpr T na_value = traits::na_value; + constexpr int64_t kShift = traits::npy_shift; + + for (int c = 0; c < data_.num_chunks(); c++) { + const std::shared_ptr<Array> arr = data_.chunk(c); + auto prim_arr = static_cast<PrimitiveArray*>(arr.get()); + auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data()); + + for (int64_t i = 0; i < arr->length(); ++i) { + *out_values++ = arr->IsNull(i) ? na_value : in_values[i] / kShift; + } + } return Status::OK(); } + template <typename Type> + typename std::enable_if<std::is_base_of<TimeType, Type>::value, Status>::type Visit( + const Type& type) { + return Status::NotImplemented("Don't know how to serialize Arrow time type to NumPy"); + } + // Integer specialization - template <int TYPE> - inline - typename std::enable_if<arrow_traits<TYPE>::is_numeric_not_nullable, Status>::type - ConvertValues() { - typedef typename arrow_traits<TYPE>::T T; - int npy_type = arrow_traits<TYPE>::npy_type; + template <typename Type> + typename std::enable_if<std::is_base_of<Integer, Type>::value, Status>::type Visit( + const Type& type) { + constexpr int TYPE = Type::type_id; + using traits = arrow_traits<TYPE>; + + typedef typename traits::T T; if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) { - return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0)); + return ConvertValuesZeroCopy<TYPE>(traits::npy_type, data_.chunk(0)); } if (data_.null_count() > 0) { @@ -2108,7 +2102,7 @@ class ArrowDeserializer { auto out_values = reinterpret_cast<double*>(PyArray_DATA(arr_)); ConvertIntegerWithNulls<T>(data_, out_values); } else { - RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type)); + RETURN_NOT_OK(AllocateOutput(traits::npy_type)); auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_)); ConvertIntegerNoNullsSameType<T>(data_, out_values); } @@ -2117,15 +2111,13 @@ class ArrowDeserializer { } // Boolean specialization - template <int TYPE> - inline typename std::enable_if<arrow_traits<TYPE>::is_boolean, Status>::type - ConvertValues() { + Status Visit(const BooleanType& type) { if (data_.null_count() > 0) { RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); RETURN_NOT_OK(ConvertBooleanWithNulls(data_, out_values)); } else { - RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type)); + RETURN_NOT_OK(AllocateOutput(arrow_traits<Type::BOOL>::npy_type)); auto out_values = reinterpret_cast<uint8_t*>(PyArray_DATA(arr_)); ConvertBooleanNoNulls(data_, out_values); } @@ -2133,43 +2125,32 @@ class ArrowDeserializer { } // UTF8 strings - template <int TYPE> - inline typename std::enable_if<TYPE == Type::STRING, Status>::type ConvertValues() { + template <typename Type> + typename std::enable_if<std::is_base_of<BinaryType, Type>::value, Status>::type Visit( + const Type& type) { RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); - return ConvertBinaryLike<StringArray>(data_, out_values); - } - - // Binary strings - template <int T2> - inline typename std::enable_if<T2 == Type::BINARY, Status>::type ConvertValues() { - RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); - auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); - return ConvertBinaryLike<BinaryArray>(data_, out_values); + return ConvertBinaryLike<Type>(data_, out_values); } // Fixed length binary strings - template <int TYPE> - inline typename std::enable_if<TYPE == Type::FIXED_SIZE_BINARY, Status>::type - ConvertValues() { + Status Visit(const FixedSizeBinaryType& type) { RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); return ConvertFixedSizeBinary(data_, out_values); } - template <int TYPE> - inline typename std::enable_if<TYPE == Type::DECIMAL, Status>::type ConvertValues() { + Status Visit(const DecimalType& type) { RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); return ConvertDecimals(data_, out_values); } + Status Visit(const ListType& type) { #define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \ case Type::ArrowEnum: \ return ConvertListsLike<ArrowType>(col_, out_values); - template <int T2> - inline typename std::enable_if<T2 == Type::LIST, Status>::type ConvertValues() { RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); auto list_type = std::static_pointer_cast<ListType>(col_->type()); @@ -2193,10 +2174,10 @@ class ArrowDeserializer { return Status::NotImplemented(ss.str()); } } +#undef CONVERTVALUES_LISTSLIKE_CASE } - template <int TYPE> - inline typename std::enable_if<TYPE == Type::DICTIONARY, Status>::type ConvertValues() { + Status Visit(const DictionaryType& type) { std::shared_ptr<PandasBlock> block; RETURN_NOT_OK(MakeCategoricalBlock(col_->type(), col_->length(), &block)); RETURN_NOT_OK(block->Write(col_, 0, 0)); @@ -2216,6 +2197,18 @@ class ArrowDeserializer { return Status::OK(); } + Status Visit(const NullType& type) { return Status::NotImplemented("null type"); } + + Status Visit(const StructType& type) { return Status::NotImplemented("struct type"); } + + Status Visit(const UnionType& type) { return Status::NotImplemented("union type"); } + + Status Convert(PyObject** out) { + RETURN_NOT_OK(VisitTypeInline(*col_->type(), this)); + *out = result_; + return Status::OK(); + } + private: std::shared_ptr<Column> col_; const ChunkedArray& data_; http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/python/pandas_convert.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_convert.h b/cpp/src/arrow/python/pandas_convert.h index 8fd3107..4d32c8b 100644 --- a/cpp/src/arrow/python/pandas_convert.h +++ b/cpp/src/arrow/python/pandas_convert.h @@ -71,9 +71,6 @@ ARROW_EXPORT Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out); -ARROW_EXPORT -Status InvalidConversion(PyObject* obj, const std::string& expected_type_name); - } // namespace py } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/python/type_traits.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h index f78dc36..c464d65 100644 --- a/cpp/src/arrow/python/type_traits.h +++ b/cpp/src/arrow/python/type_traits.h @@ -119,9 +119,6 @@ template <> struct arrow_traits<Type::BOOL> { static constexpr int npy_type = NPY_BOOL; static constexpr bool supports_nulls = false; - static constexpr bool is_boolean = true; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = false; }; #define INT_DECL(TYPE) \ @@ -130,9 +127,6 @@ struct arrow_traits<Type::BOOL> { static constexpr int npy_type = NPY_##TYPE; \ static constexpr bool supports_nulls = false; \ static constexpr double na_value = NAN; \ - static constexpr bool is_boolean = false; \ - static constexpr bool is_numeric_not_nullable = true; \ - static constexpr bool is_numeric_nullable = false; \ typedef typename npy_traits<NPY_##TYPE>::value_type T; \ }; @@ -150,9 +144,6 @@ struct arrow_traits<Type::FLOAT> { static constexpr int npy_type = NPY_FLOAT32; static constexpr bool supports_nulls = true; static constexpr float na_value = NAN; - static constexpr bool is_boolean = false; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = true; typedef typename npy_traits<NPY_FLOAT32>::value_type T; }; @@ -161,33 +152,63 @@ struct arrow_traits<Type::DOUBLE> { static constexpr int npy_type = NPY_FLOAT64; static constexpr bool supports_nulls = true; static constexpr double na_value = NAN; - static constexpr bool is_boolean = false; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = true; typedef typename npy_traits<NPY_FLOAT64>::value_type T; }; static constexpr int64_t kPandasTimestampNull = std::numeric_limits<int64_t>::min(); +constexpr int64_t kNanosecondsInDay = 86400000000000LL; + template <> struct arrow_traits<Type::TIMESTAMP> { static constexpr int npy_type = NPY_DATETIME; + static constexpr int64_t npy_shift = 1; + static constexpr bool supports_nulls = true; static constexpr int64_t na_value = kPandasTimestampNull; - static constexpr bool is_boolean = false; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = true; typedef typename npy_traits<NPY_DATETIME>::value_type T; }; template <> +struct arrow_traits<Type::DATE32> { + // Data stores as FR_D day unit + static constexpr int npy_type = NPY_DATETIME; + static constexpr int64_t npy_shift = 1; + + static constexpr bool supports_nulls = true; + typedef typename npy_traits<NPY_DATETIME>::value_type T; + + static constexpr int64_t na_value = kPandasTimestampNull; + static inline bool isnull(int64_t v) { return npy_traits<NPY_DATETIME>::isnull(v); } +}; + +template <> struct arrow_traits<Type::DATE64> { + // Data stores as FR_D day unit static constexpr int npy_type = NPY_DATETIME; + + // There are 1000 * 60 * 60 * 24 = 86400000ms in a day + static constexpr int64_t npy_shift = 86400000; + + static constexpr bool supports_nulls = true; + typedef typename npy_traits<NPY_DATETIME>::value_type T; + + static constexpr int64_t na_value = kPandasTimestampNull; + static inline bool isnull(int64_t v) { return npy_traits<NPY_DATETIME>::isnull(v); } +}; + +template <> +struct arrow_traits<Type::TIME32> { + static constexpr int npy_type = NPY_OBJECT; static constexpr bool supports_nulls = true; static constexpr int64_t na_value = kPandasTimestampNull; - static constexpr bool is_boolean = false; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = true; + typedef typename npy_traits<NPY_DATETIME>::value_type T; +}; + +template <> +struct arrow_traits<Type::TIME64> { + static constexpr int npy_type = NPY_OBJECT; + static constexpr bool supports_nulls = true; typedef typename npy_traits<NPY_DATETIME>::value_type T; }; @@ -195,18 +216,12 @@ template <> struct arrow_traits<Type::STRING> { static constexpr int npy_type = NPY_OBJECT; static constexpr bool supports_nulls = true; - static constexpr bool is_boolean = false; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = false; }; template <> struct arrow_traits<Type::BINARY> { static constexpr int npy_type = NPY_OBJECT; static constexpr bool supports_nulls = true; - static constexpr bool is_boolean = false; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = false; }; } // namespace py http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/python/util/datetime.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h index f704a96..82cf6fc 100644 --- a/cpp/src/arrow/python/util/datetime.h +++ b/cpp/src/arrow/python/util/datetime.h @@ -24,7 +24,7 @@ namespace arrow { namespace py { -inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { +static inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { struct tm date = {0}; date.tm_year = PyDateTime_GET_YEAR(pydate) - 1900; date.tm_mon = PyDateTime_GET_MONTH(pydate) - 1; @@ -36,6 +36,10 @@ inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { return lrint(difftime(mktime(&date), mktime(&epoch)) * 1000); } +static inline int32_t PyDate_to_days(PyDateTime_Date* pydate) { + return static_cast<int32_t>(PyDate_to_ms(pydate) / 86400000LL); +} + } // namespace py } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/table-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index 156c3d1..cdc0238 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -398,62 +398,35 @@ TEST_F(TestTable, AddColumn) { ASSERT_TRUE(status.IsInvalid()); // Add column with wrong length - auto longer_col = std::make_shared<Column>( - schema_->field(0), MakePrimitive<Int32Array>(length + 1)); + auto longer_col = + std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length + 1)); status = table.AddColumn(0, longer_col, &result); ASSERT_TRUE(status.IsInvalid()); // Add column 0 in different places ASSERT_OK(table.AddColumn(0, columns_[0], &result)); - auto ex_schema = std::shared_ptr<Schema>(new Schema({ - schema_->field(0), - schema_->field(0), - schema_->field(1), - schema_->field(2)})); + auto ex_schema = std::shared_ptr<Schema>(new Schema( + {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)})); std::vector<std::shared_ptr<Column>> ex_columns = { - table.column(0), - table.column(0), - table.column(1), - table.column(2)}; + table.column(0), table.column(0), table.column(1), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.AddColumn(1, columns_[0], &result)); - ex_schema = std::shared_ptr<Schema>(new Schema({ - schema_->field(0), - schema_->field(0), - schema_->field(1), - schema_->field(2)})); - ex_columns = { - table.column(0), - table.column(0), - table.column(1), - table.column(2)}; + ex_schema = std::shared_ptr<Schema>(new Schema( + {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)})); + ex_columns = {table.column(0), table.column(0), table.column(1), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.AddColumn(2, columns_[0], &result)); - ex_schema = std::shared_ptr<Schema>(new Schema({ - schema_->field(0), - schema_->field(1), - schema_->field(0), - schema_->field(2)})); - ex_columns = { - table.column(0), - table.column(1), - table.column(0), - table.column(2)}; + ex_schema = std::shared_ptr<Schema>(new Schema( + {schema_->field(0), schema_->field(1), schema_->field(0), schema_->field(2)})); + ex_columns = {table.column(0), table.column(1), table.column(0), table.column(2)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); ASSERT_OK(table.AddColumn(3, columns_[0], &result)); - ex_schema = std::shared_ptr<Schema>(new Schema({ - schema_->field(0), - schema_->field(1), - schema_->field(2), - schema_->field(0)})); - ex_columns = { - table.column(0), - table.column(1), - table.column(2), - table.column(0)}; + ex_schema = std::shared_ptr<Schema>(new Schema( + {schema_->field(0), schema_->field(1), schema_->field(2), schema_->field(0)})); + ex_columns = {table.column(0), table.column(1), table.column(2), table.column(0)}; ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); } http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/table.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 9b39f77..4c5257b 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -321,11 +321,9 @@ Status Table::RemoveColumn(int i, std::shared_ptr<Table>* out) const { return Status::OK(); } -Status Table::AddColumn(int i, const std::shared_ptr<Column>& col, - std::shared_ptr<Table>* out) const { - if (i < 0 || i > num_columns() + 1) { - return Status::Invalid("Invalid column index."); - } +Status Table::AddColumn( + int i, const std::shared_ptr<Column>& col, std::shared_ptr<Table>* out) const { + if (i < 0 || i > num_columns() + 1) { return Status::Invalid("Invalid column index."); } if (col == nullptr) { std::stringstream ss; ss << "Column " << i << " was null"; @@ -334,7 +332,7 @@ Status Table::AddColumn(int i, const std::shared_ptr<Column>& col, if (col->length() != num_rows_) { std::stringstream ss; ss << "Added column's length must match table's length. Expected length " << num_rows_ - << " but got length " << col->length(); + << " but got length " << col->length(); return Status::Invalid(ss.str()); } http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/table.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index dcea53d..b15d31b 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -182,8 +182,8 @@ class ARROW_EXPORT Table { Status RemoveColumn(int i, std::shared_ptr<Table>* out) const; /// Add column to the table, producing a new Table - Status AddColumn(int i, const std::shared_ptr<Column>& column, - std::shared_ptr<Table>* out) const; + Status AddColumn( + int i, const std::shared_ptr<Column>& column, std::shared_ptr<Table>* out) const; // @returns: the number of columns in the table int num_columns() const { return static_cast<int>(columns_.size()); } http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/type.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index df4590f..93cab14 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -258,8 +258,8 @@ std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) { } } -Status Schema::AddField(int i, const std::shared_ptr<Field>& field, - std::shared_ptr<Schema>* out) const { +Status Schema::AddField( + int i, const std::shared_ptr<Field>& field, std::shared_ptr<Schema>* out) const { DCHECK_GE(i, 0); DCHECK_LE(i, this->num_fields()); http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/type.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 3a35f56..730cbed 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -633,8 +633,8 @@ class ARROW_EXPORT Schema { // Render a string representation of the schema suitable for debugging std::string ToString() const; - Status AddField(int i, const std::shared_ptr<Field>& field, - std::shared_ptr<Schema>* out) const; + Status AddField( + int i, const std::shared_ptr<Field>& field, std::shared_ptr<Schema>* out) const; Status RemoveField(int i, std::shared_ptr<Schema>* out) const; int num_fields() const { return static_cast<int>(fields_.size()); } http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/util/stl.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/stl.h b/cpp/src/arrow/util/stl.h index bd25053..bfce111 100644 --- a/cpp/src/arrow/util/stl.h +++ b/cpp/src/arrow/util/stl.h @@ -40,8 +40,8 @@ inline std::vector<T> DeleteVectorElement(const std::vector<T>& values, size_t i } template <typename T> -inline std::vector<T> AddVectorElement(const std::vector<T>& values, size_t index, - const T& new_element) { +inline std::vector<T> AddVectorElement( + const std::vector<T>& values, size_t index, const T& new_element) { DCHECK_LE(index, values.size()); std::vector<T> out; out.reserve(values.size() + 1); http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/python/pyarrow/scalar.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx index f3d9321..196deed 100644 --- a/python/pyarrow/scalar.pyx +++ b/python/pyarrow/scalar.pyx @@ -134,7 +134,11 @@ cdef class UInt64Value(ArrayValue): cdef class Date32Value(ArrayValue): def as_py(self): - raise NotImplementedError + cdef CDate32Array* ap = <CDate32Array*> self.sp_array.get() + + # Shift to seconds since epoch + return datetime.datetime.utcfromtimestamp( + int(ap.Value(self.index)) * 86400).date() cdef class Date64Value(ArrayValue): http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 0504e1d..d1bea0b 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -28,7 +28,7 @@ import pandas as pd import pandas.util.testing as tm from pyarrow.compat import u -import pyarrow as A +import pyarrow as pa from .pandas_examples import dataframe_with_arrays, dataframe_with_lists @@ -67,7 +67,7 @@ class TestPandasConversion(unittest.TestCase): def _check_pandas_roundtrip(self, df, expected=None, nthreads=1, timestamps_to_ms=False, expected_schema=None, check_dtype=True, schema=None): - table = A.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms, + table = pa.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms, schema=schema) result = table.to_pandas(nthreads=nthreads) if expected_schema: @@ -78,7 +78,7 @@ class TestPandasConversion(unittest.TestCase): def _check_array_roundtrip(self, values, expected=None, mask=None, timestamps_to_ms=False, type=None): - arr = A.Array.from_numpy(values, timestamps_to_ms=timestamps_to_ms, + arr = pa.Array.from_numpy(values, timestamps_to_ms=timestamps_to_ms, mask=mask, type=type) result = arr.to_pandas() @@ -99,23 +99,23 @@ class TestPandasConversion(unittest.TestCase): def test_float_no_nulls(self): data = {} fields = [] - dtypes = [('f4', A.float32()), ('f8', A.float64())] + dtypes = [('f4', pa.float32()), ('f8', pa.float64())] num_values = 100 for numpy_dtype, arrow_dtype in dtypes: values = np.random.randn(num_values) data[numpy_dtype] = values.astype(numpy_dtype) - fields.append(A.Field.from_py(numpy_dtype, arrow_dtype)) + fields.append(pa.Field.from_py(numpy_dtype, arrow_dtype)) df = pd.DataFrame(data) - schema = A.Schema.from_fields(fields) + schema = pa.Schema.from_fields(fields) self._check_pandas_roundtrip(df, expected_schema=schema) def test_float_nulls(self): num_values = 100 null_mask = np.random.randint(0, 10, size=num_values) < 3 - dtypes = [('f4', A.float32()), ('f8', A.float64())] + dtypes = [('f4', pa.float32()), ('f8', pa.float64())] names = ['f4', 'f8'] expected_cols = [] @@ -124,9 +124,9 @@ class TestPandasConversion(unittest.TestCase): for name, arrow_dtype in dtypes: values = np.random.randn(num_values).astype(name) - arr = A.Array.from_numpy(values, null_mask) + arr = pa.Array.from_numpy(values, null_mask) arrays.append(arr) - fields.append(A.Field.from_py(name, arrow_dtype)) + fields.append(pa.Field.from_py(name, arrow_dtype)) values[null_mask] = np.nan expected_cols.append(values) @@ -134,8 +134,8 @@ class TestPandasConversion(unittest.TestCase): ex_frame = pd.DataFrame(dict(zip(names, expected_cols)), columns=names) - table = A.Table.from_arrays(arrays, names) - assert table.schema.equals(A.Schema.from_fields(fields)) + table = pa.Table.from_arrays(arrays, names) + assert table.schema.equals(pa.Schema.from_fields(fields)) result = table.to_pandas() tm.assert_frame_equal(result, ex_frame) @@ -144,11 +144,11 @@ class TestPandasConversion(unittest.TestCase): fields = [] numpy_dtypes = [ - ('i1', A.int8()), ('i2', A.int16()), - ('i4', A.int32()), ('i8', A.int64()), - ('u1', A.uint8()), ('u2', A.uint16()), - ('u4', A.uint32()), ('u8', A.uint64()), - ('longlong', A.int64()), ('ulonglong', A.uint64()) + ('i1', pa.int8()), ('i2', pa.int16()), + ('i4', pa.int32()), ('i8', pa.int64()), + ('u1', pa.uint8()), ('u2', pa.uint16()), + ('u4', pa.uint32()), ('u8', pa.uint64()), + ('longlong', pa.int64()), ('ulonglong', pa.uint64()) ] num_values = 100 @@ -158,10 +158,10 @@ class TestPandasConversion(unittest.TestCase): min(info.max, np.iinfo('i8').max), size=num_values) data[dtype] = values.astype(dtype) - fields.append(A.Field.from_py(dtype, arrow_dtype)) + fields.append(pa.Field.from_py(dtype, arrow_dtype)) df = pd.DataFrame(data) - schema = A.Schema.from_fields(fields) + schema = pa.Schema.from_fields(fields) self._check_pandas_roundtrip(df, expected_schema=schema) def test_integer_with_nulls(self): @@ -177,7 +177,7 @@ class TestPandasConversion(unittest.TestCase): for name in int_dtypes: values = np.random.randint(0, 100, size=num_values) - arr = A.Array.from_numpy(values, null_mask) + arr = pa.Array.from_numpy(values, null_mask) arrays.append(arr) expected = values.astype('f8') @@ -188,7 +188,7 @@ class TestPandasConversion(unittest.TestCase): ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)), columns=int_dtypes) - table = A.Table.from_arrays(arrays, int_dtypes) + table = pa.Table.from_arrays(arrays, int_dtypes) result = table.to_pandas() tm.assert_frame_equal(result, ex_frame) @@ -199,8 +199,8 @@ class TestPandasConversion(unittest.TestCase): np.random.seed(0) df = pd.DataFrame({'bools': np.random.randn(num_values) > 0}) - field = A.Field.from_py('bools', A.bool_()) - schema = A.Schema.from_fields([field]) + field = pa.Field.from_py('bools', pa.bool_()) + schema = pa.Schema.from_fields([field]) self._check_pandas_roundtrip(df, expected_schema=schema) def test_boolean_nulls(self): @@ -211,16 +211,16 @@ class TestPandasConversion(unittest.TestCase): mask = np.random.randint(0, 10, size=num_values) < 3 values = np.random.randint(0, 10, size=num_values) < 5 - arr = A.Array.from_numpy(values, mask) + arr = pa.Array.from_numpy(values, mask) expected = values.astype(object) expected[mask] = None - field = A.Field.from_py('bools', A.bool_()) - schema = A.Schema.from_fields([field]) + field = pa.Field.from_py('bools', pa.bool_()) + schema = pa.Schema.from_fields([field]) ex_frame = pd.DataFrame({'bools': expected}) - table = A.Table.from_arrays([arr], ['bools']) + table = pa.Table.from_arrays([arr], ['bools']) assert table.schema.equals(schema) result = table.to_pandas() @@ -229,16 +229,16 @@ class TestPandasConversion(unittest.TestCase): def test_boolean_object_nulls(self): arr = np.array([False, None, True] * 100, dtype=object) df = pd.DataFrame({'bools': arr}) - field = A.Field.from_py('bools', A.bool_()) - schema = A.Schema.from_fields([field]) + field = pa.Field.from_py('bools', pa.bool_()) + schema = pa.Schema.from_fields([field]) self._check_pandas_roundtrip(df, expected_schema=schema) def test_unicode(self): repeats = 1000 values = [u'foo', None, u'bar', u'mañana', np.nan] df = pd.DataFrame({'strings': values * repeats}) - field = A.Field.from_py('strings', A.string()) - schema = A.Schema.from_fields([field]) + field = pa.Field.from_py('strings', pa.string()) + schema = pa.Schema.from_fields([field]) self._check_pandas_roundtrip(df, expected_schema=schema) @@ -246,8 +246,8 @@ class TestPandasConversion(unittest.TestCase): values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan] df = pd.DataFrame({'strings': values}) - table = A.Table.from_pandas(df) - assert table[0].type == A.binary() + table = pa.Table.from_pandas(df) + assert table[0].type == pa.binary() values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan] expected = pd.DataFrame({'strings': values2}) @@ -256,8 +256,8 @@ class TestPandasConversion(unittest.TestCase): def test_fixed_size_bytes(self): values = [b'foo', None, b'bar', None, None, b'hey'] df = pd.DataFrame({'strings': values}) - schema = A.Schema.from_fields([A.field('strings', A.binary(3))]) - table = A.Table.from_pandas(df, schema=schema) + schema = pa.Schema.from_fields([pa.field('strings', pa.binary(3))]) + table = pa.Table.from_pandas(df, schema=schema) assert table.schema[0].type == schema[0].type assert table.schema[0].name == schema[0].name result = table.to_pandas() @@ -266,9 +266,9 @@ class TestPandasConversion(unittest.TestCase): def test_fixed_size_bytes_does_not_accept_varying_lengths(self): values = [b'foo', None, b'ba', None, None, b'hey'] df = pd.DataFrame({'strings': values}) - schema = A.Schema.from_fields([A.field('strings', A.binary(3))]) - with self.assertRaises(A.ArrowInvalid): - A.Table.from_pandas(df, schema=schema) + schema = pa.Schema.from_fields([pa.field('strings', pa.binary(3))]) + with self.assertRaises(pa.ArrowInvalid): + pa.Table.from_pandas(df, schema=schema) def test_timestamps_notimezone_no_nulls(self): df = pd.DataFrame({ @@ -278,8 +278,8 @@ class TestPandasConversion(unittest.TestCase): '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') }) - field = A.Field.from_py('datetime64', A.timestamp('ms')) - schema = A.Schema.from_fields([field]) + field = pa.Field.from_py('datetime64', pa.timestamp('ms')) + schema = pa.Schema.from_fields([field]) self._check_pandas_roundtrip(df, timestamps_to_ms=True, expected_schema=schema) @@ -290,8 +290,8 @@ class TestPandasConversion(unittest.TestCase): '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') }) - field = A.Field.from_py('datetime64', A.timestamp('ns')) - schema = A.Schema.from_fields([field]) + field = pa.Field.from_py('datetime64', pa.timestamp('ns')) + schema = pa.Schema.from_fields([field]) self._check_pandas_roundtrip(df, timestamps_to_ms=False, expected_schema=schema) @@ -303,8 +303,8 @@ class TestPandasConversion(unittest.TestCase): '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') }) - field = A.Field.from_py('datetime64', A.timestamp('ms')) - schema = A.Schema.from_fields([field]) + field = pa.Field.from_py('datetime64', pa.timestamp('ms')) + schema = pa.Schema.from_fields([field]) self._check_pandas_roundtrip(df, timestamps_to_ms=True, expected_schema=schema) @@ -315,8 +315,8 @@ class TestPandasConversion(unittest.TestCase): '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') }) - field = A.Field.from_py('datetime64', A.timestamp('ns')) - schema = A.Schema.from_fields([field]) + field = pa.Field.from_py('datetime64', pa.timestamp('ns')) + schema = pa.Schema.from_fields([field]) self._check_pandas_roundtrip(df, timestamps_to_ms=False, expected_schema=schema) @@ -345,25 +345,77 @@ class TestPandasConversion(unittest.TestCase): .to_frame()) self._check_pandas_roundtrip(df, timestamps_to_ms=False) - def test_date(self): + def test_date_infer(self): df = pd.DataFrame({ 'date': [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1), datetime.date(2040, 2, 26)]}) - table = A.Table.from_pandas(df) - field = A.Field.from_py('date', A.date64()) - schema = A.Schema.from_fields([field]) + table = pa.Table.from_pandas(df) + field = pa.Field.from_py('date', pa.date32()) + schema = pa.Schema.from_fields([field]) assert table.schema.equals(schema) result = table.to_pandas() expected = df.copy() expected['date'] = pd.to_datetime(df['date']) tm.assert_frame_equal(result, expected) + def test_date_objects_typed(self): + arr = np.array([ + datetime.date(2017, 4, 3), + None, + datetime.date(2017, 4, 4), + datetime.date(2017, 4, 5)], dtype=object) + + arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32') + arr_i8 = arr_i4.astype('int64') * 86400000 + mask = np.array([False, True, False, False]) + + t32 = pa.date32() + t64 = pa.date64() + + a32 = pa.Array.from_numpy(arr, type=t32) + a64 = pa.Array.from_numpy(arr, type=t64) + + a32_expected = pa.Array.from_numpy(arr_i4, mask=mask, type=t32) + a64_expected = pa.Array.from_numpy(arr_i8, mask=mask, type=t64) + + assert a32.equals(a32_expected) + assert a64.equals(a64_expected) + + # Test converting back to pandas + colnames = ['date32', 'date64'] + table = pa.Table.from_arrays([a32, a64], colnames) + table_pandas = table.to_pandas() + + ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04', + '2017-04-05'], + dtype='datetime64[D]') + .astype('datetime64[ns]')) + ex_values[1] = pd.NaT.value + expected_pandas = pd.DataFrame({'date32': ex_values, + 'date64': ex_values}, + columns=colnames) + tm.assert_frame_equal(table_pandas, expected_pandas) + + def test_dates_from_integers(self): + t1 = pa.date32() + t2 = pa.date64() + + arr = np.array([17259, 17260, 17261], dtype='int32') + arr2 = arr.astype('int64') * 86400000 + + a1 = pa.Array.from_numpy(arr, type=t1) + a2 = pa.Array.from_numpy(arr2, type=t2) + + expected = datetime.date(2017, 4, 3) + assert a1[0].as_py() == expected + assert a2[0].as_py() == expected + def test_column_of_arrays(self): df, schema = dataframe_with_arrays() self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema) - table = A.Table.from_pandas(df, schema=schema) + table = pa.Table.from_pandas(df, schema=schema) assert table.schema.equals(schema) for column in df.columns: @@ -373,7 +425,7 @@ class TestPandasConversion(unittest.TestCase): def test_column_of_lists(self): df, schema = dataframe_with_lists() self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema) - table = A.Table.from_pandas(df, schema=schema) + table = pa.Table.from_pandas(df, schema=schema) assert table.schema.equals(schema) for column in df.columns: @@ -410,8 +462,8 @@ class TestPandasConversion(unittest.TestCase): def test_mixed_types_fails(self): data = pd.DataFrame({'a': ['a', 1, 2.0]}) - with self.assertRaises(A.ArrowException): - A.Table.from_pandas(data) + with self.assertRaises(pa.ArrowException): + pa.Table.from_pandas(data) def test_strided_data_import(self): cases = [] @@ -460,9 +512,9 @@ class TestPandasConversion(unittest.TestCase): decimal.Decimal('1234.439'), ] }) - converted = A.Table.from_pandas(expected) - field = A.Field.from_py('decimals', A.decimal(7, 3)) - schema = A.Schema.from_fields([field]) + converted = pa.Table.from_pandas(expected) + field = pa.Field.from_py('decimals', pa.decimal(7, 3)) + schema = pa.Schema.from_fields([field]) assert converted.schema.equals(schema) def test_decimal_32_to_pandas(self): @@ -472,7 +524,7 @@ class TestPandasConversion(unittest.TestCase): decimal.Decimal('1234.439'), ] }) - converted = A.Table.from_pandas(expected) + converted = pa.Table.from_pandas(expected) df = converted.to_pandas() tm.assert_frame_equal(df, expected) @@ -483,9 +535,9 @@ class TestPandasConversion(unittest.TestCase): decimal.Decimal('129534.123731'), ] }) - converted = A.Table.from_pandas(expected) - field = A.Field.from_py('decimals', A.decimal(12, 6)) - schema = A.Schema.from_fields([field]) + converted = pa.Table.from_pandas(expected) + field = pa.Field.from_py('decimals', pa.decimal(12, 6)) + schema = pa.Schema.from_fields([field]) assert converted.schema.equals(schema) def test_decimal_64_to_pandas(self): @@ -495,7 +547,7 @@ class TestPandasConversion(unittest.TestCase): decimal.Decimal('129534.123731'), ] }) - converted = A.Table.from_pandas(expected) + converted = pa.Table.from_pandas(expected) df = converted.to_pandas() tm.assert_frame_equal(df, expected) @@ -506,9 +558,9 @@ class TestPandasConversion(unittest.TestCase): -decimal.Decimal('314292388910493.12343437128'), ] }) - converted = A.Table.from_pandas(expected) - field = A.Field.from_py('decimals', A.decimal(26, 11)) - schema = A.Schema.from_fields([field]) + converted = pa.Table.from_pandas(expected) + field = pa.Field.from_py('decimals', pa.decimal(26, 11)) + schema = pa.Schema.from_fields([field]) assert converted.schema.equals(schema) def test_decimal_128_to_pandas(self): @@ -518,6 +570,6 @@ class TestPandasConversion(unittest.TestCase): -decimal.Decimal('314292388910493.12343437128'), ] }) - converted = A.Table.from_pandas(expected) + converted = pa.Table.from_pandas(expected) df = converted.to_pandas() tm.assert_frame_equal(df, expected)