This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new fec7a0e ARROW-4788: [C++] Less verbose API for constructing StructArray fec7a0e is described below commit fec7a0eb06ff033ceede7aff75865fc1eceab7bb Author: Antoine Pitrou <anto...@python.org> AuthorDate: Thu Jun 27 22:32:08 2019 -0500 ARROW-4788: [C++] Less verbose API for constructing StructArray Also our first API to return a `Result<>`. Includes appropriate massaging in the test helpers so that ASSERT_OK and ASSERT_RAISES can handle `Result<>` instances. Author: Antoine Pitrou <anto...@python.org> Closes #4707 from pitrou/ARROW-4788-struct-array-factory and squashes the following commits: ae5435857 <Antoine Pitrou> ARROW-4788: Less verbose API for constructing StructArray --- cpp/src/arrow/array-struct-test.cc | 57 +++++++++++++++++++++ cpp/src/arrow/array-view-test.cc | 1 + cpp/src/arrow/array.cc | 28 ++++++++++ cpp/src/arrow/array.h | 12 +++++ cpp/src/arrow/compute/kernels/cast-test.cc | 6 +-- cpp/src/arrow/csv/converter-test.cc | 1 + cpp/src/arrow/extension_type-test.cc | 1 + cpp/src/arrow/flight/flight-test.cc | 2 +- cpp/src/arrow/python/common.h | 17 +++++++ cpp/src/arrow/python/pyarrow.cc | 5 ++ cpp/src/arrow/python/pyarrow.h | 5 ++ cpp/src/arrow/python/pyarrow_api.h | 18 +++++-- cpp/src/arrow/python/pyarrow_lib.h | 13 +---- cpp/src/arrow/python/python-test.cc | 1 + cpp/src/arrow/result.h | 8 +++ cpp/src/arrow/testing/gtest_util.h | 62 ++++++++++++++++------- cpp/src/arrow/util/trie-test.cc | 2 +- cpp/src/gandiva/precompiled/decimal_ops_test.cc | 2 +- cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 1 + python/pyarrow/_flight.pyx | 12 +++-- python/pyarrow/array.pxi | 42 ++++++--------- python/pyarrow/error.pxi | 6 +++ python/pyarrow/includes/common.pxd | 8 +++ python/pyarrow/includes/libarrow.pxd | 12 ++++- python/pyarrow/includes/libarrow_flight.pxd | 11 ++-- python/pyarrow/tests/test_array.py | 20 ++++++++ 26 files changed, 279 insertions(+), 74 deletions(-) diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc index a182202..a5fce6e 100644 --- a/cpp/src/arrow/array-struct-test.cc +++ b/cpp/src/arrow/array-struct-test.cc @@ -76,6 +76,63 @@ void ValidateBasicStructArray(const StructArray* result, } } +TEST(StructArray, FromFieldNames) { + std::shared_ptr<Array> a, b, c, array, expected; + a = ArrayFromJSON(int32(), "[4, null]"); + b = ArrayFromJSON(utf8(), R"([null, "foo"])"); + + auto res = StructArray::Make({a, b}, {"a", "b"}); + ASSERT_OK(res); + array = *res; + expected = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}), + R"([{"a": 4, "b": null}, {"a": null, "b": "foo"}])"); + AssertArraysEqual(*array, *expected); + + // With non-zero offsets + res = StructArray::Make({a, b}, {"a", "b"}, /*null_bitmap =*/nullptr, /*null_count =*/0, + /*offset =*/1); + ASSERT_OK(res); + array = *res; + expected = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}), + R"([{"a": null, "b": "foo"}])"); + AssertArraysEqual(*array, *expected); + + res = StructArray::Make({a, b}, {"a", "b"}, /*null_bitmap =*/nullptr, /*null_count =*/0, + /*offset =*/2); + ASSERT_OK(res); + array = *res; + expected = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}), R"([])"); + AssertArraysEqual(*array, *expected); + + // Offset greater than length + res = StructArray::Make({a, b}, {"a", "b"}, /*null_bitmap =*/nullptr, /*null_count =*/0, + /*offset =*/3); + ASSERT_RAISES(IndexError, res); + + // With null bitmap + std::shared_ptr<Buffer> null_bitmap; + BitmapFromVector<bool>({false, true}, &null_bitmap); + res = StructArray::Make({a, b}, {"a", "b"}, null_bitmap); + ASSERT_OK(res); + array = *res; + expected = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}), + R"([null, {"a": null, "b": "foo"}])"); + AssertArraysEqual(*array, *expected); + + // Mismatching array lengths + c = ArrayFromJSON(int64(), "[1, 2, 3]"); + res = StructArray::Make({a, c}, {"a", "c"}); + ASSERT_RAISES(Invalid, res); + + // Mismatching number of fields + res = StructArray::Make({a, b}, {"a", "b", "c"}); + ASSERT_RAISES(Invalid, res); + + // Fail on 0 children (cannot infer array length) + res = StructArray::Make({}, {}); + ASSERT_RAISES(Invalid, res); +} + // ---------------------------------------------------------------------------------- // Struct test class TestStructBuilder : public TestBuilder { diff --git a/cpp/src/arrow/array-view-test.cc b/cpp/src/arrow/array-view-test.cc index df26ad3..f466c1d 100644 --- a/cpp/src/arrow/array-view-test.cc +++ b/cpp/src/arrow/array-view-test.cc @@ -24,6 +24,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/logging.h" namespace arrow { diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 9b66af2..0f63aba 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -497,6 +497,34 @@ StructArray::StructArray(const std::shared_ptr<DataType>& type, int64_t length, boxed_fields_.resize(children.size()); } +Result<std::shared_ptr<Array>> StructArray::Make( + const std::vector<std::shared_ptr<Array>>& children, + const std::vector<std::string>& field_names, std::shared_ptr<Buffer> null_bitmap, + int64_t null_count, int64_t offset) { + if (children.size() != field_names.size()) { + return Status::Invalid("Mismatching number of field names and child arrays"); + } + int64_t length = 0; + if (children.size() == 0) { + return Status::Invalid("Can't infer struct array length with 0 child arrays"); + } + length = children.front()->length(); + for (const auto& child : children) { + if (length != child->length()) { + return Status::Invalid("Mismatching child array lengths"); + } + } + if (offset > length) { + return Status::IndexError("Offset greater than length of child arrays"); + } + std::vector<std::shared_ptr<Field>> fields(children.size()); + for (size_t i = 0; i < children.size(); ++i) { + fields[i] = ::arrow::field(field_names[i], children[i]->type()); + } + return std::make_shared<StructArray>(struct_(fields), length - offset, children, + null_bitmap, null_count, offset); +} + const StructType* StructArray::struct_type() const { return checked_cast<const StructType*>(data_->type.get()); } diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 1e163b7..2a1ce7a 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -28,6 +28,7 @@ #include "arrow/buffer.h" #include "arrow/compare.h" +#include "arrow/result.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" @@ -822,6 +823,16 @@ class ARROW_EXPORT StructArray : public Array { std::shared_ptr<Buffer> null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); + /// \brief Return a StructArray from child arrays and field names. + /// + /// The length and data type are automatically inferred from the arguments. + /// There should be at least one child array. + static Result<std::shared_ptr<Array>> Make( + const std::vector<std::shared_ptr<Array>>& children, + const std::vector<std::string>& field_names, + std::shared_ptr<Buffer> null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + const StructType* struct_type() const; // Return a shared pointer in case the requestor desires to share ownership @@ -840,6 +851,7 @@ class ARROW_EXPORT StructArray : public Array { private: // For caching boxed child data + // XXX This is not handled in a thread-safe manner. mutable std::vector<std::shared_ptr<Array>> boxed_fields_; }; diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc index 778544c..510b9a4 100644 --- a/cpp/src/arrow/compute/kernels/cast-test.cc +++ b/cpp/src/arrow/compute/kernels/cast-test.cc @@ -99,10 +99,10 @@ class TestCast : public ComputeFixture, public TestBase { const std::vector<I_TYPE>& in_values, const std::vector<bool>& is_valid, const std::shared_ptr<DataType>& out_type, const std::vector<O_TYPE>& out_values, const CastOptions& options) { - DCHECK_EQ(in_values.size(), out_values.size()); + ASSERT_EQ(in_values.size(), out_values.size()); std::shared_ptr<Array> input, expected; if (is_valid.size() > 0) { - DCHECK_EQ(is_valid.size(), out_values.size()); + ASSERT_EQ(is_valid.size(), out_values.size()); ArrayFromVector<InType, I_TYPE>(in_type, is_valid, in_values, &input); ArrayFromVector<OutType, O_TYPE>(out_type, is_valid, out_values, &expected); } else { @@ -123,7 +123,7 @@ class TestCast : public ComputeFixture, public TestBase { const CastOptions& options = CastOptions()) { std::shared_ptr<Array> input = ArrayFromJSON(in_type, in_json); std::shared_ptr<Array> expected = ArrayFromJSON(out_type, expected_json); - DCHECK_EQ(input->length(), expected->length()); + ASSERT_EQ(input->length(), expected->length()); CheckPass(*input, *expected, out_type, options); // Check a sliced variant diff --git a/cpp/src/arrow/csv/converter-test.cc b/cpp/src/arrow/csv/converter-test.cc index 531f40b..a5e4c03 100644 --- a/cpp/src/arrow/csv/converter-test.cc +++ b/cpp/src/arrow/csv/converter-test.cc @@ -31,6 +31,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/type.h" #include "arrow/util/decimal.h" +#include "arrow/util/logging.h" namespace arrow { namespace csv { diff --git a/cpp/src/arrow/extension_type-test.cc b/cpp/src/arrow/extension_type-test.cc index 6b632a9..29c5356 100644 --- a/cpp/src/arrow/extension_type-test.cc +++ b/cpp/src/arrow/extension_type-test.cc @@ -43,6 +43,7 @@ #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" namespace arrow { diff --git a/cpp/src/arrow/flight/flight-test.cc b/cpp/src/arrow/flight/flight-test.cc index c0f0c7f..d036b9a 100644 --- a/cpp/src/arrow/flight/flight-test.cc +++ b/cpp/src/arrow/flight/flight-test.cc @@ -197,7 +197,7 @@ class TestFlightClient : public ::testing::Test { void CheckDoGet(const FlightDescriptor& descr, const BatchVector& expected_batches, EndpointCheckFunc&& check_endpoints) { auto num_batches = static_cast<int>(expected_batches.size()); - DCHECK_GE(num_batches, 2); + ASSERT_GE(num_batches, 2); auto expected_schema = expected_batches[0]->schema(); std::unique_ptr<FlightInfo> info; diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index a10e3bb..766b764 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -24,12 +24,15 @@ #include "arrow/python/config.h" #include "arrow/buffer.h" +#include "arrow/python/pyarrow.h" #include "arrow/python/visibility.h" #include "arrow/util/macros.h" namespace arrow { class MemoryPool; +template <class T> +class Result; namespace py { @@ -52,6 +55,20 @@ ARROW_PYTHON_EXPORT Status PassPyError(); #define PY_RETURN_IF_ERROR(CODE) ARROW_RETURN_NOT_OK(CheckPyError(CODE)); +// For Cython, as you can't define template C++ functions in Cython, only use them. +// This function can set a Python exception. It assumes that T has a (cheap) +// default constructor. +template <class T> +T GetResultValue(Result<T>& result) { + if (ARROW_PREDICT_TRUE(result.ok())) { + return *std::move(result); + } else { + int r = internal::check_status(result.status()); + assert(r == -1); // should have errored out + return {}; + } +} + // A RAII-style helper that ensures the GIL is acquired inside a lexical block. class ARROW_PYTHON_EXPORT PyAcquireGIL { public: diff --git a/cpp/src/arrow/python/pyarrow.cc b/cpp/src/arrow/python/pyarrow.cc index d080cc0..1cedc54 100644 --- a/cpp/src/arrow/python/pyarrow.cc +++ b/cpp/src/arrow/python/pyarrow.cc @@ -168,5 +168,10 @@ PyObject* wrap_record_batch(const std::shared_ptr<RecordBatch>& batch) { return ::pyarrow_wrap_batch(batch); } +namespace internal { + +int check_status(const Status& status) { return ::pyarrow_internal_check_status(status); } + +} // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/pyarrow.h b/cpp/src/arrow/python/pyarrow.h index 5e42333..ff5bf8f 100644 --- a/cpp/src/arrow/python/pyarrow.h +++ b/cpp/src/arrow/python/pyarrow.h @@ -81,6 +81,11 @@ ARROW_PYTHON_EXPORT Status unwrap_record_batch(PyObject* batch, ARROW_PYTHON_EXPORT PyObject* wrap_record_batch( const std::shared_ptr<RecordBatch>& batch); +namespace internal { + +ARROW_PYTHON_EXPORT int check_status(const Status& status); + +} // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/pyarrow_api.h b/cpp/src/arrow/python/pyarrow_api.h index f6a2112..b76e961 100644 --- a/cpp/src/arrow/python/pyarrow_api.h +++ b/cpp/src/arrow/python/pyarrow_api.h @@ -16,16 +16,22 @@ // under the License. // DO NOT EDIT THIS FILE. Update from pyarrow/lib_api.h after pyarrow build +// This is used to be able to call back into Cython code from C++. -/* Generated by Cython 0.29 */ +/* Generated by Cython 0.29.8 */ #ifndef __PYX_HAVE_API__pyarrow__lib #define __PYX_HAVE_API__pyarrow__lib +#ifdef __MINGW64__ +#define MS_WIN64 +#endif #include "Python.h" #include "pyarrow_lib.h" static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array)(std::shared_ptr< arrow::Array> const &) = 0; #define pyarrow_wrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array)(std::shared_ptr< arrow::ChunkedArray> const &) = 0; +#define pyarrow_wrap_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch)(std::shared_ptr< arrow::RecordBatch> const &) = 0; #define pyarrow_wrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer)(std::shared_ptr< arrow::Buffer> const &) = 0; @@ -62,6 +68,8 @@ static std::shared_ptr< arrow::Table> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwra #define pyarrow_unwrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table static std::shared_ptr< arrow::Tensor> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor)(PyObject *) = 0; #define pyarrow_unwrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor +static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status)(arrow::Status const &) = 0; +#define pyarrow_internal_check_status __pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer)(PyObject *) = 0; #define pyarrow_is_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type)(PyObject *) = 0; @@ -72,8 +80,8 @@ static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_schema)(PyObject *) = 0; #define pyarrow_is_schema __pyx_api_f_7pyarrow_3lib_pyarrow_is_schema static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_array)(PyObject *) = 0; #define pyarrow_is_array __pyx_api_f_7pyarrow_3lib_pyarrow_is_array -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array)(std::shared_ptr< arrow::ChunkedArray> const &) = 0; -#define pyarrow_wrap_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar)(std::shared_ptr< arrow::Scalar> const &) = 0; +#define pyarrow_wrap_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor)(PyObject *) = 0; #define pyarrow_is_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_column)(PyObject *) = 0; @@ -149,6 +157,7 @@ static int import_pyarrow__lib(void) { module = PyImport_ImportModule("pyarrow.lib"); if (!module) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_wrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array, "PyObject *(std::shared_ptr< arrow::Array> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array, "PyObject *(std::shared_ptr< arrow::ChunkedArray> const &)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_wrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch, "PyObject *(std::shared_ptr< arrow::RecordBatch> const &)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_wrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer, "PyObject *(std::shared_ptr< arrow::Buffer> const &)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_wrap_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column, "PyObject *(std::shared_ptr< arrow::Column> const &)") < 0) goto bad; @@ -167,12 +176,13 @@ static int import_pyarrow__lib(void) { if (__Pyx_ImportFunction(module, "pyarrow_unwrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema, "std::shared_ptr< arrow::Schema> (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_unwrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table, "std::shared_ptr< arrow::Table> (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_unwrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor, "std::shared_ptr< arrow::Tensor> (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_internal_check_status", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status, "int (arrow::Status const &)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_field, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_schema, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_array, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_wrap_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array, "PyObject *(std::shared_ptr< arrow::ChunkedArray> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar, "PyObject *(std::shared_ptr< arrow::Scalar> const &)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_column, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_table, "int (PyObject *)") < 0) goto bad; diff --git a/cpp/src/arrow/python/pyarrow_lib.h b/cpp/src/arrow/python/pyarrow_lib.h index 4a99a07..5f5fc4c 100644 --- a/cpp/src/arrow/python/pyarrow_lib.h +++ b/cpp/src/arrow/python/pyarrow_lib.h @@ -17,7 +17,7 @@ // DO NOT EDIT THIS FILE. Update from pyarrow/lib.h after pyarrow build -/* Generated by Cython 0.29 */ +/* Generated by Cython 0.29.8 */ #ifndef __PYX_HAVE__pyarrow__lib #define __PYX_HAVE__pyarrow__lib @@ -38,6 +38,7 @@ #endif __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_array(std::shared_ptr< arrow::Array> const &); +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_chunked_array(std::shared_ptr< arrow::ChunkedArray> const &); __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_batch(std::shared_ptr< arrow::RecordBatch> const &); __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_buffer(std::shared_ptr< arrow::Buffer> const &); __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_column(std::shared_ptr< arrow::Column> const &); @@ -56,16 +57,6 @@ __PYX_EXTERN_C std::shared_ptr< arrow::Field> __pyx_f_7pyarrow_3lib_pyarrow_unw __PYX_EXTERN_C std::shared_ptr< arrow::Schema> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_schema(PyObject *); __PYX_EXTERN_C std::shared_ptr< arrow::Table> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_table(PyObject *); __PYX_EXTERN_C std::shared_ptr< arrow::Tensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_tensor(PyObject *); -__PYX_EXTERN_C int pyarrow_is_buffer(PyObject *); -__PYX_EXTERN_C int pyarrow_is_data_type(PyObject *); -__PYX_EXTERN_C int pyarrow_is_field(PyObject *); -__PYX_EXTERN_C int pyarrow_is_schema(PyObject *); -__PYX_EXTERN_C int pyarrow_is_array(PyObject *); -__PYX_EXTERN_C PyObject *pyarrow_wrap_chunked_array(std::shared_ptr< arrow::ChunkedArray> const &); -__PYX_EXTERN_C int pyarrow_is_tensor(PyObject *); -__PYX_EXTERN_C int pyarrow_is_column(PyObject *); -__PYX_EXTERN_C int pyarrow_is_table(PyObject *); -__PYX_EXTERN_C int pyarrow_is_batch(PyObject *); #endif /* !__PYX_HAVE_API__pyarrow__lib */ diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index a8282a5..5de613f 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -32,6 +32,7 @@ #include "arrow/python/helpers.h" #include "arrow/python/python_to_arrow.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" namespace arrow { diff --git a/cpp/src/arrow/result.h b/cpp/src/arrow/result.h index f01513a..c1fbd0a 100644 --- a/cpp/src/arrow/result.h +++ b/cpp/src/arrow/result.h @@ -333,4 +333,12 @@ class Result { #define ARROW_ASSIGN_OR_RAISE(lhs, rexpr) \ ARROW_ASSIGN_OR_RAISE_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \ lhs, rexpr); + +namespace internal { + +// For Cython, because of https://github.com/cython/cython/issues/3018 +template <class T> +using CResult = Result<T>; + +} // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index c44bb17..914aeb0 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -34,13 +34,35 @@ #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" -#include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" +namespace arrow { + +template <typename T> +class Result; + +namespace internal { + +// Helpers for the ASSERT* macros below + +inline Status GenericToStatus(const Status& st) { return st; } + +template <typename T> +inline Status GenericToStatus(const Result<T>& res) { + return res.status(); +} + +} // namespace internal +} // namespace arrow + +// NOTE: failing must be inline in the macros below, to get correct file / line number +// reporting on test failures. + #define ASSERT_RAISES(ENUM, expr) \ do { \ - ::arrow::Status _st = (expr); \ + auto _res = (expr); \ + ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ if (!_st.Is##ENUM()) { \ FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \ ENUM) ", but got " \ @@ -50,7 +72,8 @@ #define ASSERT_RAISES_WITH_MESSAGE(ENUM, message, expr) \ do { \ - ::arrow::Status _st = (expr); \ + auto _res = (expr); \ + ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ if (!_st.Is##ENUM()) { \ FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \ ENUM) ", but got " \ @@ -61,7 +84,8 @@ #define ASSERT_OK(expr) \ do { \ - ::arrow::Status _st = (expr); \ + auto _res = (expr); \ + ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ if (!_st.ok()) { \ FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _st.ToString(); \ } \ @@ -69,18 +93,20 @@ #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) -#define ARROW_EXPECT_OK(expr) \ - do { \ - ::arrow::Status _st = (expr); \ - EXPECT_TRUE(_st.ok()); \ +#define ARROW_EXPECT_OK(expr) \ + do { \ + auto _res = (expr); \ + ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ + EXPECT_TRUE(_st.ok()); \ } while (false) -#define ABORT_NOT_OK(s) \ - do { \ - ::arrow::Status _st = (s); \ - if (ARROW_PREDICT_FALSE(!_st.ok())) { \ - _st.Abort(); \ - } \ +#define ABORT_NOT_OK(expr) \ + do { \ + auto _res = (expr); \ + ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ + if (ARROW_PREDICT_FALSE(!_st.ok())) { \ + _st.Abort(); \ + } \ } while (false); namespace arrow { @@ -170,7 +196,8 @@ template <typename TYPE, typename C_TYPE = typename TYPE::c_type> void ArrayFromVector(const std::shared_ptr<DataType>& type, const std::vector<bool>& is_valid, const std::vector<C_TYPE>& values, std::shared_ptr<Array>* out) { - DCHECK_EQ(TYPE::type_id, type->id()) + auto type_id = TYPE::type_id; + ASSERT_EQ(type_id, type->id()) << "template parameter and concrete DataType instance don't agree"; std::unique_ptr<ArrayBuilder> builder_ptr; @@ -191,7 +218,8 @@ void ArrayFromVector(const std::shared_ptr<DataType>& type, template <typename TYPE, typename C_TYPE = typename TYPE::c_type> void ArrayFromVector(const std::shared_ptr<DataType>& type, const std::vector<C_TYPE>& values, std::shared_ptr<Array>* out) { - DCHECK_EQ(TYPE::type_id, type->id()) + auto type_id = TYPE::type_id; + ASSERT_EQ(type_id, type->id()) << "template parameter and concrete DataType instance don't agree"; std::unique_ptr<ArrayBuilder> builder_ptr; @@ -228,7 +256,7 @@ void ChunkedArrayFromVector(const std::shared_ptr<DataType>& type, const std::vector<std::vector<C_TYPE>>& values, std::shared_ptr<ChunkedArray>* out) { ArrayVector chunks; - DCHECK_EQ(is_valid.size(), values.size()); + ASSERT_EQ(is_valid.size(), values.size()); for (size_t i = 0; i < values.size(); ++i) { std::shared_ptr<Array> array; ArrayFromVector<TYPE, C_TYPE>(type, is_valid[i], values[i], &array); diff --git a/cpp/src/arrow/util/trie-test.cc b/cpp/src/arrow/util/trie-test.cc index bcbc77e..80a8a72 100644 --- a/cpp/src/arrow/util/trie-test.cc +++ b/cpp/src/arrow/util/trie-test.cc @@ -267,7 +267,7 @@ TEST(Trie, CapacityError) { s[2] = third; auto st = builder.Append(reinterpret_cast<const char*>(s)); if (st.IsCapacityError()) { - DCHECK_GE(first, 2); + ASSERT_GE(first, 2); had_capacity_error = true; break; } else { diff --git a/cpp/src/gandiva/precompiled/decimal_ops_test.cc b/cpp/src/gandiva/precompiled/decimal_ops_test.cc index 6c8d762..b4fe2e6 100644 --- a/cpp/src/gandiva/precompiled/decimal_ops_test.cc +++ b/cpp/src/gandiva/precompiled/decimal_ops_test.cc @@ -180,7 +180,7 @@ void TestDecimalSql::VerifyAllSign(DecimalTypeUtil::Op op, const DecimalScalar12 // both -ve Verify(op, -left, -right, -expected_output, expected_overflow); } else { - DCHECK(op == DecimalTypeUtil::kOpMultiply || op == DecimalTypeUtil::kOpDivide); + ASSERT_TRUE(op == DecimalTypeUtil::kOpMultiply || op == DecimalTypeUtil::kOpDivide); // right -ve Verify(op, left, -right, -expected_output, expected_overflow); diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 792f760..2cc8b0d 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -34,6 +34,7 @@ #include "arrow/testing/util.h" #include "arrow/type_traits.h" #include "arrow/util/decimal.h" +#include "arrow/util/logging.h" #include "parquet/api/reader.h" #include "parquet/api/writer.h" diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx index 7fc4ed4..a0d08f8 100644 --- a/python/pyarrow/_flight.pyx +++ b/python/pyarrow/_flight.pyx @@ -127,7 +127,7 @@ class ActionType(_ActionType): cdef class Result: """A result from executing an Action.""" cdef: - unique_ptr[CResult] result + unique_ptr[CFlightResult] result def __init__(self, buf): """Create a new result. @@ -136,7 +136,7 @@ cdef class Result: ---------- buf : Buffer or bytes-like object """ - self.result.reset(new CResult()) + self.result.reset(new CFlightResult()) self.result.get().body = pyarrow_unwrap_buffer(as_buffer(buf)) @property @@ -1145,14 +1145,18 @@ cdef void _do_get(void* self, const CServerCallContext& context, cdef void _do_action_result_next(void* self, - unique_ptr[CResult]* result) except *: + unique_ptr[CFlightResult]* result) except *: """Callback for implementing Flight servers in Python.""" + cdef: + CFlightResult* c_result + try: action_result = next(<object> self) if not isinstance(action_result, Result): raise TypeError("Result of FlightServerBase.do_action must " "return an iterator of Result objects") - result.reset(new CResult(deref((<Result> action_result).result.get()))) + c_result = (<Result> action_result).result.get() + result.reset(new CFlightResult(deref(c_result))) except StopIteration: result.reset(nullptr) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 189e8a0..5ae178d 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1456,10 +1456,10 @@ cdef class StructArray(Array): result : StructArray """ cdef: - Array array shared_ptr[CArray] c_array vector[shared_ptr[CArray]] c_arrays - shared_ptr[CArray] c_result + vector[c_string] c_names + CResult[shared_ptr[CArray]] c_result ssize_t num_arrays ssize_t length ssize_t i @@ -1469,30 +1469,20 @@ cdef class StructArray(Array): raise ValueError('Names are currently required') arrays = [asarray(x) for x in arrays] - - num_arrays = len(arrays) - if num_arrays != len(names): - raise ValueError("The number of names should be equal to the " - "number of arrays") - - if num_arrays > 0: - length = len(arrays[0]) - c_arrays.resize(num_arrays) - for i in range(num_arrays): - array = arrays[i] - if len(array) != length: - raise ValueError("All arrays must have the same length") - c_arrays[i] = array.sp_array - else: - length = 0 - - struct_type = struct([ - field(name, array.type) for name, array in zip(names, arrays) - ]) - - c_result.reset(new CStructArray(struct_type.sp_type, length, c_arrays)) - - return pyarrow_wrap_array(c_result) + for arr in arrays: + c_arrays.push_back(pyarrow_unwrap_array(arr)) + for name in names: + c_names.push_back(tobytes(name)) + + if c_arrays.size() == 0 and c_names.size() == 0: + # The C++ side doesn't allow this + return array([], struct([])) + + # XXX Cannot pass "nullptr" for a shared_ptr<T> argument: + # https://github.com/cython/cython/issues/3020 + c_result = CStructArray.Make(c_arrays, c_names, + shared_ptr[CBuffer](), -1, 0) + return pyarrow_wrap_array(GetResultValue(c_result)) cdef class ExtensionArray(Array): diff --git a/python/pyarrow/error.pxi b/python/pyarrow/error.pxi index 43b1933..7b5e8d4 100644 --- a/python/pyarrow/error.pxi +++ b/python/pyarrow/error.pxi @@ -108,3 +108,9 @@ cdef int check_status(const CStatus& status) nogil except -1: else: message = frombytes(status.ToString()) raise ArrowException(message) + + +# This is an API function for C++ PyArrow +cdef api int pyarrow_internal_check_status(const CStatus& status) \ + nogil except -1: + return check_status(status) diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 452b796..4a06fc8 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -69,6 +69,14 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool IsPlasmaObjectNonexistent() c_bool IsPlasmaStoreFull() +cdef extern from "arrow/result.h" namespace "arrow::internal" nogil: + cdef cppclass CResult[T]: + c_bool ok() + CStatus status() + T operator*() + +cdef extern from "arrow/python/common.h" namespace "arrow::py": + T GetResultValue[T](CResult[T]) except * cdef inline object PyObject_to_object(PyObject* o): # Cast to "object" increments reference count diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 79389ca..8798834 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -453,9 +453,19 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CStructArray(shared_ptr[CDataType] type, int64_t length, vector[shared_ptr[CArray]] children, shared_ptr[CBuffer] null_bitmap=nullptr, - int64_t null_count=0, + int64_t null_count=-1, int64_t offset=0) + # XXX Cython crashes if default argument values are declared here + # https://github.com/cython/cython/issues/2167 + @staticmethod + CResult[shared_ptr[CArray]] Make( + vector[shared_ptr[CArray]] children, + vector[c_string] field_names, + shared_ptr[CBuffer] null_bitmap, + int64_t null_count, + int64_t offset) + shared_ptr[CArray] field(int pos) shared_ptr[CArray] GetFieldByName(const c_string& name) const diff --git a/python/pyarrow/includes/libarrow_flight.pxd b/python/pyarrow/includes/libarrow_flight.pxd index 49a5153..0639574 100644 --- a/python/pyarrow/includes/libarrow_flight.pxd +++ b/python/pyarrow/includes/libarrow_flight.pxd @@ -46,13 +46,13 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil: c_string type shared_ptr[CBuffer] body - cdef cppclass CResult" arrow::flight::Result": - CResult() - CResult(CResult) + cdef cppclass CFlightResult" arrow::flight::Result": + CFlightResult() + CFlightResult(CFlightResult) shared_ptr[CBuffer] body cdef cppclass CResultStream" arrow::flight::ResultStream": - CStatus Next(unique_ptr[CResult]* result) + CStatus Next(unique_ptr[CFlightResult]* result) cdef cppclass CDescriptorType \ " arrow::flight::FlightDescriptor::DescriptorType": @@ -247,7 +247,7 @@ ctypedef void cb_do_action(object, const CServerCallContext&, const CAction&, unique_ptr[CResultStream]*) ctypedef void cb_list_actions(object, const CServerCallContext&, vector[CActionType]*) -ctypedef void cb_result_next(object, unique_ptr[CResult]*) +ctypedef void cb_result_next(object, unique_ptr[CFlightResult]*) ctypedef void cb_data_stream_next(object, CFlightPayload*) ctypedef void cb_server_authenticate(object, CServerAuthSender*, CServerAuthReader*) @@ -316,6 +316,7 @@ cdef extern from "arrow/python/flight.h" namespace "arrow::py::flight" nogil: int64_t total_bytes, unique_ptr[CFlightInfo]* out) + cdef extern from "<utility>" namespace "std": unique_ptr[CFlightDataStream] move(unique_ptr[CFlightDataStream]) nogil unique_ptr[CServerAuthHandler] move(unique_ptr[CServerAuthHandler]) nogil diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 80625ba..9d66d96 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -387,6 +387,26 @@ def test_struct_from_buffers(): children=children[:1]) +def test_struct_from_arrays(): + a = pa.array([4, 5, None]) + b = pa.array(["bar", None, ""]) + c = pa.array([[1, 2], None, [3, None]]) + + arr = pa.StructArray.from_arrays([a, b, c], ["a", "b", "c"]) + assert arr.to_pylist() == [ + {'a': 4, 'b': 'bar', 'c': [1, 2]}, + {'a': 5, 'b': None, 'c': None}, + {'a': None, 'b': '', 'c': [3, None]}, + ] + + with pytest.raises(ValueError): + pa.StructArray.from_arrays([a, b, c], ["a", "b"]) + + arr = pa.StructArray.from_arrays([], []) + assert arr.type == pa.struct([]) + assert arr.to_pylist() == [] + + def test_dictionary_from_numpy(): indices = np.repeat([0, 1, 2], 2) dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)