This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new b116b8ab2f GH-21761: [Python] accept pyarrow scalars in array
constructor (#36162)
b116b8ab2f is described below
commit b116b8ab2f1a4455e2110f8a936f481286c52b13
Author: Alenka Frim <[email protected]>
AuthorDate: Wed Jul 5 11:08:30 2023 +0200
GH-21761: [Python] accept pyarrow scalars in array constructor (#36162)
### Rationale for this change
Currently, `pyarrow.array` doesn't accept a list of pyarrow Scalars, and this
PR adds a check to allow that.
* Closes: #21761
Lead-authored-by: AlenkaF <[email protected]>
Co-authored-by: Alenka Frim <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
python/pyarrow/src/arrow/python/inference.cc | 25 ++++
python/pyarrow/src/arrow/python/python_to_arrow.cc | 39 ++++++
python/pyarrow/tests/test_convert_builtin.py | 138 +++++++++++++++++++++
3 files changed, 202 insertions(+)
diff --git a/python/pyarrow/src/arrow/python/inference.cc
b/python/pyarrow/src/arrow/python/inference.cc
index db5f0896a9..3407b32720 100644
--- a/python/pyarrow/src/arrow/python/inference.cc
+++ b/python/pyarrow/src/arrow/python/inference.cc
@@ -27,6 +27,7 @@
#include <utility>
#include <vector>
+#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"
@@ -340,6 +341,7 @@ class TypeInferrer {
decimal_count_(0),
list_count_(0),
struct_count_(0),
+ arrow_scalar_count_(0),
numpy_dtype_count_(0),
interval_count_(0),
max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
@@ -391,6 +393,8 @@ class TypeInferrer {
} else if (PyUnicode_Check(obj)) {
++unicode_count_;
*keep_going = make_unions_;
+ } else if (arrow::py::is_scalar(obj)) {
+ RETURN_NOT_OK(VisitArrowScalar(obj, keep_going));
} else if (PyArray_CheckAnyScalarExact(obj)) {
RETURN_NOT_OK(VisitDType(PyArray_DescrFromScalar(obj), keep_going));
} else if (PySet_Check(obj) || (Py_TYPE(obj) == &PyDictValues_Type)) {
@@ -455,6 +459,12 @@ class TypeInferrer {
RETURN_NOT_OK(Validate());
+ if (arrow_scalar_count_ > 0 && arrow_scalar_count_ + none_count_ !=
total_count_) {
+ return Status::Invalid(
+ "pyarrow scalars cannot be mixed "
+ "with other Python scalar values currently");
+ }
+
if (numpy_dtype_count_ > 0) {
// All NumPy scalars and Nones/nulls
if (numpy_dtype_count_ + none_count_ == total_count_) {
@@ -534,6 +544,8 @@ class TypeInferrer {
*out = utf8();
} else if (interval_count_) {
*out = month_day_nano_interval();
+ } else if (arrow_scalar_count_) {
+ *out = scalar_type_;
} else {
*out = null();
}
@@ -560,6 +572,17 @@ class TypeInferrer {
return Status::OK();
}
+ Status VisitArrowScalar(PyObject* obj, bool* keep_going /* unused */) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, arrow::py::unwrap_scalar(obj));
+ // Check that all the scalar types for the sequence are the same
+ if (arrow_scalar_count_ > 0 && *scalar->type != *scalar_type_) {
+ return internal::InvalidValue(obj, "cannot mix scalars with different
types");
+ }
+ scalar_type_ = scalar->type;
+ ++arrow_scalar_count_;
+ return Status::OK();
+ }
+
Status VisitDType(PyArray_Descr* dtype, bool* keep_going) {
// Continue visiting dtypes for now.
// TODO(wesm): devise approach for unions
@@ -675,10 +698,12 @@ class TypeInferrer {
int64_t decimal_count_;
int64_t list_count_;
int64_t struct_count_;
+ int64_t arrow_scalar_count_;
int64_t numpy_dtype_count_;
int64_t interval_count_;
std::unique_ptr<TypeInferrer> list_inferrer_;
std::map<std::string, TypeInferrer> struct_inferrers_;
+ std::shared_ptr<DataType> scalar_type_;
// If we observe a strongly-typed value in e.g. a NumPy array, we can store
// it here to skip the type counting logic above
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc
b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index 4f7420d829..486bd84077 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -28,6 +28,7 @@
#include <vector>
#include "arrow/array.h"
+#include "arrow/array/builder_base.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/array/builder_dict.h"
@@ -36,6 +37,7 @@
#include "arrow/array/builder_time.h"
#include "arrow/chunked_array.h"
#include "arrow/result.h"
+#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
@@ -599,6 +601,15 @@ class PyPrimitiveConverter<T, enable_if_null<T>>
Status Append(PyObject* value) override {
if (PyValue::IsNull(this->options_, value)) {
return this->primitive_builder_->AppendNull();
+ } else if (arrow::py::is_scalar(value)) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+ arrow::py::unwrap_scalar(value));
+ if (scalar->is_valid) {
+ return Status::Invalid("Cannot append scalar of type ",
scalar->type->ToString(),
+ " to builder for type null");
+ } else {
+ return this->primitive_builder_->AppendNull();
+ }
} else {
ARROW_ASSIGN_OR_RAISE(
auto converted, PyValue::Convert(this->primitive_type_,
this->options_, value));
@@ -620,6 +631,10 @@ class PyPrimitiveConverter<
// rely on the Unsafe builder API which improves the performance.
if (PyValue::IsNull(this->options_, value)) {
this->primitive_builder_->UnsafeAppendNull();
+ } else if (arrow::py::is_scalar(value)) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+ arrow::py::unwrap_scalar(value));
+ ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
} else {
ARROW_ASSIGN_OR_RAISE(
auto converted, PyValue::Convert(this->primitive_type_,
this->options_, value));
@@ -637,6 +652,10 @@ class PyPrimitiveConverter<
Status Append(PyObject* value) override {
if (PyValue::IsNull(this->options_, value)) {
this->primitive_builder_->UnsafeAppendNull();
+ } else if (arrow::py::is_scalar(value)) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+ arrow::py::unwrap_scalar(value));
+ ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
} else {
ARROW_ASSIGN_OR_RAISE(
auto converted, PyValue::Convert(this->primitive_type_,
this->options_, value));
@@ -659,6 +678,10 @@ class PyPrimitiveConverter<T, enable_if_t<std::is_same<T,
FixedSizeBinaryType>::
Status Append(PyObject* value) override {
if (PyValue::IsNull(this->options_, value)) {
this->primitive_builder_->UnsafeAppendNull();
+ } else if (arrow::py::is_scalar(value)) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+ arrow::py::unwrap_scalar(value));
+ ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
} else {
ARROW_RETURN_NOT_OK(
PyValue::Convert(this->primitive_type_, this->options_, value,
view_));
@@ -681,6 +704,10 @@ class PyPrimitiveConverter<T, enable_if_base_binary<T>>
Status Append(PyObject* value) override {
if (PyValue::IsNull(this->options_, value)) {
this->primitive_builder_->UnsafeAppendNull();
+ } else if (arrow::py::is_scalar(value)) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+ arrow::py::unwrap_scalar(value));
+ ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
} else {
ARROW_RETURN_NOT_OK(
PyValue::Convert(this->primitive_type_, this->options_, value,
view_));
@@ -721,6 +748,10 @@ class PyDictionaryConverter<U, enable_if_has_c_type<U>>
Status Append(PyObject* value) override {
if (PyValue::IsNull(this->options_, value)) {
return this->value_builder_->AppendNull();
+ } else if (arrow::py::is_scalar(value)) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+ arrow::py::unwrap_scalar(value));
+ return this->value_builder_->AppendScalar(*scalar, 1);
} else {
ARROW_ASSIGN_OR_RAISE(auto converted,
PyValue::Convert(this->value_type_,
this->options_, value));
@@ -736,6 +767,10 @@ class PyDictionaryConverter<U,
enable_if_has_string_view<U>>
Status Append(PyObject* value) override {
if (PyValue::IsNull(this->options_, value)) {
return this->value_builder_->AppendNull();
+ } else if (arrow::py::is_scalar(value)) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+ arrow::py::unwrap_scalar(value));
+ return this->value_builder_->AppendScalar(*scalar, 1);
} else {
ARROW_RETURN_NOT_OK(
PyValue::Convert(this->value_type_, this->options_, value, view_));
@@ -884,6 +919,10 @@ class PyStructConverter : public
StructConverter<PyConverter, PyConverterTrait>
Status Append(PyObject* value) override {
if (PyValue::IsNull(this->options_, value)) {
return this->struct_builder_->AppendNull();
+ } else if (arrow::py::is_scalar(value)) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+ arrow::py::unwrap_scalar(value));
+ return this->struct_builder_->AppendScalar(*scalar);
}
switch (input_kind_) {
case InputKind::DICT:
diff --git a/python/pyarrow/tests/test_convert_builtin.py
b/python/pyarrow/tests/test_convert_builtin.py
index 587b4c44d2..af4c91a894 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -2363,3 +2363,141 @@ def test_array_from_pylist_offset_overflow():
assert isinstance(arr, pa.ChunkedArray)
assert len(arr) == 2**31
assert len(arr.chunks) > 1
+
+
+@parametrize_with_collections_types
[email protected](('data', 'scalar_data', 'value_type'), [
+ ([True, False, None], [pa.scalar(True), pa.scalar(False), None],
pa.bool_()),
+ (
+ [1, 2, None],
+ [pa.scalar(1), pa.scalar(2), pa.scalar(None, pa.int64())],
+ pa.int64()
+ ),
+ ([1, None, None], [pa.scalar(1), None, pa.scalar(None, pa.int64())],
pa.int64()),
+ ([None, None], [pa.scalar(None), pa.scalar(None)], pa.null()),
+ ([1., 2., None], [pa.scalar(1.), pa.scalar(2.), None], pa.float64()),
+ (
+ [None, datetime.date.today()],
+ [None, pa.scalar(datetime.date.today())],
+ pa.date32()
+ ),
+ (
+ [None, datetime.date.today()],
+ [None, pa.scalar(datetime.date.today(), pa.date64())],
+ pa.date64()
+ ),
+ (
+ [datetime.time(1, 1, 1), None],
+ [pa.scalar(datetime.time(1, 1, 1)), None],
+ pa.time64('us')
+ ),
+ (
+ [datetime.timedelta(seconds=10)],
+ [pa.scalar(datetime.timedelta(seconds=10))],
+ pa.duration('us')
+ ),
+ (
+ [None, datetime.datetime(2014, 1, 1)],
+ [None, pa.scalar(datetime.datetime(2014, 1, 1))],
+ pa.timestamp('us')
+ ),
+ (
+ [pa.MonthDayNano([1, -1, -10100])],
+ [pa.scalar(pa.MonthDayNano([1, -1, -10100]))],
+ pa.month_day_nano_interval()
+ ),
+ (["a", "b"], [pa.scalar("a"), pa.scalar("b")], pa.string()),
+ ([b"a", b"b"], [pa.scalar(b"a"), pa.scalar(b"b")], pa.binary()),
+ (
+ [b"a", b"b"],
+ [pa.scalar(b"a", pa.binary(1)), pa.scalar(b"b", pa.binary(1))],
+ pa.binary(1)
+ ),
+ ([[1, 2, 3]], [pa.scalar([1, 2, 3])], pa.list_(pa.int64())),
+ ([["a", "b"]], [pa.scalar(["a", "b"])], pa.list_(pa.string())),
+ (
+ [1, 2, None],
+ [pa.scalar(1, type=pa.int8()), pa.scalar(2, type=pa.int8()), None],
+ pa.int8()
+ ),
+ ([1, None], [pa.scalar(1.0, type=pa.int32()), None], pa.int32()),
+ (
+ ["aaa", "bbb"],
+ [pa.scalar("aaa", type=pa.binary(3)), pa.scalar("bbb",
type=pa.binary(3))],
+ pa.binary(3)),
+ ([b"a"], [pa.scalar("a", type=pa.large_binary())], pa.large_binary()),
+ (["a"], [pa.scalar("a", type=pa.large_string())], pa.large_string()),
+ (
+ ["a"],
+ [pa.scalar("a", type=pa.dictionary(pa.int64(), pa.string()))],
+ pa.dictionary(pa.int64(), pa.string())
+ ),
+ (
+ ["a", "b"],
+ [pa.scalar("a", pa.dictionary(pa.int64(), pa.string())),
+ pa.scalar("b", pa.dictionary(pa.int64(), pa.string()))],
+ pa.dictionary(pa.int64(), pa.string())
+ ),
+ (
+ [1],
+ [pa.scalar(1, type=pa.dictionary(pa.int64(), pa.int32()))],
+ pa.dictionary(pa.int64(), pa.int32())
+ ),
+ (
+ [(1, 2)],
+ [pa.scalar([('a', 1), ('b', 2)], type=pa.struct(
+ [('a', pa.int8()), ('b', pa.int8())]))],
+ pa.struct([('a', pa.int8()), ('b', pa.int8())])
+ ),
+ (
+ [(1, 'bar')],
+ [pa.scalar([('a', 1), ('b', 'bar')], type=pa.struct(
+ [('a', pa.int8()), ('b', pa.string())]))],
+ pa.struct([('a', pa.int8()), ('b', pa.string())])
+ )
+])
+def test_array_accepts_pyarrow_scalar(seq, data, scalar_data, value_type):
+ if type(seq(scalar_data)) == set:
+ pytest.skip("The elements in the set get reordered.")
+ expect = pa.array(data, type=value_type)
+ result = pa.array(seq(scalar_data))
+ assert expect.equals(result)
+
+ result = pa.array(seq(scalar_data), type=value_type)
+ assert expect.equals(result)
+
+
+@parametrize_with_collections_types
+def test_array_accepts_pyarrow_scalar_errors(seq):
+ sequence = seq([pa.scalar(1), pa.scalar("a"), pa.scalar(3.0)])
+ with pytest.raises(pa.ArrowInvalid,
+ match="cannot mix scalars with different types"):
+ pa.array(sequence)
+
+ sequence = seq([1, pa.scalar("a"), None])
+ with pytest.raises(pa.ArrowInvalid,
+ match="pyarrow scalars cannot be mixed with other "
+ "Python scalar values currently"):
+ pa.array(sequence)
+
+ sequence = seq([np.float16("0.1"), pa.scalar("a"), None])
+ with pytest.raises(pa.ArrowInvalid,
+ match="pyarrow scalars cannot be mixed with other "
+ "Python scalar values currently"):
+ pa.array(sequence)
+
+ sequence = seq([pa.scalar("a"), np.float16("0.1"), None])
+ with pytest.raises(pa.ArrowInvalid,
+ match="pyarrow scalars cannot be mixed with other "
+ "Python scalar values currently"):
+ pa.array(sequence)
+
+ with pytest.raises(pa.ArrowInvalid,
+ match="Cannot append scalar of type string "
+ "to builder for type int32"):
+ pa.array([pa.scalar("a")], type=pa.int32())
+
+ with pytest.raises(pa.ArrowInvalid,
+ match="Cannot append scalar of type int64 "
+ "to builder for type null"):
+ pa.array([pa.scalar(1)], type=pa.null())