[
https://issues.apache.org/jira/browse/ARROW-1646?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16342635#comment-16342635
]
ASF GitHub Bot commented on ARROW-1646:
---------------------------------------
xhochy closed pull request #1475: ARROW-1646: [Python] Handle NumPy scalar types
URL: https://github.com/apache/arrow/pull/1475
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/cpp/src/arrow/python/builtin_convert.cc
b/cpp/src/arrow/python/builtin_convert.cc
index cd88d557d..31f3549ef 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -32,6 +32,7 @@
#include "arrow/util/logging.h"
#include "arrow/python/helpers.h"
+#include "arrow/python/numpy_convert.h"
#include "arrow/python/util/datetime.h"
namespace arrow {
@@ -93,6 +94,21 @@ class ScalarVisitor {
++binary_count_;
} else if (PyUnicode_Check(obj)) {
++unicode_count_;
+ } else if (PyArray_CheckAnyScalarExact(obj)) {
+ std::shared_ptr<DataType> type;
+ RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type));
+ if (is_integer(type->id())) {
+ ++int_count_;
+ } else if (is_floating(type->id())) {
+ ++float_count_;
+ } else if (type->id() == Type::TIMESTAMP) {
+ ++timestamp_count_;
+ } else {
+ std::ostringstream ss;
+ ss << "Found a NumPy scalar with Arrow dtype that we cannot handle: ";
+ ss << type->ToString();
+ return Status::Invalid(ss.str());
+ }
} else {
// TODO(wesm): accumulate error information somewhere
static std::string supported_types =
@@ -575,6 +591,24 @@ class TimestampConverter
t = PyDateTime_to_ns(pydatetime);
break;
}
+ } else if (PyArray_CheckAnyScalarExact(item.obj())) {
+ // numpy.datetime64
+ std::shared_ptr<DataType> type;
+ RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(item.obj()),
&type));
+ if (type->id() != Type::TIMESTAMP) {
+ std::ostringstream ss;
+ ss << "Expected np.datetime64 but got: ";
+ ss << type->ToString();
+ return Status::Invalid(ss.str());
+ }
+ const TimestampType& ttype = static_cast<const TimestampType&>(*type);
+ if (unit_ != ttype.unit()) {
+ return Status::NotImplemented(
+ "Cannot convert NumPy datetime64 objects with differing unit");
+ }
+
+ PyDatetimeScalarObject* obj =
reinterpret_cast<PyDatetimeScalarObject*>(item.obj());
+ t = obj->obval;
} else {
t = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
RETURN_IF_PYERROR();
diff --git a/cpp/src/arrow/python/numpy_convert.cc
b/cpp/src/arrow/python/numpy_convert.cc
index 9ed2d73d4..3eba65765 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -146,7 +146,10 @@ Status GetNumPyType(const DataType& type, int* type_num) {
Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out) {
PyArray_Descr* descr = reinterpret_cast<PyArray_Descr*>(dtype);
+ return NumPyDtypeToArrow(descr, out);
+}
+Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out)
{
int type_num = cast_npy_type_compat(descr->type_num);
switch (type_num) {
diff --git a/cpp/src/arrow/python/numpy_convert.h
b/cpp/src/arrow/python/numpy_convert.h
index 93c484892..220e38f2e 100644
--- a/cpp/src/arrow/python/numpy_convert.h
+++ b/cpp/src/arrow/python/numpy_convert.h
@@ -56,6 +56,8 @@ bool is_contiguous(PyObject* array);
ARROW_EXPORT
Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out);
+ARROW_EXPORT
+Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out);
Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out);
Status GetNumPyType(const DataType& type, int* type_num);
diff --git a/cpp/src/arrow/python/numpy_interop.h
b/cpp/src/arrow/python/numpy_interop.h
index b93200cc8..8c569e232 100644
--- a/cpp/src/arrow/python/numpy_interop.h
+++ b/cpp/src/arrow/python/numpy_interop.h
@@ -40,6 +40,7 @@
#endif
#include <numpy/arrayobject.h>
+#include <numpy/arrayscalars.h>
#include <numpy/ufuncobject.h>
namespace arrow {
diff --git a/python/pyarrow/tests/test_convert_builtin.py
b/python/pyarrow/tests/test_convert_builtin.py
index d7760da2f..fa603b1a9 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -23,6 +23,8 @@
import datetime
import decimal
+import numpy as np
+import six
class StrangeIterable:
@@ -33,356 +35,453 @@ def __iter__(self):
return self.lst.__iter__()
-class TestConvertIterable(unittest.TestCase):
-
- def test_iterable_types(self):
- arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
- arr2 = pa.array((0, 1, 2, 3))
-
- assert arr1.equals(arr2)
-
- def test_empty_iterable(self):
- arr = pa.array(StrangeIterable([]))
- assert len(arr) == 0
- assert arr.null_count == 0
- assert arr.type == pa.null()
- assert arr.to_pylist() == []
-
-
-class TestLimitedConvertIterator(unittest.TestCase):
- def test_iterator_types(self):
- arr1 = pa.array(iter(range(3)), type=pa.int64(), size=3)
- arr2 = pa.array((0, 1, 2))
- assert arr1.equals(arr2)
-
- def test_iterator_size_overflow(self):
- arr1 = pa.array(iter(range(3)), type=pa.int64(), size=2)
- arr2 = pa.array((0, 1))
- assert arr1.equals(arr2)
-
- def test_iterator_size_underflow(self):
- arr1 = pa.array(iter(range(3)), type=pa.int64(), size=10)
- arr2 = pa.array((0, 1, 2))
- assert arr1.equals(arr2)
-
-
-class TestConvertSequence(unittest.TestCase):
-
- def test_sequence_types(self):
- arr1 = pa.array([1, 2, 3])
- arr2 = pa.array((1, 2, 3))
-
- assert arr1.equals(arr2)
-
- def test_boolean(self):
- expected = [True, None, False, None]
- arr = pa.array(expected)
- assert len(arr) == 4
- assert arr.null_count == 2
- assert arr.type == pa.bool_()
- assert arr.to_pylist() == expected
-
- def test_empty_list(self):
- arr = pa.array([])
- assert len(arr) == 0
- assert arr.null_count == 0
- assert arr.type == pa.null()
- assert arr.to_pylist() == []
-
- def test_all_none(self):
- arr = pa.array([None, None])
- assert len(arr) == 2
- assert arr.null_count == 2
- assert arr.type == pa.null()
- assert arr.to_pylist() == [None, None]
-
- def test_integer(self):
- expected = [1, None, 3, None]
- arr = pa.array(expected)
- assert len(arr) == 4
- assert arr.null_count == 2
- assert arr.type == pa.int64()
- assert arr.to_pylist() == expected
-
- def test_garbage_collection(self):
- import gc
-
- # Force the cyclic garbage collector to run
- gc.collect()
-
- bytes_before = pa.total_allocated_bytes()
- pa.array([1, None, 3, None])
- gc.collect()
- assert pa.total_allocated_bytes() == bytes_before
-
- def test_double(self):
- data = [1.5, 1, None, 2.5, None, None]
- arr = pa.array(data)
- assert len(arr) == 6
- assert arr.null_count == 3
- assert arr.type == pa.float64()
- assert arr.to_pylist() == data
-
- def test_unicode(self):
- data = [u'foo', u'bar', None, u'mañana']
- arr = pa.array(data)
- assert len(arr) == 4
- assert arr.null_count == 1
- assert arr.type == pa.string()
- assert arr.to_pylist() == data
-
- def test_bytes(self):
- u1 = b'ma\xc3\xb1ana'
- data = [b'foo',
- u1.decode('utf-8'), # unicode gets encoded,
- None]
- arr = pa.array(data)
- assert len(arr) == 3
- assert arr.null_count == 1
- assert arr.type == pa.binary()
- assert arr.to_pylist() == [b'foo', u1, None]
-
- def test_utf8_to_unicode(self):
- # ARROW-1225
- data = [b'foo', None, b'bar']
- arr = pa.array(data, type=pa.string())
- assert arr[0].as_py() == u'foo'
-
- # test a non-utf8 unicode string
- val = (u'mañana').encode('utf-16-le')
- with pytest.raises(pa.ArrowException):
- pa.array([val], type=pa.string())
-
- def test_fixed_size_bytes(self):
- data = [b'foof', None, b'barb', b'2346']
- arr = pa.array(data, type=pa.binary(4))
- assert len(arr) == 4
- assert arr.null_count == 1
- assert arr.type == pa.binary(4)
- assert arr.to_pylist() == data
-
- def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
- data = [b'foo', None, b'barb', b'2346']
- with self.assertRaises(pa.ArrowInvalid):
- pa.array(data, type=pa.binary(4))
-
- def test_date(self):
- data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
- datetime.date(2040, 2, 26)]
- arr = pa.array(data)
- assert len(arr) == 4
- assert arr.type == pa.date64()
- assert arr.null_count == 1
- assert arr[0].as_py() == datetime.date(2000, 1, 1)
- assert arr[1].as_py() is None
- assert arr[2].as_py() == datetime.date(1970, 1, 1)
- assert arr[3].as_py() == datetime.date(2040, 2, 26)
-
- def test_date32(self):
- data = [datetime.date(2000, 1, 1), None]
- arr = pa.array(data, type=pa.date32())
-
- data2 = [10957, None]
- arr2 = pa.array(data2, type=pa.date32())
-
- for x in [arr, arr2]:
- assert len(x) == 2
- assert x.type == pa.date32()
- assert x.null_count == 1
- assert x[0].as_py() == datetime.date(2000, 1, 1)
- assert x[1] is pa.NA
-
- # Overflow
- data3 = [2**32, None]
- with pytest.raises(pa.ArrowException):
- pa.array(data3, type=pa.date32())
-
- def test_timestamp(self):
- data = [
- datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
- None,
- datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
- datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
- ]
- arr = pa.array(data)
- assert len(arr) == 4
- assert arr.type == pa.timestamp('us')
- assert arr.null_count == 1
- assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
- 23, 34, 123456)
- assert arr[1].as_py() is None
- assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
- 34, 56, 432539)
- assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
- 46, 57, 437699)
-
- def test_timestamp_with_unit(self):
- data = [
- datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
- ]
-
- s = pa.timestamp('s')
- ms = pa.timestamp('ms')
- us = pa.timestamp('us')
- ns = pa.timestamp('ns')
-
- arr_s = pa.array(data, type=s)
- assert len(arr_s) == 1
- assert arr_s.type == s
- assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
- 23, 34, 0)
-
- arr_ms = pa.array(data, type=ms)
- assert len(arr_ms) == 1
- assert arr_ms.type == ms
- assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
- 23, 34, 123000)
-
- arr_us = pa.array(data, type=us)
- assert len(arr_us) == 1
- assert arr_us.type == us
- assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
- 23, 34, 123456)
-
- arr_ns = pa.array(data, type=ns)
- assert len(arr_ns) == 1
- assert arr_ns.type == ns
- assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
- 23, 34, 123456)
-
- def test_timestamp_from_int_with_unit(self):
- data = [1]
-
- s = pa.timestamp('s')
- ms = pa.timestamp('ms')
- us = pa.timestamp('us')
- ns = pa.timestamp('ns')
-
- arr_s = pa.array(data, type=s)
- assert len(arr_s) == 1
- assert arr_s.type == s
- assert str(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"
-
- arr_ms = pa.array(data, type=ms)
- assert len(arr_ms) == 1
- assert arr_ms.type == ms
- assert str(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"
-
- arr_us = pa.array(data, type=us)
- assert len(arr_us) == 1
- assert arr_us.type == us
- assert str(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"
-
- arr_ns = pa.array(data, type=ns)
- assert len(arr_ns) == 1
- assert arr_ns.type == ns
- assert str(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"
-
- with pytest.raises(pa.ArrowException):
- class CustomClass():
- pass
- pa.array([1, CustomClass()], type=ns)
- pa.array([1, CustomClass()], type=pa.date32())
- pa.array([1, CustomClass()], type=pa.date64())
-
- def test_mixed_nesting_levels(self):
- pa.array([1, 2, None])
- pa.array([[1], [2], None])
- pa.array([[1], [2], [None]])
-
- with self.assertRaises(pa.ArrowInvalid):
- pa.array([1, 2, [1]])
-
- with self.assertRaises(pa.ArrowInvalid):
- pa.array([1, 2, []])
-
- with self.assertRaises(pa.ArrowInvalid):
- pa.array([[1], [2], [None, [1]]])
-
- def test_list_of_int(self):
- data = [[1, 2, 3], [], None, [1, 2]]
- arr = pa.array(data)
- assert len(arr) == 4
- assert arr.null_count == 1
- assert arr.type == pa.list_(pa.int64())
- assert arr.to_pylist() == data
-
- def test_mixed_types_fails(self):
- data = ['a', 1, 2.0]
- with self.assertRaises(pa.ArrowException):
- pa.array(data)
-
- def test_mixed_types_with_specified_type_fails(self):
- data = ['-10', '-5', {'a': 1}, '0', '5', '10']
-
- type = pa.string()
- with self.assertRaises(pa.ArrowInvalid):
- pa.array(data, type=type)
-
- def test_decimal(self):
- data = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')]
- type = pa.decimal128(precision=7, scale=3)
- arr = pa.array(data, type=type)
- assert arr.to_pylist() == data
-
- def test_decimal_different_precisions(self):
- data = [
- decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234')
- ]
- type = pa.decimal128(precision=13, scale=3)
- arr = pa.array(data, type=type)
- assert arr.to_pylist() == data
-
- def test_decimal_no_scale(self):
- data = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')]
- type = pa.decimal128(precision=10)
- arr = pa.array(data, type=type)
- assert arr.to_pylist() == data
-
- def test_decimal_negative(self):
- data = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')]
- type = pa.decimal128(precision=10, scale=6)
- arr = pa.array(data, type=type)
- assert arr.to_pylist() == data
-
- def test_decimal_no_whole_part(self):
- data = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')]
- type = pa.decimal128(precision=7, scale=7)
- arr = pa.array(data, type=type)
- assert arr.to_pylist() == data
-
- def test_decimal_large_integer(self):
- data = [decimal.Decimal('-394029506937548693.42983'),
- decimal.Decimal('32358695912932.01033')]
- type = pa.decimal128(precision=23, scale=5)
- arr = pa.array(data, type=type)
- assert arr.to_pylist() == data
-
- def test_range_types(self):
- arr1 = pa.array(range(3))
- arr2 = pa.array((0, 1, 2))
- assert arr1.equals(arr2)
-
- def test_empty_range(self):
- arr = pa.array(range(0))
- assert len(arr) == 0
- assert arr.null_count == 0
- assert arr.type == pa.null()
- assert arr.to_pylist() == []
-
- def test_structarray(self):
- ints = pa.array([None, 2, 3], type=pa.int64())
- strs = pa.array([u'a', None, u'c'], type=pa.string())
- bools = pa.array([True, False, None], type=pa.bool_())
- arr = pa.StructArray.from_arrays(
- ['ints', 'strs', 'bools'],
- [ints, strs, bools])
-
- expected = [
- {'ints': None, 'strs': u'a', 'bools': True},
- {'ints': 2, 'strs': None, 'bools': False},
- {'ints': 3, 'strs': u'c', 'bools': None},
- ]
-
- pylist = arr.to_pylist()
- assert pylist == expected, (pylist, expected)
+def test_iterable_types():
+ arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
+ arr2 = pa.array((0, 1, 2, 3))
+
+ assert arr1.equals(arr2)
+
+
+def test_empty_iterable():
+ arr = pa.array(StrangeIterable([]))
+ assert len(arr) == 0
+ assert arr.null_count == 0
+ assert arr.type == pa.null()
+ assert arr.to_pylist() == []
+
+
+def test_limited_iterator_types():
+ arr1 = pa.array(iter(range(3)), type=pa.int64(), size=3)
+ arr2 = pa.array((0, 1, 2))
+ assert arr1.equals(arr2)
+
+
+def test_limited_iterator_size_overflow():
+ arr1 = pa.array(iter(range(3)), type=pa.int64(), size=2)
+ arr2 = pa.array((0, 1))
+ assert arr1.equals(arr2)
+
+
+def test_limited_iterator_size_underflow():
+ arr1 = pa.array(iter(range(3)), type=pa.int64(), size=10)
+ arr2 = pa.array((0, 1, 2))
+ assert arr1.equals(arr2)
+
+
+def _as_list(xs):
+ return xs
+
+
+def _as_tuple(xs):
+ return tuple(xs)
+
+
+def _as_dict_values(xs):
+ dct = {k: v for k, v in enumerate(xs)}
+ return six.viewvalues(dct)
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+def test_sequence_types(seq):
+ arr1 = pa.array(seq([1, 2, 3]))
+ arr2 = pa.array([1, 2, 3])
+
+ assert arr1.equals(arr2)
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+def test_sequence_boolean(seq):
+ expected = [True, None, False, None]
+ arr = pa.array(seq(expected))
+ assert len(arr) == 4
+ assert arr.null_count == 2
+ assert arr.type == pa.bool_()
+ assert arr.to_pylist() == expected
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+def test_sequence_numpy_boolean(seq):
+ expected = [np.bool(True), None, np.bool(False), None]
+ arr = pa.array(seq(expected))
+ assert len(arr) == 4
+ assert arr.null_count == 2
+ assert arr.type == pa.bool_()
+ assert arr.to_pylist() == expected
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+def test_empty_list(seq):
+ arr = pa.array(seq([]))
+ assert len(arr) == 0
+ assert arr.null_count == 0
+ assert arr.type == pa.null()
+ assert arr.to_pylist() == []
+
+
+def test_sequence_all_none():
+ arr = pa.array([None, None])
+ assert len(arr) == 2
+ assert arr.null_count == 2
+ assert arr.type == pa.null()
+ assert arr.to_pylist() == [None, None]
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+def test_sequence_integer(seq):
+ expected = [1, None, 3, None]
+ arr = pa.array(seq(expected))
+ assert len(arr) == 4
+ assert arr.null_count == 2
+ assert arr.type == pa.int64()
+ assert arr.to_pylist() == expected
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+@pytest.mark.parametrize("np_scalar", [np.int16, np.int32, np.int64, np.uint16,
+ np.uint32, np.uint64])
+def test_sequence_numpy_integer(seq, np_scalar):
+ expected = [np_scalar(1), None, np_scalar(3), None]
+ arr = pa.array(seq(expected))
+ assert len(arr) == 4
+ assert arr.null_count == 2
+ assert arr.type == pa.int64()
+ assert arr.to_pylist() == expected
+
+
+def test_garbage_collection():
+ import gc
+
+ # Force the cyclic garbage collector to run
+ gc.collect()
+
+ bytes_before = pa.total_allocated_bytes()
+ pa.array([1, None, 3, None])
+ gc.collect()
+ assert pa.total_allocated_bytes() == bytes_before
+
+
+def test_sequence_double():
+ data = [1.5, 1, None, 2.5, None, None]
+ arr = pa.array(data)
+ assert len(arr) == 6
+ assert arr.null_count == 3
+ assert arr.type == pa.float64()
+ assert arr.to_pylist() == data
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+@pytest.mark.parametrize("np_scalar", [np.float16, np.float32, np.float64])
+def test_sequence_numpy_double(seq, np_scalar):
+ data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, None]
+ arr = pa.array(seq(data))
+ assert len(arr) == 6
+ assert arr.null_count == 3
+ assert arr.type == pa.float64()
+ assert arr.to_pylist() == data
+
+
+def test_sequence_unicode():
+ data = [u'foo', u'bar', None, u'mañana']
+ arr = pa.array(data)
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ assert arr.type == pa.string()
+ assert arr.to_pylist() == data
+
+
+def test_sequence_bytes():
+ u1 = b'ma\xc3\xb1ana'
+ data = [b'foo',
+ u1.decode('utf-8'), # unicode gets encoded,
+ None]
+ arr = pa.array(data)
+ assert len(arr) == 3
+ assert arr.null_count == 1
+ assert arr.type == pa.binary()
+ assert arr.to_pylist() == [b'foo', u1, None]
+
+
+def test_sequence_utf8_to_unicode():
+ # ARROW-1225
+ data = [b'foo', None, b'bar']
+ arr = pa.array(data, type=pa.string())
+ assert arr[0].as_py() == u'foo'
+
+ # test a non-utf8 unicode string
+ val = (u'mañana').encode('utf-16-le')
+ with pytest.raises(pa.ArrowException):
+ pa.array([val], type=pa.string())
+
+
+def test_sequence_fixed_size_bytes():
+ data = [b'foof', None, b'barb', b'2346']
+ arr = pa.array(data, type=pa.binary(4))
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ assert arr.type == pa.binary(4)
+ assert arr.to_pylist() == data
+
+
+def test_fixed_size_bytes_does_not_accept_varying_lengths():
+ data = [b'foo', None, b'barb', b'2346']
+ with pytest.raises(pa.ArrowInvalid):
+ pa.array(data, type=pa.binary(4))
+
+
+def test_sequence_date():
+ data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
+ datetime.date(2040, 2, 26)]
+ arr = pa.array(data)
+ assert len(arr) == 4
+ assert arr.type == pa.date64()
+ assert arr.null_count == 1
+ assert arr[0].as_py() == datetime.date(2000, 1, 1)
+ assert arr[1].as_py() is None
+ assert arr[2].as_py() == datetime.date(1970, 1, 1)
+ assert arr[3].as_py() == datetime.date(2040, 2, 26)
+
+
+def test_sequence_date32():
+ data = [datetime.date(2000, 1, 1), None]
+ arr = pa.array(data, type=pa.date32())
+
+ data2 = [10957, None]
+ arr2 = pa.array(data2, type=pa.date32())
+
+ for x in [arr, arr2]:
+ assert len(x) == 2
+ assert x.type == pa.date32()
+ assert x.null_count == 1
+ assert x[0].as_py() == datetime.date(2000, 1, 1)
+ assert x[1] is pa.NA
+
+ # Overflow
+ data3 = [2**32, None]
+ with pytest.raises(pa.ArrowException):
+ pa.array(data3, type=pa.date32())
+
+
+def test_sequence_timestamp():
+ data = [
+ datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
+ None,
+ datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
+ datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
+ ]
+ arr = pa.array(data)
+ assert len(arr) == 4
+ assert arr.type == pa.timestamp('us')
+ assert arr.null_count == 1
+ assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 123456)
+ assert arr[1].as_py() is None
+ assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
+ 34, 56, 432539)
+ assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
+ 46, 57, 437699)
+
+
+def test_sequence_numpy_timestamp():
+ data = [
+ np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)),
+ None,
+ np.datetime64(datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)),
+ np.datetime64(datetime.datetime(2010, 8, 13, 5, 46, 57, 437699))
+ ]
+ arr = pa.array(data)
+ assert len(arr) == 4
+ assert arr.type == pa.timestamp('us')
+ assert arr.null_count == 1
+ assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 123456)
+ assert arr[1].as_py() is None
+ assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
+ 34, 56, 432539)
+ assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
+ 46, 57, 437699)
+
+
+def test_sequence_timestamp_with_unit():
+ data = [
+ datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
+ ]
+
+ s = pa.timestamp('s')
+ ms = pa.timestamp('ms')
+ us = pa.timestamp('us')
+ ns = pa.timestamp('ns')
+
+ arr_s = pa.array(data, type=s)
+ assert len(arr_s) == 1
+ assert arr_s.type == s
+ assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 0)
+
+ arr_ms = pa.array(data, type=ms)
+ assert len(arr_ms) == 1
+ assert arr_ms.type == ms
+ assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 123000)
+
+ arr_us = pa.array(data, type=us)
+ assert len(arr_us) == 1
+ assert arr_us.type == us
+ assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 123456)
+
+ arr_ns = pa.array(data, type=ns)
+ assert len(arr_ns) == 1
+ assert arr_ns.type == ns
+ assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 123456)
+
+
+def test_sequence_timestamp_from_int_with_unit():
+ data = [1]
+
+ s = pa.timestamp('s')
+ ms = pa.timestamp('ms')
+ us = pa.timestamp('us')
+ ns = pa.timestamp('ns')
+
+ arr_s = pa.array(data, type=s)
+ assert len(arr_s) == 1
+ assert arr_s.type == s
+ assert str(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"
+
+ arr_ms = pa.array(data, type=ms)
+ assert len(arr_ms) == 1
+ assert arr_ms.type == ms
+ assert str(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"
+
+ arr_us = pa.array(data, type=us)
+ assert len(arr_us) == 1
+ assert arr_us.type == us
+ assert str(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"
+
+ arr_ns = pa.array(data, type=ns)
+ assert len(arr_ns) == 1
+ assert arr_ns.type == ns
+ assert str(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"
+
+ with pytest.raises(pa.ArrowException):
+ class CustomClass():
+ pass
+ pa.array([1, CustomClass()], type=ns)
+ pa.array([1, CustomClass()], type=pa.date32())
+ pa.array([1, CustomClass()], type=pa.date64())
+
+
+def test_sequence_mixed_nesting_levels():
+ pa.array([1, 2, None])
+ pa.array([[1], [2], None])
+ pa.array([[1], [2], [None]])
+
+ with pytest.raises(pa.ArrowInvalid):
+ pa.array([1, 2, [1]])
+
+ with pytest.raises(pa.ArrowInvalid):
+ pa.array([1, 2, []])
+
+ with pytest.raises(pa.ArrowInvalid):
+ pa.array([[1], [2], [None, [1]]])
+
+
+def test_sequence_list_of_int():
+ data = [[1, 2, 3], [], None, [1, 2]]
+ arr = pa.array(data)
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ assert arr.type == pa.list_(pa.int64())
+ assert arr.to_pylist() == data
+
+
+def test_sequence_mixed_types_fails():
+ data = ['a', 1, 2.0]
+ with pytest.raises(pa.ArrowException):
+ pa.array(data)
+
+
+def test_sequence_mixed_types_with_specified_type_fails():
+ data = ['-10', '-5', {'a': 1}, '0', '5', '10']
+
+ type = pa.string()
+ with pytest.raises(pa.ArrowInvalid):
+ pa.array(data, type=type)
+
+
+def test_sequence_decimal():
+ data = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')]
+ type = pa.decimal128(precision=7, scale=3)
+ arr = pa.array(data, type=type)
+ assert arr.to_pylist() == data
+
+
+def test_sequence_decimal_different_precisions():
+ data = [
+ decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234')
+ ]
+ type = pa.decimal128(precision=13, scale=3)
+ arr = pa.array(data, type=type)
+ assert arr.to_pylist() == data
+
+
+def test_sequence_decimal_no_scale():
+ data = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')]
+ type = pa.decimal128(precision=10)
+ arr = pa.array(data, type=type)
+ assert arr.to_pylist() == data
+
+
+def test_sequence_decimal_negative():
+ data = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')]
+ type = pa.decimal128(precision=10, scale=6)
+ arr = pa.array(data, type=type)
+ assert arr.to_pylist() == data
+
+
+def test_sequence_decimal_no_whole_part():
+ data = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')]
+ type = pa.decimal128(precision=7, scale=7)
+ arr = pa.array(data, type=type)
+ assert arr.to_pylist() == data
+
+
+def test_sequence_decimal_large_integer():
+ data = [decimal.Decimal('-394029506937548693.42983'),
+ decimal.Decimal('32358695912932.01033')]
+ type = pa.decimal128(precision=23, scale=5)
+ arr = pa.array(data, type=type)
+ assert arr.to_pylist() == data
+
+
+def test_range_types():
+ arr1 = pa.array(range(3))
+ arr2 = pa.array((0, 1, 2))
+ assert arr1.equals(arr2)
+
+
+def test_empty_range():
+ arr = pa.array(range(0))
+ assert len(arr) == 0
+ assert arr.null_count == 0
+ assert arr.type == pa.null()
+ assert arr.to_pylist() == []
+
+
+def test_structarray():
+ ints = pa.array([None, 2, 3], type=pa.int64())
+ strs = pa.array([u'a', None, u'c'], type=pa.string())
+ bools = pa.array([True, False, None], type=pa.bool_())
+ arr = pa.StructArray.from_arrays(
+ ['ints', 'strs', 'bools'],
+ [ints, strs, bools])
+
+ expected = [
+ {'ints': None, 'strs': u'a', 'bools': True},
+ {'ints': 2, 'strs': None, 'bools': False},
+ {'ints': 3, 'strs': u'c', 'bools': None},
+ ]
+
+ pylist = arr.to_pylist()
+ assert pylist == expected, (pylist, expected)
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> [Python] pyarrow.array cannot handle NumPy scalar types
> -------------------------------------------------------
>
> Key: ARROW-1646
> URL: https://issues.apache.org/jira/browse/ARROW-1646
> Project: Apache Arrow
> Issue Type: Bug
> Components: Python
> Affects Versions: 0.7.1
> Reporter: Wes McKinney
> Assignee: Uwe L. Korn
> Priority: Major
> Labels: pull-request-available
> Fix For: 0.9.0
>
>
> Example repro
> {code}
> In [1]: import pyarrow as pa
> impo
> In [2]: import numpy as np
> In [3]: pa.array([np.random.randint(0, 10, size=5), None])
> ---------------------------------------------------------------------------
> ArrowInvalid Traceback (most recent call last)
> <ipython-input-3-b364fa5d75aa> in <module>()
> ----> 1 pa.array([np.random.randint(0, 10, size=5), None])
> /home/wesm/code/arrow/python/pyarrow/array.pxi in pyarrow.lib.array
> (/home/wesm/code/arrow/python/build/temp.linux-x86_64-3.5/lib.cxx:24892)()
> 171 if mask is not None:
> 172 raise ValueError("Masks only supported with ndarray-like
> inputs")
> --> 173 return _sequence_to_array(obj, size, type, pool)
> 174
> 175
> /home/wesm/code/arrow/python/pyarrow/array.pxi in
> pyarrow.lib._sequence_to_array
> (/home/wesm/code/arrow/python/build/temp.linux-x86_64-3.5/lib.cxx:23496)()
> 23 if type is None:
> 24 with nogil:
> ---> 25 check_status(ConvertPySequence(sequence, pool, &out))
> 26 else:
> 27 if size is None:
> /home/wesm/code/arrow/python/pyarrow/error.pxi in pyarrow.lib.check_status
> (/home/wesm/code/arrow/python/build/temp.linux-x86_64-3.5/lib.cxx:7876)()
> 75 message = frombytes(status.message())
> 76 if status.IsInvalid():
> ---> 77 raise ArrowInvalid(message)
> 78 elif status.IsIOError():
> 79 raise ArrowIOError(message)
> ArrowInvalid:
> /home/wesm/code/arrow/cpp/src/arrow/python/builtin_convert.cc:740 code:
> InferArrowTypeAndSize(obj, &size, &type)
> /home/wesm/code/arrow/cpp/src/arrow/python/builtin_convert.cc:319 code:
> InferArrowType(obj, out_type)
> /home/wesm/code/arrow/cpp/src/arrow/python/builtin_convert.cc:299 code:
> seq_visitor.Visit(obj)
> /home/wesm/code/arrow/cpp/src/arrow/python/builtin_convert.cc:180 code:
> VisitElem(ref, level)
> Error inferring Arrow data type for collection of Python objects. Got Python
> object of type ndarray but can only handle these types: bool, float, integer,
> date, datetime, bytes, unicode
> {code}
> If these inner values are converted to Python built-in int types then it
> works fine
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)