Repository: arrow Updated Branches: refs/heads/master 9afb66778 -> ae95dbd18
ARROW-44: Python: prototype object model for array slot values ("scalars")

Non-exhaustive, but this will facilitate inspecting Arrow data while the library is in development.

```python
In [2]: arr = arrow.from_pylist([['foo', None], None, [], ['qux']])

In [3]: arr
Out[3]: <arrow.array.ListArray at 0x7f1970030f98>

In [4]: arr[0]
Out[4]: ['foo', None]

In [5]: type(arr[0])
Out[5]: arrow.scalar.ListValue

In [6]: arr[0][0]
Out[6]: 'foo'

In [7]: arr[0][1]
Out[7]: NA

In [8]: arr[1]
Out[8]: NA

In [9]: arr[2]
Out[9]: []

In [10]: len(arr[2])
Out[10]: 0

In [11]: arr.type
Out[11]: DataType(list<string>)
```

Author: Wes McKinney <w...@apache.org>

Closes #20 from wesm/ARROW-44 and squashes the following commits:

df06ba1 [Wes McKinney] Add tests for scalars proxying implemented Python list type conversions, fix associated bugs
20fbdc1 [Wes McKinney] Draft scalar box types, no tests yet

Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/ae95dbd1
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/ae95dbd1
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/ae95dbd1

Branch: refs/heads/master
Commit: ae95dbd189477442d39e55fb0a1aede206906cd9
Parents: 9afb667
Author: Wes McKinney <w...@apache.org>
Authored: Mon Mar 7 22:39:07 2016 -0800
Committer: Wes McKinney <w...@apache.org>
Committed: Mon Mar 7 22:39:07 2016 -0800

----------------------------------------------------------------------
 cpp/src/arrow/types/list.h | 6 +-
 python/arrow/__init__.py | 6 +-
 python/arrow/array.pxd | 1 -
 python/arrow/array.pyx | 17 ++-
 python/arrow/compat.py | 6 +
 python/arrow/includes/arrow.pxd | 36 +++++-
 python/arrow/scalar.pxd | 25 ++++-
 python/arrow/scalar.pyx | 165 ++++++++++++++++++++++++++++
 python/arrow/schema.pxd | 2 +
 python/arrow/schema.pyx | 14 +++
 python/arrow/tests/test_scalars.py | 82 ++++++++++++++
 python/src/pyarrow/adapters/builtin.cc | 2 +-
 12 files changed, 342 insertions(+), 20 deletions(-)
---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/cpp/src/arrow/types/list.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index f40a824..210c76a 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -63,7 +63,11 @@ class ListArray : public Array { // Return a shared pointer in case the requestor desires to share ownership // with this array. - const ArrayPtr& values() const {return values_;} + const std::shared_ptr<Array>& values() const {return values_;} + + const std::shared_ptr<DataType>& value_type() const { + return values_->type(); + } const int32_t* offsets() const { return offsets_;} http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/__init__.py ---------------------------------------------------------------------- diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py index 3c049b8..3507ea0 100644 --- a/python/arrow/__init__.py +++ b/python/arrow/__init__.py @@ -24,7 +24,11 @@ from arrow.array import (Array, from_pylist, total_allocated_bytes, from arrow.error import ArrowException -from arrow.scalar import ArrayValue, NA, Scalar +from arrow.scalar import (ArrayValue, Scalar, NA, NAType, + BooleanValue, + Int8Value, Int16Value, Int32Value, Int64Value, + UInt8Value, UInt16Value, UInt32Value, UInt64Value, + FloatValue, DoubleValue, ListValue, StringValue) from arrow.schema import (null, bool_, int8, int16, int32, int64, http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/array.pxd ---------------------------------------------------------------------- diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd index e32d277..04dd8d1 100644 --- a/python/arrow/array.pxd +++ b/python/arrow/array.pxd @@ -34,7 +34,6 @@ cdef class Array: DataType type cdef init(self, const shared_ptr[CArray]& sp_array) - cdef 
_getitem(self, int i) cdef class BooleanArray(Array): http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/array.pyx ---------------------------------------------------------------------- diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index 3a3210d..8ebd01d 100644 --- a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -25,6 +25,7 @@ cimport arrow.includes.pyarrow as pyarrow from arrow.compat import frombytes, tobytes from arrow.error cimport check_status +cimport arrow.scalar as scalar from arrow.scalar import NA def total_allocated_bytes(): @@ -73,13 +74,7 @@ cdef class Array: while key < 0: key += len(self) - if self.ap.IsNull(key): - return NA - else: - return self._getitem(key) - - cdef _getitem(self, int i): - raise NotImplementedError + return scalar.box_arrow_scalar(self.type, self.sp_array, key) def slice(self, start, end): pass @@ -168,12 +163,16 @@ cdef object box_arrow_array(const shared_ptr[CArray]& sp_array): return arr -def from_pylist(object list_obj, type=None): +def from_pylist(object list_obj, DataType type=None): """ Convert Python list to Arrow array """ cdef: shared_ptr[CArray] sp_array - check_status(pyarrow.ConvertPySequence(list_obj, &sp_array)) + if type is None: + check_status(pyarrow.ConvertPySequence(list_obj, &sp_array)) + else: + raise NotImplementedError + return box_arrow_array(sp_array) http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/compat.py ---------------------------------------------------------------------- diff --git a/python/arrow/compat.py b/python/arrow/compat.py index 2ac41ac..08f0f23 100644 --- a/python/arrow/compat.py +++ b/python/arrow/compat.py @@ -54,6 +54,9 @@ if PY2: range = xrange long = long + def u(s): + return unicode(s, "unicode_escape") + def tobytes(o): if isinstance(o, unicode): return o.encode('utf8') @@ -73,6 +76,9 @@ else: from decimal import Decimal range = range + def u(s): + return s + def tobytes(o): if isinstance(o, str): return 
o.encode('utf8') http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/includes/arrow.pxd ---------------------------------------------------------------------- diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index fde5de9..0cc44c0 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -84,13 +84,41 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool IsNull(int i) cdef cppclass CUInt8Array" arrow::UInt8Array"(CArray): - pass + uint8_t Value(int i) cdef cppclass CInt8Array" arrow::Int8Array"(CArray): - pass + int8_t Value(int i) + + cdef cppclass CUInt16Array" arrow::UInt16Array"(CArray): + uint16_t Value(int i) + + cdef cppclass CInt16Array" arrow::Int16Array"(CArray): + int16_t Value(int i) + + cdef cppclass CUInt32Array" arrow::UInt32Array"(CArray): + uint32_t Value(int i) + + cdef cppclass CInt32Array" arrow::Int32Array"(CArray): + int32_t Value(int i) + + cdef cppclass CUInt64Array" arrow::UInt64Array"(CArray): + uint64_t Value(int i) + + cdef cppclass CInt64Array" arrow::Int64Array"(CArray): + int64_t Value(int i) + + cdef cppclass CFloatArray" arrow::FloatArray"(CArray): + float Value(int i) + + cdef cppclass CDoubleArray" arrow::DoubleArray"(CArray): + double Value(int i) cdef cppclass CListArray" arrow::ListArray"(CArray): - pass + const int32_t* offsets() + int32_t offset(int i) + int32_t value_length(int i) + const shared_ptr[CArray]& values() + const shared_ptr[CDataType]& value_type() cdef cppclass CStringArray" arrow::StringArray"(CListArray): - pass + c_string GetString(int i) http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/scalar.pxd ---------------------------------------------------------------------- diff --git a/python/arrow/scalar.pxd b/python/arrow/scalar.pxd index e193c09..15cdc95 100644 --- a/python/arrow/scalar.pxd +++ b/python/arrow/scalar.pxd @@ -16,7 +16,7 @@ # under the License. 
from arrow.includes.common cimport * -from arrow.includes.arrow cimport CArray, CListArray +from arrow.includes.arrow cimport * from arrow.schema cimport DataType @@ -31,17 +31,36 @@ cdef class NAType(Scalar): cdef class ArrayValue(Scalar): cdef: - shared_ptr[CArray] array + shared_ptr[CArray] sp_array int index + cdef void init(self, DataType type, + const shared_ptr[CArray]& sp_array, int index) + + cdef void _set_array(self, const shared_ptr[CArray]& sp_array) + cdef class Int8Value(ArrayValue): pass -cdef class ListValue(ArrayValue): +cdef class Int64Value(ArrayValue): pass +cdef class ListValue(ArrayValue): + cdef readonly: + DataType value_type + + cdef: + CListArray* ap + + cdef _getitem(self, int i) + + cdef class StringValue(ArrayValue): pass + +cdef object box_arrow_scalar(DataType type, + const shared_ptr[CArray]& sp_array, + int index) http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/scalar.pyx ---------------------------------------------------------------------- diff --git a/python/arrow/scalar.pyx b/python/arrow/scalar.pyx index 78dadec..951ede2 100644 --- a/python/arrow/scalar.pyx +++ b/python/arrow/scalar.pyx @@ -15,14 +15,179 @@ # specific language governing permissions and limitations # under the License. 
+from arrow.schema cimport DataType, box_data_type + +from arrow.compat import frombytes import arrow.schema as schema +NA = None + cdef class NAType(Scalar): def __cinit__(self): + global NA + if NA is not None: + raise Exception('Cannot create multiple NAType instances') + self.type = schema.null() def __repr__(self): return 'NA' + def as_py(self): + return None + NA = NAType() + +cdef class ArrayValue(Scalar): + + cdef void init(self, DataType type, const shared_ptr[CArray]& sp_array, + int index): + self.type = type + self.index = index + self._set_array(sp_array) + + cdef void _set_array(self, const shared_ptr[CArray]& sp_array): + self.sp_array = sp_array + + def __repr__(self): + if hasattr(self, 'as_py'): + return repr(self.as_py()) + else: + return Scalar.__repr__(self) + + +cdef class BooleanValue(ArrayValue): + pass + + +cdef class Int8Value(ArrayValue): + + def as_py(self): + cdef CInt8Array* ap = <CInt8Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class UInt8Value(ArrayValue): + + def as_py(self): + cdef CUInt8Array* ap = <CUInt8Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class Int16Value(ArrayValue): + + def as_py(self): + cdef CInt16Array* ap = <CInt16Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class UInt16Value(ArrayValue): + + def as_py(self): + cdef CUInt16Array* ap = <CUInt16Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class Int32Value(ArrayValue): + + def as_py(self): + cdef CInt32Array* ap = <CInt32Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class UInt32Value(ArrayValue): + + def as_py(self): + cdef CUInt32Array* ap = <CUInt32Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class Int64Value(ArrayValue): + + def as_py(self): + cdef CInt64Array* ap = <CInt64Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class UInt64Value(ArrayValue): + + def as_py(self): + cdef CUInt64Array* ap = 
<CUInt64Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class FloatValue(ArrayValue): + + def as_py(self): + cdef CFloatArray* ap = <CFloatArray*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class DoubleValue(ArrayValue): + + def as_py(self): + cdef CDoubleArray* ap = <CDoubleArray*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class StringValue(ArrayValue): + + def as_py(self): + cdef CStringArray* ap = <CStringArray*> self.sp_array.get() + return frombytes(ap.GetString(self.index)) + + +cdef class ListValue(ArrayValue): + + def __len__(self): + return self.ap.value_length(self.index) + + def __getitem__(self, i): + return self._getitem(i) + + cdef void _set_array(self, const shared_ptr[CArray]& sp_array): + self.sp_array = sp_array + self.ap = <CListArray*> sp_array.get() + self.value_type = box_data_type(self.ap.value_type()) + + cdef _getitem(self, int i): + cdef int j = self.ap.offset(self.index) + i + return box_arrow_scalar(self.value_type, self.ap.values(), j) + + def as_py(self): + cdef: + int j + list result = [] + + for j in range(len(self)): + result.append(self._getitem(j).as_py()) + + return result + + +cdef dict _scalar_classes = { + LogicalType_UINT8: Int8Value, + LogicalType_UINT16: Int16Value, + LogicalType_UINT32: Int32Value, + LogicalType_UINT64: Int64Value, + LogicalType_INT8: Int8Value, + LogicalType_INT16: Int16Value, + LogicalType_INT32: Int32Value, + LogicalType_INT64: Int64Value, + LogicalType_FLOAT: FloatValue, + LogicalType_DOUBLE: DoubleValue, + LogicalType_LIST: ListValue, + LogicalType_STRING: StringValue +} + +cdef object box_arrow_scalar(DataType type, + const shared_ptr[CArray]& sp_array, + int index): + cdef ArrayValue val + if sp_array.get().IsNull(index): + return NA + else: + val = _scalar_classes[type.type.type]() + val.init(type, sp_array, index) + return val http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/schema.pxd 
---------------------------------------------------------------------- diff --git a/python/arrow/schema.pxd b/python/arrow/schema.pxd index 487c246..8cc244a 100644 --- a/python/arrow/schema.pxd +++ b/python/arrow/schema.pxd @@ -37,3 +37,5 @@ cdef class Schema: cdef: shared_ptr[CSchema] sp_schema CSchema* schema + +cdef DataType box_data_type(const shared_ptr[CDataType]& type) http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/schema.pyx ---------------------------------------------------------------------- diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx index 63cd6e8..3001531 100644 --- a/python/arrow/schema.pyx +++ b/python/arrow/schema.pyx @@ -85,6 +85,14 @@ cdef DataType primitive_type(LogicalType type, bint nullable=True): def field(name, type): return Field(name, type) +cdef set PRIMITIVE_TYPES = set([ + LogicalType_NA, LogicalType_BOOL, + LogicalType_UINT8, LogicalType_INT8, + LogicalType_UINT16, LogicalType_INT16, + LogicalType_UINT32, LogicalType_INT32, + LogicalType_UINT64, LogicalType_INT64, + LogicalType_FLOAT, LogicalType_DOUBLE]) + def null(): return primitive_type(LogicalType_NA) @@ -148,3 +156,9 @@ def struct(fields, c_bool nullable=True): out.init(shared_ptr[CDataType]( new CStructType(c_fields, nullable))) return out + + +cdef DataType box_data_type(const shared_ptr[CDataType]& type): + cdef DataType out = DataType() + out.init(type) + return out http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/tests/test_scalars.py ---------------------------------------------------------------------- diff --git a/python/arrow/tests/test_scalars.py b/python/arrow/tests/test_scalars.py new file mode 100644 index 0000000..951380b --- /dev/null +++ b/python/arrow/tests/test_scalars.py @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.compat import unittest, u +import arrow + + +class TestScalars(unittest.TestCase): + + def test_null_singleton(self): + with self.assertRaises(Exception): + arrow.NAType() + + def test_bool(self): + pass + + def test_int64(self): + arr = arrow.from_pylist([1, 2, None]) + + v = arr[0] + assert isinstance(v, arrow.Int64Value) + assert repr(v) == "1" + assert v.as_py() == 1 + + assert arr[2] is arrow.NA + + def test_double(self): + arr = arrow.from_pylist([1.5, None, 3]) + + v = arr[0] + assert isinstance(v, arrow.DoubleValue) + assert repr(v) == "1.5" + assert v.as_py() == 1.5 + + assert arr[1] is arrow.NA + + v = arr[2] + assert v.as_py() == 3.0 + + def test_string(self): + arr = arrow.from_pylist(['foo', None, u('bar')]) + + v = arr[0] + assert isinstance(v, arrow.StringValue) + assert repr(v) == "'foo'" + assert v.as_py() == 'foo' + + assert arr[1] is arrow.NA + + v = arr[2].as_py() + assert v == 'bar' + assert isinstance(v, str) + + def test_list(self): + arr = arrow.from_pylist([['foo', None], None, ['bar'], []]) + + v = arr[0] + assert len(v) == 2 + assert isinstance(v, arrow.ListValue) + assert repr(v) == "['foo', None]" + assert v.as_py() == ['foo', None] + assert v[0].as_py() == 'foo' + assert v[1] is arrow.NA + + assert arr[1] is arrow.NA + + v = arr[3] + assert 
len(v) == 0 http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/src/pyarrow/adapters/builtin.cc ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index ae84fa1..60d6248 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -276,7 +276,7 @@ class Int64Converter : public TypedConverter<arrow::Int64Builder> { class DoubleConverter : public TypedConverter<arrow::DoubleBuilder> { public: Status AppendData(PyObject* seq) override { - int64_t val; + double val; Py_ssize_t size = PySequence_Size(seq); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i));