Re: [PR] feat(python): Add user-facing `Array` class [arrow-nanoarrow]

via GitHub Thu, 07 Mar 2024 11:59:34 -0800


danepitkin commented on code in PR #396:
URL: https://github.com/apache/arrow-nanoarrow/pull/396#discussion_r1516716810



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -1192,6 +1199,51 @@ cdef class CArray:
         return _repr_utils.array_repr(self)
 
 
+cdef class CScalar:
+    """Low-level scalar implementation
+
+    This is an implementation of a "Scalar" that is a thin
+    wrapper around a (Python) CArray with the ability to
+    materialize a length-one array.

Review Comment:
   Clever! :) 



##########
python/src/nanoarrow/array.py:
##########
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from functools import cached_property
+from typing import Iterable
+
+from nanoarrow._lib import CDEVICE_CPU, CArray, CDevice, 
CMaterializedArrayStream
+from nanoarrow.c_lib import c_array, c_array_stream
+from nanoarrow.iterator import iterator
+from nanoarrow.schema import Schema
+
+
+class Scalar:
+    """Generic wrapper around an :class:`Array` element
+
+    This class exists to provide a generic implementation of
+    array-like indexing for the :class:`Array`. These objects
+    can currently only be created by extracting an element from
+    an :class:`Array`.
+
+    Note that it is rarely efficient to iterate over Scalar objects:
+    use the iterators in :mod:`nanoarrow.iterator` to more effectively
+    iterate over an :class:`Array`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> array = na.array([1, 2, 3], na.int32())
+    >>> array[0]
+    Scalar<INT32> 1
+    >>> array[0].as_py()
+    1
+    >>> array[0].schema
+    Schema(INT32)
+    """
+
+    def __init__(self):
+        # Private constructor
+        self._c_scalar = None
+        self._schema = None
+        self._device = None
+
+    @property
+    def device(self) -> CDevice:
+        return self._device
+
+    @property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this scalar"""
+        if self._schema is None:
+            self._schema = Schema(self._c_scalar.schema)
+        return self._schema
+
+    def as_py(self):
+        """Get the Python object representation of this scalar"""
+        return next(iterator(self._c_scalar))
+
+    def __repr__(self) -> str:
+        width_hint = 80
+        prefix = f"Scalar<{self.schema.type.name}> "
+        width_hint -= len(prefix)
+
+        py_repr = repr(self.as_py())
+        if len(py_repr) > width_hint:
+            py_repr = py_repr[: (width_hint - 3)] + "..."
+        return f"{prefix}{py_repr}"
+
+
+class Array:
+    """The Array is nanoarrow's high-level in-memory array representation whose
+    scope maps to that of a fully-consumed ArrowArrayStream in the Arrow C Data
+    interface. See :func:`array` for class details.
+    """
+
+    def __init__(self, obj, schema=None, device=None) -> None:
+        if device is None:
+            self._device = CDEVICE_CPU
+        elif not isinstance(device, CDevice):
+            raise TypeError("device must be CDevice")
+
+        if isinstance(obj, Array) and schema is None:
+            self._data = obj._data
+            return
+
+        if isinstance(obj, CArray) and schema is None:
+            self._data = CMaterializedArrayStream.from_c_array(obj)
+            return
+
+        with c_array_stream(obj, schema=schema) as stream:
+            self._data = CMaterializedArrayStream.from_c_array_stream(stream)
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        if self._device is not CDEVICE_CPU:
+            raise RuntimeError("Can't export ArrowArrayStream from non-CPU 
device")
+
+        return self._data.__arrow_c_stream__(requested_schema=requested_schema)
+
+    def __arrow_c_array__(self, requested_schema=None):
+        if self._device is not CDEVICE_CPU:
+            raise RuntimeError("Can't export ArrowArray from non-CPU device")
+
+        if self._data.n_arrays == 0:
+            return c_array([], schema=self._data.schema).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+        elif self._data.n_arrays == 1:
+            return self._data.array(0).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+
+        raise ValueError(
+            f"Can't export Array with {self._data.n_arrays} chunks to 
ArrowArray"
+        )
+
+    @property
+    def device(self) -> CDevice:
+        """Get the device on which the buffers for this array are allocated."""
+        return self._device
+
+    @cached_property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this Array"""
+        return Schema(self._data.schema)
+
+    @property
+    def n_chunks(self) -> int:
+        """Get the number of chunks in the underlying representation of this 
Array."""
+        return self._data.n_arrays
+
+    @property
+    def chunks(self) -> Iterable:

Review Comment:
   nit: I think `iter_chunks()` could be a better name and is better defined as 
a function. I typically think of properties as a static value. 



##########
python/src/nanoarrow/array.py:
##########
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from functools import cached_property
+from typing import Iterable
+
+from nanoarrow._lib import CDEVICE_CPU, CArray, CDevice, 
CMaterializedArrayStream
+from nanoarrow.c_lib import c_array, c_array_stream
+from nanoarrow.iterator import iterator
+from nanoarrow.schema import Schema
+
+
+class Scalar:
+    """Generic wrapper around an :class:`Array` element
+
+    This class exists to provide a generic implementation of
+    array-like indexing for the :class:`Array`. These objects
+    can currently only be created by extracting an element from
+    an :class:`Array`.
+
+    Note that it is rarely efficient to iterate over Scalar objects:
+    use the iterators in :mod:`nanoarrow.iterator` to more effectively
+    iterate over an :class:`Array`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> array = na.array([1, 2, 3], na.int32())
+    >>> array[0]
+    Scalar<INT32> 1
+    >>> array[0].as_py()
+    1
+    >>> array[0].schema
+    Schema(INT32)
+    """
+
+    def __init__(self):
+        # Private constructor
+        self._c_scalar = None
+        self._schema = None
+        self._device = None
+
+    @property
+    def device(self) -> CDevice:
+        return self._device
+
+    @property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this scalar"""
+        if self._schema is None:
+            self._schema = Schema(self._c_scalar.schema)
+        return self._schema
+
+    def as_py(self):
+        """Get the Python object representation of this scalar"""
+        return next(iterator(self._c_scalar))
+
+    def __repr__(self) -> str:
+        width_hint = 80

Review Comment:
   Should we make the width_hint configureable via an additional api such as 
`to_string()`? Then `repr` can call `self.to_string(length=80)`



##########
python/src/nanoarrow/_ipc_lib.pyx:
##########
@@ -50,22 +50,32 @@ cdef extern from "nanoarrow_ipc.h" nogil:
 
 
 cdef class PyInputStreamPrivate:
-    cdef object obj
-    cdef object obj_method
-    cdef void* addr
-    cdef Py_ssize_t size_bytes
-    cdef int close_stream
+    cdef object _obj
+    cdef bint _close_stream
+    cdef void* _addr
+    cdef Py_ssize_t _size_bytes
 
     def __cinit__(self, obj, close_stream=False):
-        self.obj = obj
-        self.obj_method = obj.readinto
-        self.addr = NULL
-        self.size_bytes = 0
-        self.close_stream = close_stream
+        self._obj = obj
+        self._close_stream = close_stream
+        self._addr = NULL
+        self._size_bytes = 0
+
+    @property
+    def obj(self):
+        return self._obj
+
+    @property
+    def close_stream(self):

Review Comment:
   Why does this object hold `close_stream` state? Do we not want 
`py_input_stream_release` to always close the stream?



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -2198,6 +2250,129 @@ cdef class CArrayStream:
         return _repr_utils.array_stream_repr(self)
 
 
+cdef class CMaterializedArrayStream:
+    cdef CSchema _schema
+    cdef CBuffer _array_ends
+    cdef list _arrays
+    cdef int64_t _capacity_arrays
+    cdef int64_t _total_length
+
+    def __cinit__(self):
+        self._arrays = []
+        self._total_length = 0
+        self._schema = CSchema.allocate()
+        self._array_ends = CBuffer.empty()
+        cdef int code = ArrowBufferAppendInt64(self._array_ends._ptr, 0)
+        Error.raise_error_not_ok("ArrowBufferAppendInt64()", code)
+
+    cdef _finalize(self):
+        self._array_ends._set_data_type(NANOARROW_TYPE_INT64)
+
+    @property
+    def schema(self):
+        return self._schema
+
+    @property
+    def array_ends(self):
+        return self._array_ends
+
+    cdef int _resolve_chunk(self, const int64_t* sorted_offsets, int64_t 
index, int64_t start_offset_i,
+                           int64_t end_offset_i) noexcept nogil:
+        if start_offset_i >= (end_offset_i - 1):

Review Comment:
   optional: i challenge you to implement this iteratively instead of 
recursively to save on stack space usage ;) 



##########
python/src/nanoarrow/_ipc_lib.pyx:
##########


Review Comment:
   I think leaving it in is fine!



##########
python/src/nanoarrow/array.py:
##########
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from functools import cached_property
+from typing import Iterable
+
+from nanoarrow._lib import CDEVICE_CPU, CArray, CDevice, 
CMaterializedArrayStream
+from nanoarrow.c_lib import c_array, c_array_stream
+from nanoarrow.iterator import iterator
+from nanoarrow.schema import Schema
+
+
+class Scalar:
+    """Generic wrapper around an :class:`Array` element
+
+    This class exists to provide a generic implementation of
+    array-like indexing for the :class:`Array`. These objects
+    can currently only be created by extracting an element from
+    an :class:`Array`.
+
+    Note that it is rarely efficient to iterate over Scalar objects:
+    use the iterators in :mod:`nanoarrow.iterator` to more effectively
+    iterate over an :class:`Array`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> array = na.array([1, 2, 3], na.int32())
+    >>> array[0]
+    Scalar<INT32> 1
+    >>> array[0].as_py()
+    1
+    >>> array[0].schema
+    Schema(INT32)
+    """
+
+    def __init__(self):
+        # Private constructor
+        self._c_scalar = None
+        self._schema = None
+        self._device = None
+
+    @property
+    def device(self) -> CDevice:
+        return self._device
+
+    @property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this scalar"""
+        if self._schema is None:
+            self._schema = Schema(self._c_scalar.schema)
+        return self._schema
+
+    def as_py(self):
+        """Get the Python object representation of this scalar"""
+        return next(iterator(self._c_scalar))
+
+    def __repr__(self) -> str:
+        width_hint = 80
+        prefix = f"Scalar<{self.schema.type.name}> "
+        width_hint -= len(prefix)
+
+        py_repr = repr(self.as_py())
+        if len(py_repr) > width_hint:
+            py_repr = py_repr[: (width_hint - 3)] + "..."
+        return f"{prefix}{py_repr}"
+
+
+class Array:
+    """The Array is nanoarrow's high-level in-memory array representation whose
+    scope maps to that of a fully-consumed ArrowArrayStream in the Arrow C Data
+    interface. See :func:`array` for class details.
+    """
+
+    def __init__(self, obj, schema=None, device=None) -> None:
+        if device is None:
+            self._device = CDEVICE_CPU
+        elif not isinstance(device, CDevice):
+            raise TypeError("device must be CDevice")
+
+        if isinstance(obj, Array) and schema is None:
+            self._data = obj._data
+            return
+
+        if isinstance(obj, CArray) and schema is None:
+            self._data = CMaterializedArrayStream.from_c_array(obj)
+            return
+
+        with c_array_stream(obj, schema=schema) as stream:
+            self._data = CMaterializedArrayStream.from_c_array_stream(stream)
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        if self._device is not CDEVICE_CPU:
+            raise RuntimeError("Can't export ArrowArrayStream from non-CPU 
device")
+
+        return self._data.__arrow_c_stream__(requested_schema=requested_schema)
+
+    def __arrow_c_array__(self, requested_schema=None):
+        if self._device is not CDEVICE_CPU:
+            raise RuntimeError("Can't export ArrowArray from non-CPU device")
+
+        if self._data.n_arrays == 0:
+            return c_array([], schema=self._data.schema).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+        elif self._data.n_arrays == 1:
+            return self._data.array(0).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+
+        raise ValueError(
+            f"Can't export Array with {self._data.n_arrays} chunks to 
ArrowArray"
+        )
+
+    @property
+    def device(self) -> CDevice:
+        """Get the device on which the buffers for this array are allocated."""
+        return self._device
+
+    @cached_property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this Array"""
+        return Schema(self._data.schema)
+
+    @property
+    def n_chunks(self) -> int:

Review Comment:
   nit: I'd prefer `num_chunks` for naming



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(python): Add user-facing `Array` class [arrow-nanoarrow]

Reply via email to