Re: [PR] feat(python): Add user-facing `Array` class [arrow-nanoarrow]

via GitHub Wed, 13 Mar 2024 12:56:50 -0700


danepitkin commented on code in PR #396:
URL: https://github.com/apache/arrow-nanoarrow/pull/396#discussion_r1523836715



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -2198,6 +2200,130 @@ cdef class CArrayStream:
         return _repr_utils.array_stream_repr(self)
 
 
+cdef class CMaterializedArrayStream:
+    cdef CSchema _schema
+    cdef CBuffer _array_ends
+    cdef list _arrays
+    cdef int64_t _total_length
+
+    def __cinit__(self):
+        self._arrays = []
+        self._total_length = 0
+        self._schema = CSchema.allocate()
+        self._array_ends = CBuffer.empty()
+        cdef int code = ArrowBufferAppendInt64(self._array_ends._ptr, 0)
+        Error.raise_error_not_ok("ArrowBufferAppendInt64()", code)
+
+    cdef _finalize(self):
+        self._array_ends._set_data_type(NANOARROW_TYPE_INT64)
+
+    @property
+    def schema(self):
+        return self._schema
+
+    @property
+    def array_ends(self):
+        return self._array_ends
+
+    def __getitem__(self, k):
+        cdef int64_t kint
+        cdef int array_i
+        cdef const int64_t* sorted_offsets = <int64_t*>self._array_ends._ptr.data
+
+        if not isinstance(k, slice):
+            kint = k
+            if kint < 0:
+                kint += self._total_length
+            if kint < 0 or kint >= self._total_length:
+                raise IndexError(f"Index {kint} is out of range")
+
+            array_i = ArrowResolveChunk64(kint, sorted_offsets, 0, len(self._arrays))
+            kint -= sorted_offsets[array_i]
+            return self._arrays[array_i], kint
+
+        raise NotImplementedError("index with slice")
+
+    def __len__(self):
+        return self._array_ends[len(self._arrays)]
+
+    def __iter__(self):
+        for c_array in self._arrays:
+            for item_i in range(c_array.length):
+                yield c_array, item_i
+
+    def array(self, int64_t i):
+        return self._arrays[i]
+
+    @property
+    def n_arrays(self):

Review Comment:
   ```suggestion
       def num_arrays(self):
   ```
   
   Just a suggestion, in case we want to be consistent with `num_chunks`.



##########
python/src/nanoarrow/array.py:
##########
@@ -0,0 +1,498 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from functools import cached_property
+from typing import Iterable, Tuple
+
+from nanoarrow._lib import (
+    CDEVICE_CPU,
+    CArray,
+    CBuffer,
+    CDevice,
+    CMaterializedArrayStream,
+)
+from nanoarrow.c_lib import c_array, c_array_stream, c_array_view
+from nanoarrow.iterator import iter_py, iter_tuples
+from nanoarrow.schema import Schema
+
+from nanoarrow import _repr_utils
+
+
+class Scalar:
+    """Generic wrapper around an :class:`Array` element
+
+    This class exists to provide a generic implementation of
+    array-like indexing for the :class:`Array`. These objects
+    can currently only be created by extracting an element from
+    an :class:`Array`.
+
+    Note that it is rarely efficient to iterate over Scalar objects:
+    use the iterators in :mod:`nanoarrow.iterator` to more effectively
+    iterate over an :class:`Array`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> array = na.Array([1, 2, 3], na.int32())
+    >>> array[0]
+    Scalar<int32> 1
+    >>> array[0].as_py()
+    1
+    >>> array[0].schema
+    Schema(INT32)
+    """
+
+    def __init__(self):
+        # Private constructor
+        self._c_array = None
+        self._offset = None
+        self._schema = None
+        self._device = None
+
+    @property
+    def device(self) -> CDevice:
+        return self._device
+
+    @property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this scalar"""
+        return self._schema
+
+    def as_py(self):
+        """Get the Python object representation of this scalar"""
+        return next(iter_py(self))
+
+    def to_string(self, width_hint=80) -> str:
+        c_schema_string = _repr_utils.c_schema_to_string(
+            self._c_array.schema, width_hint // 4
+        )
+
+        prefix = f"Scalar<{c_schema_string}> "
+        width_hint -= len(prefix)
+
+        py_repr = repr(self.as_py())
+        if len(py_repr) > width_hint:
+            py_repr = py_repr[: (width_hint - 3)] + "..."
+        return f"{prefix}{py_repr}"
+
+    def __repr__(self) -> str:
+        return self.to_string()
+
+    def __arrow_c_array__(self, requested_schema=None):
+        array = self._c_array[self._offset : (self._offset + 1)]
+        return array.__arrow_c_array__(requested_schema=requested_schema)
+
+
+class Array:
+    """High-level in-memory Array representation
+
+    The Array is nanoarrow's high-level in-memory array representation whose
+    scope maps to that of a fully-consumed ArrowArrayStream in the Arrow C Data
+    interface. See :func:`array` for class details.
+
+    The :class:`Array` class is nanoarrow's high-level in-memory array
+    representation, encompasing the role of PyArrow's ``Array``,
+    ``ChunkedArray``, ``RecordBatch``, and ``Table``. This scope maps
+    to that of a fully-consumed ``ArrowArrayStream`` as represented by
+    the Arrow C Stream interface.
+
+    Note that an :class:`Array` is not necessarily contiguous in memory (i.e.,
+    it may consist of zero or more ``ArrowArray``s).
+
+    Parameters
+    ----------
+    obj : array or array stream-like
+        An array-like or array stream-like object as sanitized by
+        :func:`c_array_stream`.
+    schema : schema-like, optional
+        An optional schema, passed to :func:`c_array_stream`.
+    device : CDevice, optional
+        The device associated with the buffers held by this Array.
+        Defaults to the CPU device.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> na.Array([1, 2, 3], na.int32())
+    nanoarrow.Array<int32>[3]
+    1
+    2
+    3
+    """
+
+    def __init__(self, obj, schema=None, device=None) -> None:
+        if device is None:
+            self._device = CDEVICE_CPU
+        elif isinstance(device, CDevice):
+            self._device = device
+        else:
+            raise TypeError("device must be CDevice")
+
+        if isinstance(obj, CMaterializedArrayStream) and schema is None:
+            self._data = obj
+            return
+
+        if isinstance(obj, Array) and schema is None:
+            self._data = obj._data
+            return
+
+        if isinstance(obj, CArray) and schema is None:
+            self._data = CMaterializedArrayStream.from_c_array(obj)
+            return
+
+        with c_array_stream(obj, schema=schema) as stream:
+            self._data = CMaterializedArrayStream.from_c_array_stream(stream)
+
+    def _assert_one_chunk(self, op):
+        if self._data.n_arrays != 1:
+            raise ValueError(f"Can't {op} with non-contiguous Array")
+
+    def _assert_cpu(self, op):
+        if self._device != CDEVICE_CPU:
+            raise ValueError(f"Can't {op} with Array on non-CPU device")
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        self._assert_cpu("export ArrowArrayStream")
+        return self._data.__arrow_c_stream__(requested_schema=requested_schema)
+
+    def __arrow_c_array__(self, requested_schema=None):
+        self._assert_cpu("export ArrowArray")
+
+        if self._data.n_arrays == 0:
+            return c_array([], schema=self._data.schema).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+        elif self._data.n_arrays == 1:
+            return self._data.array(0).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+
+        self._assert_one_chunk("export ArrowArray")
+
+    @property
+    def device(self) -> CDevice:
+        """Get the device on which the buffers for this array are allocated.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> array.device
+        <nanoarrow.device.CDevice>
+        - device_type: 1
+        - device_id: 0
+        """
+        return self._device
+
+    @cached_property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this Array"""
+        return Schema(self._data.schema)
+
+    @property
+    def n_buffers(self) -> int:

Review Comment:
   ```suggestion
       def num_buffers(self) -> int:
   ```
   
   You should consider my `n` vs `num` suggestion optional, but I do think it's 
best to be consistent. I personally try to avoid as much ambiguity as possible 
when naming things, hence my minor preference for `num`.
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(python): Add user-facing `Array` class [arrow-nanoarrow]

Reply via email to