Re: [PR] feat(python): Add user-facing `Array` class [arrow-nanoarrow]

via GitHub Thu, 07 Mar 2024 09:22:45 -0800


jorisvandenbossche commented on code in PR #396:
URL: https://github.com/apache/arrow-nanoarrow/pull/396#discussion_r1516370874



##########
python/src/nanoarrow/array.py:
##########
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from functools import cached_property
+from typing import Iterable
+
+from nanoarrow._lib import CDEVICE_CPU, CArray, CDevice, 
CMaterializedArrayStream
+from nanoarrow.c_lib import c_array, c_array_stream
+from nanoarrow.iterator import iterator
+from nanoarrow.schema import Schema
+
+
+class Scalar:
+    """Generic wrapper around an :class:`Array` element

Review Comment:
   Should the scalar class implement the `__arrow_c_array__` and 
`__arrow_c_schema__` dunders (like the underlying CScalar does)?



##########
python/src/nanoarrow/array.py:
##########
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from functools import cached_property
+from typing import Iterable
+
+from nanoarrow._lib import CDEVICE_CPU, CArray, CDevice, 
CMaterializedArrayStream
+from nanoarrow.c_lib import c_array, c_array_stream
+from nanoarrow.iterator import iterator
+from nanoarrow.schema import Schema
+
+
+class Scalar:
+    """Generic wrapper around an :class:`Array` element
+
+    This class exists to provide a generic implementation of
+    array-like indexing for the :class:`Array`. These objects
+    can currently only be created by extracting an element from
+    an :class:`Array`.
+
+    Note that it is rarely efficient to iterate over Scalar objects:
+    use the iterators in :mod:`nanoarrow.iterator` to more effectively
+    iterate over an :class:`Array`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> array = na.array([1, 2, 3], na.int32())
+    >>> array[0]
+    Scalar<INT32> 1
+    >>> array[0].as_py()
+    1
+    >>> array[0].schema
+    Schema(INT32)
+    """
+
+    def __init__(self):
+        # Private constructor
+        self._c_scalar = None
+        self._schema = None
+        self._device = None
+
+    @property
+    def device(self) -> CDevice:
+        return self._device
+
+    @property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this scalar"""
+        if self._schema is None:
+            self._schema = Schema(self._c_scalar.schema)
+        return self._schema
+
+    def as_py(self):
+        """Get the Python object representation of this scalar"""
+        return next(iterator(self._c_scalar))
+
+    def __repr__(self) -> str:
+        width_hint = 80
+        prefix = f"Scalar<{self.schema.type.name}> "
+        width_hint -= len(prefix)
+
+        py_repr = repr(self.as_py())
+        if len(py_repr) > width_hint:
+            py_repr = py_repr[: (width_hint - 3)] + "..."
+        return f"{prefix}{py_repr}"
+
+
+class Array:
+    """The Array is nanoarrow's high-level in-memory array representation whose
+    scope maps to that of a fully-consumed ArrowArrayStream in the Arrow C Data
+    interface. See :func:`array` for class details.
+    """
+
+    def __init__(self, obj, schema=None, device=None) -> None:
+        if device is None:
+            self._device = CDEVICE_CPU
+        elif not isinstance(device, CDevice):
+            raise TypeError("device must be CDevice")
+
+        if isinstance(obj, Array) and schema is None:
+            self._data = obj._data
+            return
+
+        if isinstance(obj, CArray) and schema is None:
+            self._data = CMaterializedArrayStream.from_c_array(obj)
+            return
+
+        with c_array_stream(obj, schema=schema) as stream:
+            self._data = CMaterializedArrayStream.from_c_array_stream(stream)
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        if self._device is not CDEVICE_CPU:
+            raise RuntimeError("Can't export ArrowArrayStream from non-CPU 
device")
+
+        return self._data.__arrow_c_stream__(requested_schema=requested_schema)
+
+    def __arrow_c_array__(self, requested_schema=None):
+        if self._device is not CDEVICE_CPU:
+            raise RuntimeError("Can't export ArrowArray from non-CPU device")
+
+        if self._data.n_arrays == 0:
+            return c_array([], schema=self._data.schema).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+        elif self._data.n_arrays == 1:
+            return self._data.array(0).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+
+        raise ValueError(
+            f"Can't export Array with {self._data.n_arrays} chunks to 
ArrowArray"
+        )
+
+    @property
+    def device(self) -> CDevice:
+        """Get the device on which the buffers for this array are allocated."""
+        return self._device
+
+    @cached_property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this Array"""
+        return Schema(self._data.schema)
+
+    @property
+    def n_chunks(self) -> int:
+        """Get the number of chunks in the underlying representation of this 
Array."""
+        return self._data.n_arrays
+
+    @property
+    def chunks(self) -> Iterable:
+        """Iterate over Arrays in the underlying representation that are
+        contiguous in memory.
+        """
+        for array in self._data.arrays:
+            yield Array(array, device=self._device)
+
+    def chunk(self, i):
+        """Extract a single contiguous Array from the underlying 
representation."""
+        return Array(self._data.array(i), device=self._device)
+
+    def to_pyiter(self) -> Iterable:
+        """Iterate over the default Python representation of each element.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.array([1, 2, 3], na.int32())
+        >>> for item in array.to_pyiter():
+        ...     print(item)
+        1
+        2
+        3
+        """
+        return iterator(self)
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def __getitem__(self, k) -> Scalar:
+        scalar = Scalar()
+        scalar._c_scalar = self._data[k]
+        scalar._schema = self.schema
+        scalar._device = self._device
+        return scalar
+
+    def __iter__(self) -> Iterable[Scalar]:
+        for c_scalar in self._data:
+            scalar = Scalar()
+            scalar._c_scalar = c_scalar
+            scalar._schema = self.schema
+            scalar._device = self._device
+            yield scalar
+
+    def __repr__(self) -> str:
+        width_hint = 80
+        n_items = 10
+        lines = [f"Array<{self.schema.type.name}>[{len(self)}]"]

Review Comment:
   ```suggestion
           lines = [f"nanoarrow.Array<{self.schema.type.name}>[{len(self)}]"]
   ```
   
   We also might want to be consistent in including the library's name?



##########
python/src/nanoarrow/array.py:
##########
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from functools import cached_property
+from typing import Iterable
+
+from nanoarrow._lib import CDEVICE_CPU, CArray, CDevice, 
CMaterializedArrayStream
+from nanoarrow.c_lib import c_array, c_array_stream
+from nanoarrow.iterator import iterator
+from nanoarrow.schema import Schema
+
+
+class Scalar:
+    """Generic wrapper around an :class:`Array` element
+
+    This class exists to provide a generic implementation of
+    array-like indexing for the :class:`Array`. These objects
+    can currently only be created by extracting an element from
+    an :class:`Array`.
+
+    Note that it is rarely efficient to iterate over Scalar objects:
+    use the iterators in :mod:`nanoarrow.iterator` to more effectively
+    iterate over an :class:`Array`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> array = na.array([1, 2, 3], na.int32())
+    >>> array[0]
+    Scalar<INT32> 1
+    >>> array[0].as_py()
+    1
+    >>> array[0].schema
+    Schema(INT32)
+    """
+
+    def __init__(self):
+        # Private constructor
+        self._c_scalar = None
+        self._schema = None
+        self._device = None
+
+    @property
+    def device(self) -> CDevice:
+        return self._device
+
+    @property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this scalar"""
+        if self._schema is None:
+            self._schema = Schema(self._c_scalar.schema)
+        return self._schema
+
+    def as_py(self):
+        """Get the Python object representation of this scalar"""
+        return next(iterator(self._c_scalar))
+
+    def __repr__(self) -> str:
+        width_hint = 80
+        prefix = f"Scalar<{self.schema.type.name}> "
+        width_hint -= len(prefix)
+
+        py_repr = repr(self.as_py())
+        if len(py_repr) > width_hint:
+            py_repr = py_repr[: (width_hint - 3)] + "..."
+        return f"{prefix}{py_repr}"
+
+
+class Array:
+    """The Array is nanoarrow's high-level in-memory array representation whose
+    scope maps to that of a fully-consumed ArrowArrayStream in the Arrow C Data
+    interface. See :func:`array` for class details.
+    """
+
+    def __init__(self, obj, schema=None, device=None) -> None:
+        if device is None:
+            self._device = CDEVICE_CPU
+        elif not isinstance(device, CDevice):
+            raise TypeError("device must be CDevice")
+
+        if isinstance(obj, Array) and schema is None:
+            self._data = obj._data
+            return
+
+        if isinstance(obj, CArray) and schema is None:
+            self._data = CMaterializedArrayStream.from_c_array(obj)
+            return
+
+        with c_array_stream(obj, schema=schema) as stream:
+            self._data = CMaterializedArrayStream.from_c_array_stream(stream)
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        if self._device is not CDEVICE_CPU:
+            raise RuntimeError("Can't export ArrowArrayStream from non-CPU 
device")
+
+        return self._data.__arrow_c_stream__(requested_schema=requested_schema)
+
+    def __arrow_c_array__(self, requested_schema=None):
+        if self._device is not CDEVICE_CPU:
+            raise RuntimeError("Can't export ArrowArray from non-CPU device")
+
+        if self._data.n_arrays == 0:
+            return c_array([], schema=self._data.schema).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+        elif self._data.n_arrays == 1:
+            return self._data.array(0).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+
+        raise ValueError(
+            f"Can't export Array with {self._data.n_arrays} chunks to 
ArrowArray"
+        )
+
+    @property
+    def device(self) -> CDevice:
+        """Get the device on which the buffers for this array are allocated."""
+        return self._device
+
+    @cached_property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this Array"""
+        return Schema(self._data.schema)
+
+    @property
+    def n_chunks(self) -> int:
+        """Get the number of chunks in the underlying representation of this 
Array."""
+        return self._data.n_arrays
+
+    @property
+    def chunks(self) -> Iterable:
+        """Iterate over Arrays in the underlying representation that are
+        contiguous in memory.
+        """
+        for array in self._data.arrays:
+            yield Array(array, device=self._device)
+
+    def chunk(self, i):
+        """Extract a single contiguous Array from the underlying 
representation."""
+        return Array(self._data.array(i), device=self._device)
+
+    def to_pyiter(self) -> Iterable:
+        """Iterate over the default Python representation of each element.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.array([1, 2, 3], na.int32())
+        >>> for item in array.to_pyiter():
+        ...     print(item)
+        1
+        2
+        3
+        """
+        return iterator(self)
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def __getitem__(self, k) -> Scalar:
+        scalar = Scalar()
+        scalar._c_scalar = self._data[k]
+        scalar._schema = self.schema
+        scalar._device = self._device
+        return scalar
+
+    def __iter__(self) -> Iterable[Scalar]:
+        for c_scalar in self._data:
+            scalar = Scalar()
+            scalar._c_scalar = c_scalar
+            scalar._schema = self.schema
+            scalar._device = self._device
+            yield scalar
+
+    def __repr__(self) -> str:
+        width_hint = 80
+        n_items = 10
+        lines = [f"Array<{self.schema.type.name}>[{len(self)}]"]

Review Comment:
   Comparing the reprs of Array vs the underlying CArray:
   
   ```
   In [37]: arr = na.array(pa.table({"a": [0,1,2,3], "b": [ 0.1, 0.2, 0.3, 
0.4]}))
   
   In [38]: arr
   Out[38]: 
   Array<STRUCT>[4]
   {'a': 0, 'b': 0.1}
   {'a': 1, 'b': 0.2}
   {'a': 2, 'b': 0.3}
   {'a': 3, 'b': 0.4}
   
   In [39]: list(arr._data.arrays)[0]
   Out[39]: 
   <nanoarrow.c_lib.CArray struct<a: int64, b: double>>
   - length: 4
   - offset: 0
   - null_count: 0
   - buffers: (0,)
   - dictionary: NULL
   - children[2]:
     'a': <nanoarrow.c_lib.CArray int64>
       - length: 4
       - offset: 0
       - null_count: 0
       - buffers: (0, 139632238956864)
       - dictionary: NULL
       - children[0]:
     'b': <nanoarrow.c_lib.CArray double>
       - length: 4
       - offset: 0
       - null_count: 0
       - buffers: (0, 139632238956928)
       - dictionary: NULL
       - children[0]:
   ```
   
   I personally like the `struct<a: int64, b: double>` part better compared to 
`STRUCT` for the type in the repr (and in general it might also be nice to keep 
this a bit consistent across the different objects).



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -2198,6 +2250,129 @@ cdef class CArrayStream:
         return _repr_utils.array_stream_repr(self)
 
 
+cdef class CMaterializedArrayStream:
+    cdef CSchema _schema
+    cdef CBuffer _array_ends
+    cdef list _arrays
+    cdef int64_t _capacity_arrays

Review Comment:
   This is not used (anymore)?



##########
python/src/nanoarrow/array.py:
##########
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from functools import cached_property
+from typing import Iterable
+
+from nanoarrow._lib import CDEVICE_CPU, CArray, CDevice, 
CMaterializedArrayStream
+from nanoarrow.c_lib import c_array, c_array_stream
+from nanoarrow.iterator import iterator
+from nanoarrow.schema import Schema
+
+
+class Scalar:
+    """Generic wrapper around an :class:`Array` element

Review Comment:
   And is it needed to have an additional class that wraps CScalar? 
   It feels like some unnecessary complexity, but I assume there are some things 
that need to be done in Cython (the `__arrow_c_array__` impl), while some other 
things here require the Python classes (e.g. Schema). 
   



##########
python/src/nanoarrow/array.py:
##########
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from functools import cached_property
+from typing import Iterable
+
+from nanoarrow._lib import CDEVICE_CPU, CArray, CDevice, 
CMaterializedArrayStream
+from nanoarrow.c_lib import c_array, c_array_stream
+from nanoarrow.iterator import iterator
+from nanoarrow.schema import Schema
+
+
+class Scalar:
+    """Generic wrapper around an :class:`Array` element
+
+    This class exists to provide a generic implementation of
+    array-like indexing for the :class:`Array`. These objects
+    can currently only be created by extracting an element from
+    an :class:`Array`.
+
+    Note that it is rarely efficient to iterate over Scalar objects:
+    use the iterators in :mod:`nanoarrow.iterator` to more effectively
+    iterate over an :class:`Array`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> array = na.array([1, 2, 3], na.int32())
+    >>> array[0]
+    Scalar<INT32> 1
+    >>> array[0].as_py()
+    1
+    >>> array[0].schema
+    Schema(INT32)
+    """
+
+    def __init__(self):
+        # Private constructor
+        self._c_scalar = None
+        self._schema = None
+        self._device = None
+
+    @property
+    def device(self) -> CDevice:
+        return self._device
+
+    @property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this scalar"""
+        if self._schema is None:
+            self._schema = Schema(self._c_scalar.schema)
+        return self._schema
+
+    def as_py(self):
+        """Get the Python object representation of this scalar"""
+        return next(iterator(self._c_scalar))
+
+    def __repr__(self) -> str:
+        width_hint = 80
+        prefix = f"Scalar<{self.schema.type.name}> "
+        width_hint -= len(prefix)
+
+        py_repr = repr(self.as_py())
+        if len(py_repr) > width_hint:
+            py_repr = py_repr[: (width_hint - 3)] + "..."
+        return f"{prefix}{py_repr}"
+
+
+class Array:
+    """The Array is nanoarrow's high-level in-memory array representation whose
+    scope maps to that of a fully-consumed ArrowArrayStream in the Arrow C Data
+    interface. See :func:`array` for class details.
+    """
+
+    def __init__(self, obj, schema=None, device=None) -> None:
+        if device is None:
+            self._device = CDEVICE_CPU
+        elif not isinstance(device, CDevice):
+            raise TypeError("device must be CDevice")
+
+        if isinstance(obj, Array) and schema is None:
+            self._data = obj._data
+            return
+
+        if isinstance(obj, CArray) and schema is None:
+            self._data = CMaterializedArrayStream.from_c_array(obj)
+            return
+
+        with c_array_stream(obj, schema=schema) as stream:
+            self._data = CMaterializedArrayStream.from_c_array_stream(stream)
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        if self._device is not CDEVICE_CPU:
+            raise RuntimeError("Can't export ArrowArrayStream from non-CPU 
device")
+
+        return self._data.__arrow_c_stream__(requested_schema=requested_schema)
+
+    def __arrow_c_array__(self, requested_schema=None):
+        if self._device is not CDEVICE_CPU:
+            raise RuntimeError("Can't export ArrowArray from non-CPU device")
+
+        if self._data.n_arrays == 0:
+            return c_array([], schema=self._data.schema).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+        elif self._data.n_arrays == 1:
+            return self._data.array(0).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+
+        raise ValueError(
+            f"Can't export Array with {self._data.n_arrays} chunks to 
ArrowArray"
+        )
+
+    @property
+    def device(self) -> CDevice:
+        """Get the device on which the buffers for this array are allocated."""
+        return self._device
+
+    @cached_property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this Array"""
+        return Schema(self._data.schema)
+
+    @property
+    def n_chunks(self) -> int:
+        """Get the number of chunks in the underlying representation of this 
Array."""
+        return self._data.n_arrays
+
+    @property
+    def chunks(self) -> Iterable:
+        """Iterate over Arrays in the underlying representation that are
+        contiguous in memory.
+        """
+        for array in self._data.arrays:
+            yield Array(array, device=self._device)
+
+    def chunk(self, i):
+        """Extract a single contiguous Array from the underlying 
representation."""
+        return Array(self._data.array(i), device=self._device)
+
+    def to_pyiter(self) -> Iterable:
+        """Iterate over the default Python representation of each element.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.array([1, 2, 3], na.int32())
+        >>> for item in array.to_pyiter():
+        ...     print(item)
+        1
+        2
+        3
+        """
+        return iterator(self)
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def __getitem__(self, k) -> Scalar:
+        scalar = Scalar()
+        scalar._c_scalar = self._data[k]
+        scalar._schema = self.schema
+        scalar._device = self._device
+        return scalar
+
+    def __iter__(self) -> Iterable[Scalar]:
+        for c_scalar in self._data:
+            scalar = Scalar()
+            scalar._c_scalar = c_scalar
+            scalar._schema = self.schema
+            scalar._device = self._device
+            yield scalar
+
+    def __repr__(self) -> str:
+        width_hint = 80
+        n_items = 10
+        lines = [f"Array<{self.schema.type.name}>[{len(self)}]"]
+
+        for i, item in enumerate(self):
+            if i >= n_items:
+                break
+            py_repr = repr(item.as_py())
+            if len(py_repr) > width_hint:
+                py_repr = py_repr[: (width_hint - 3)] + "..."
+            lines.append(py_repr)
+
+        n_more_items = len(self) - n_items
+        if n_more_items > 1:
+            lines.append(f"...and {n_more_items} more items")
+        elif n_more_items > 0:
+            lines.append(f"...and {n_more_items} more item")
+
+        return "\n".join(lines)
+
+
+def array(obj, schema=None) -> Array:
+    """Create a nanoarrow Array
+
+    The :class:`Array` class is nanoarrow's high-level in-memory array
+    representation, encompasing the role of PyArrow's ``Array``,
+    ``ChunkedArray``, ``RecordBatch``, and ``Table``. This scope maps
+    to that of a fully-consumed ``ArrowArrayStream`` as represented by
+    the Arrow C Stream interface.
+
+    Note that an :class:`Array` is not necessarily contiguous in memory (i.e.,
+    it may consist of zero or more ``ArrowArray``s).
+
+    Parameters
+    ----------
+    obj : array or array stream-like
+        An array-like or array stream-like object as sanitized by
+        :func:`c_array_stream`.
+    schema : schema-like, optional
+        An optional schema, passed to :func:`c_array_stream`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> na.array([1, 2, 3], na.int32())
+    Array<INT32>[3]
+    1
+    2
+    3
+    """
+    return Array(obj, schema=schema)

Review Comment:
   If the `array` function is essentially just an alias for `Array`, there is 
not that much value to it. But I assume the intent is that later this 
constructor can be expanded?
   
   For example, I assumed that I would be able to do something like
   
   ```
   na.array([1,2,3])
   ```
   
   but that doesn't seem to work (yet)
   



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -50,6 +51,7 @@ from cpython.ref cimport Py_INCREF, Py_DECREF
 from nanoarrow_c cimport *
 from nanoarrow_device_c cimport *
 
+from functools import cached_property

Review Comment:
   Not used here?



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -2198,6 +2250,129 @@ cdef class CArrayStream:
         return _repr_utils.array_stream_repr(self)
 
 
+cdef class CMaterializedArrayStream:
+    cdef CSchema _schema
+    cdef CBuffer _array_ends
+    cdef list _arrays
+    cdef int64_t _capacity_arrays
+    cdef int64_t _total_length
+
+    def __cinit__(self):
+        self._arrays = []
+        self._total_length = 0
+        self._schema = CSchema.allocate()
+        self._array_ends = CBuffer.empty()
+        cdef int code = ArrowBufferAppendInt64(self._array_ends._ptr, 0)
+        Error.raise_error_not_ok("ArrowBufferAppendInt64()", code)
+
+    cdef _finalize(self):
+        self._array_ends._set_data_type(NANOARROW_TYPE_INT64)
+
+    @property
+    def schema(self):
+        return self._schema
+
+    @property
+    def array_ends(self):
+        return self._array_ends
+
+    cdef int _resolve_chunk(self, const int64_t* sorted_offsets, int64_t 
index, int64_t start_offset_i,
+                           int64_t end_offset_i) noexcept nogil:
+        if start_offset_i >= (end_offset_i - 1):

Review Comment:
   Can you add a bit of documentation here about the keywords?



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -1066,10 +1068,15 @@ cdef class CArray:
         return out
 
     def __getitem__(self, k):
+        self._assert_valid()
+
+        cdef int64_t kint
+
         if not isinstance(k, slice):
-            raise TypeError(
-                f"Can't slice CArray with object of type {type(k).__name__}"
-            )
+            kint = k
+            if kint < 0:
+                kint += self._ptr.length
+            return CScalar(self, kint)

Review Comment:
   Is this needed for CArray? We could also leave this for just the 
higher-level Array object to give scalars?



##########
python/src/nanoarrow/array.py:
##########
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from functools import cached_property
+from typing import Iterable
+
+from nanoarrow._lib import CDEVICE_CPU, CArray, CDevice, CMaterializedArrayStream
+from nanoarrow.c_lib import c_array, c_array_stream
+from nanoarrow.iterator import iterator
+from nanoarrow.schema import Schema
+
+
+class Scalar:
+    """Generic wrapper around an :class:`Array` element
+
+    This class exists to provide a generic implementation of
+    array-like indexing for the :class:`Array`. These objects
+    can currently only be created by extracting an element from
+    an :class:`Array`.
+
+    Note that it is rarely efficient to iterate over Scalar objects:
+    use the iterators in :mod:`nanoarrow.iterator` to more effectively
+    iterate over an :class:`Array`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> array = na.array([1, 2, 3], na.int32())
+    >>> array[0]
+    Scalar<INT32> 1
+    >>> array[0].as_py()
+    1
+    >>> array[0].schema
+    Schema(INT32)
+    """
+
+    def __init__(self):
+        # Private constructor
+        self._c_scalar = None
+        self._schema = None
+        self._device = None
+
+    @property
+    def device(self) -> CDevice:
+        return self._device
+
+    @property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this scalar"""
+        if self._schema is None:
+            self._schema = Schema(self._c_scalar.schema)
+        return self._schema
+
+    def as_py(self):
+        """Get the Python object representation of this scalar"""
+        return next(iterator(self._c_scalar))
+
+    def __repr__(self) -> str:
+        width_hint = 80
+        prefix = f"Scalar<{self.schema.type.name}> "
+        width_hint -= len(prefix)
+
+        py_repr = repr(self.as_py())
+        if len(py_repr) > width_hint:
+            py_repr = py_repr[: (width_hint - 3)] + "..."
+        return f"{prefix}{py_repr}"
+
+
+class Array:
+    """The Array is nanoarrow's high-level in-memory array representation whose
+    scope maps to that of a fully-consumed ArrowArrayStream in the Arrow C Data
+    interface. See :func:`array` for class details.
+    """
+
+    def __init__(self, obj, schema=None, device=None) -> None:
+        if device is None:
+            self._device = CDEVICE_CPU
+        elif not isinstance(device, CDevice):
+            raise TypeError("device must be CDevice")
+
+        if isinstance(obj, Array) and schema is None:
+            self._data = obj._data
+            return
+
+        if isinstance(obj, CArray) and schema is None:
+            self._data = CMaterializedArrayStream.from_c_array(obj)
+            return
+
+        with c_array_stream(obj, schema=schema) as stream:
+            self._data = CMaterializedArrayStream.from_c_array_stream(stream)
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        if self._device is not CDEVICE_CPU:
+            raise RuntimeError("Can't export ArrowArrayStream from non-CPU device")
+
+        return self._data.__arrow_c_stream__(requested_schema=requested_schema)
+
+    def __arrow_c_array__(self, requested_schema=None):
+        if self._device is not CDEVICE_CPU:
+            raise RuntimeError("Can't export ArrowArray from non-CPU device")
+
+        if self._data.n_arrays == 0:
+            return c_array([], schema=self._data.schema).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+        elif self._data.n_arrays == 1:
+            return self._data.array(0).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+
+        raise ValueError(
+            f"Can't export Array with {self._data.n_arrays} chunks to ArrowArray"
+        )
+
+    @property
+    def device(self) -> CDevice:
+        """Get the device on which the buffers for this array are allocated."""
+        return self._device
+
+    @cached_property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this Array"""
+        return Schema(self._data.schema)
+
+    @property
+    def n_chunks(self) -> int:
+        """Get the number of chunks in the underlying representation of this Array."""
+        return self._data.n_arrays
+
+    @property
+    def chunks(self) -> Iterable:
+        """Iterate over Arrays in the underlying representation that are
+        contiguous in memory.
+        """
+        for array in self._data.arrays:
+            yield Array(array, device=self._device)
+
+    def chunk(self, i):
+        """Extract a single contiguous Array from the underlying representation."""
+        return Array(self._data.array(i), device=self._device)
+
+    def to_pyiter(self) -> Iterable:
+        """Iterate over the default Python representation of each element.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.array([1, 2, 3], na.int32())
+        >>> for item in array.to_pyiter():
+        ...     print(item)
+        1
+        2
+        3
+        """
+        return iterator(self)
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def __getitem__(self, k) -> Scalar:
+        scalar = Scalar()
+        scalar._c_scalar = self._data[k]
+        scalar._schema = self.schema
+        scalar._device = self._device
+        return scalar
+
+    def __iter__(self) -> Iterable[Scalar]:
+        for c_scalar in self._data:
+            scalar = Scalar()
+            scalar._c_scalar = c_scalar
+            scalar._schema = self.schema
+            scalar._device = self._device
+            yield scalar
+
+    def __repr__(self) -> str:
+        width_hint = 80
+        n_items = 10
+        lines = [f"Array<{self.schema.type.name}>[{len(self)}]"]
+
+        for i, item in enumerate(self):
+            if i >= n_items:
+                break
+            py_repr = repr(item.as_py())
+            if len(py_repr) > width_hint:
+                py_repr = py_repr[: (width_hint - 3)] + "..."
+            lines.append(py_repr)
+
+        n_more_items = len(self) - n_items
+        if n_more_items > 1:
+            lines.append(f"...and {n_more_items} more items")
+        elif n_more_items > 0:
+            lines.append(f"...and {n_more_items} more item")
+
+        return "\n".join(lines)
+
+
+def array(obj, schema=None) -> Array:
+    """Create a nanoarrow Array
+
+    The :class:`Array` class is nanoarrow's high-level in-memory array
+    representation, encompasing the role of PyArrow's ``Array``,
+    ``ChunkedArray``, ``RecordBatch``, and ``Table``. This scope maps
+    to that of a fully-consumed ``ArrowArrayStream`` as represented by
+    the Arrow C Stream interface.
+
+    Note that an :class:`Array` is not necessarily contiguous in memory (i.e.,
+    it may consist of zero or more ``ArrowArray``s).
+
+    Parameters
+    ----------
+    obj : array or array stream-like
+        An array-like or array stream-like object as sanitized by
+        :func:`c_array_stream`.
+    schema : schema-like, optional
+        An optional schema, passed to :func:`c_array_stream`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> na.array([1, 2, 3], na.int32())
+    Array<INT32>[3]
+    1
+    2
+    3
+    """
+    return Array(obj, schema=schema)

Review Comment:
   Ah, the error message is a bit confusing:
   
   ```
   In [36]: na.array([1,2,3])
   ...
   TypeError: Can't convert object of type list to nanoarrow.c_array_stream or nanoarrow.c_array
   ```
   
   because the above actually does work when specifying a type:
   
   ```
   In [35]: na.array([1,2,3], na.int64())
   Out[35]: 
   Array<INT64>[3]
   1
   2
   3
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(python): Add user-facing `Array` class [arrow-nanoarrow]

Reply via email to