[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #62: [Python] Basic Array class wrapping C struct (with conversion to numpy)

GitBox Tue, 25 Oct 2022 11:28:52 -0700


paleolimbot commented on code in PR #62:
URL: https://github.com/apache/arrow-nanoarrow/pull/62#discussion_r1004816765



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -44,14 +47,82 @@ cdef dict _numpy_type_map = {
 }
 
 
-def as_numpy_array(arr):
-    cdef ArrowSchema schema
-    cdef ArrowArray array
+cdef class Array:

Review Comment:
   Semantics, but what you have here is more like a wrapper around the 
`ArrowArrayView` (that keeps the objects it points to alive). The R package has 
one of these too (and similarly it backs all the conversions to R vectors).



##########
python/tests/test_nanoarrow.py:
##########
@@ -6,22 +8,64 @@
 import pytest
 
 
-def test_as_numpy_array():
-    
-    arr = pa.array([1, 2, 3])
-    result = nanoarrow.as_numpy_array(arr)
-    expected = arr.to_numpy()
+def test_array_from_pyarrow():
+    parr = pa.array([1, 2, 3])
+    result = nanoarrow.Array.from_pyarrow(parr)
+    assert result.format == "l"
+
+
+def test_array_to_numpy_lifetime():
+
+    parr = pa.array([1, 2, 3])
+    arr = nanoarrow.Array.from_pyarrow(parr)
+    refcount = sys.getrefcount(arr)
+    result = arr.to_numpy()
+    assert sys.getrefcount(arr) > refcount
+    assert result.base is arr
+    del arr
+    result
+    assert result.base
+
+
+def test_array_to_numpy():
+    parr = pa.array([1, 2, 3])
+    arr = nanoarrow.Array.from_pyarrow(parr)
+    result = arr.to_numpy()
+    expected = parr.to_numpy()
     np.testing.assert_array_equal(result, expected)
 
-    arr = pa.array([1, 2, 3], pa.uint8())
-    result = nanoarrow.as_numpy_array(arr)
-    expected = arr.to_numpy()
+    parr = pa.array([1, 2, 3], pa.uint8())
+    arr = nanoarrow.Array.from_pyarrow(parr)
+    result = arr.to_numpy()
+    expected = parr.to_numpy()
     np.testing.assert_array_equal(result, expected)
 
-    arr = pa.array([1, 2, None])
+    arr = nanoarrow.Array.from_pyarrow(pa.array([1, 2, None]))
     with pytest.raises(ValueError, match="Cannot convert array with nulls"):
-        nanoarrow.as_numpy_array(arr)
+        arr.to_numpy()
 
-    arr = pa.array([[1], [2, 3]])
+    arr = nanoarrow.Array.from_pyarrow(pa.array([[1], [2, 3]]))
     with pytest.raises(TypeError, match="Cannot convert a non-primitive 
array"):
-        nanoarrow.as_numpy_array(arr)
+       arr.to_numpy()
+
+
+def test_from_external_pointers():
+    pytest.importorskip("pyarrow.cffi")
+
+    from pyarrow.cffi import ffi
+
+    c_schema = ffi.new("struct ArrowSchema*")
+    ptr_schema = int(ffi.cast("uintptr_t", c_schema))
+    c_array = ffi.new("struct ArrowArray*")
+    ptr_array = int(ffi.cast("uintptr_t", c_array))
+
+    typ = pa.int32()
+    parr = pa.array([1, 2, 3], type=typ)
+    parr._export_to_c(ptr_array, ptr_schema)
+
+    arr = nanoarrow.Array.from_pointers(ptr_array, ptr_schema)
+    assert arr.to_numpy().tolist() == [1, 2, 3]
+
+    # trying to import second time should not cause a segfault? To enable

Review Comment:
   My reading of cython code is not awesome, but I think importing twice causes 
a double free when the `release()` method to be called twice?



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -44,14 +47,82 @@ cdef dict _numpy_type_map = {
 }
 
 
-def as_numpy_array(arr):
-    cdef ArrowSchema schema
-    cdef ArrowArray array
+cdef class Array:
+
+    cdef:
+        ArrowArray* array_ptr
+        ArrowSchema* schema_ptr
+        bint own_data
+        bint own_ptrs
+        ArrowArrayView array_view
+        object parent
+
+    def __init__(self):
+        raise TypeError("Do not call constructor directly")
+
+    cdef void init(self, ArrowArray* array_ptr, ArrowSchema* schema_ptr) 
except *:
+        self.array_ptr = array_ptr
+        self.schema_ptr = schema_ptr
+        self.own_data = True
+
+    def __dealloc__(self):
+        if self.own_data:
+            self.array_ptr.release(self.array_ptr)
+            self.schema_ptr.release(self.schema_ptr)
+        if self.own_ptrs:
+            if self.array_ptr is not NULL:
+                free(self.array_ptr)
+                self.array_ptr = NULL
+            if self.schema_ptr is not NULL:
+                free(self.schema_ptr)
+                self.schema_ptr = NULL
+        self.parent = None
+
+    @property
+    def format(self):
+        cdef const char* format_string = deref(self.schema_ptr).format

Review Comment:
   With the latest nanoarrow you can do:
   
   
https://github.com/apache/arrow-nanoarrow/blob/7395bf25275679f6a4fbc35c7cf9bc8fbd6b6150/r/src/schema.c#L146-L149



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -44,14 +47,82 @@ cdef dict _numpy_type_map = {
 }
 
 
-def as_numpy_array(arr):
-    cdef ArrowSchema schema
-    cdef ArrowArray array
+cdef class Array:
+
+    cdef:
+        ArrowArray* array_ptr
+        ArrowSchema* schema_ptr
+        bint own_data
+        bint own_ptrs
+        ArrowArrayView array_view
+        object parent
+
+    def __init__(self):
+        raise TypeError("Do not call constructor directly")
+
+    cdef void init(self, ArrowArray* array_ptr, ArrowSchema* schema_ptr) 
except *:
+        self.array_ptr = array_ptr
+        self.schema_ptr = schema_ptr
+        self.own_data = True
+
+    def __dealloc__(self):
+        if self.own_data:
+            self.array_ptr.release(self.array_ptr)
+            self.schema_ptr.release(self.schema_ptr)
+        if self.own_ptrs:
+            if self.array_ptr is not NULL:
+                free(self.array_ptr)
+                self.array_ptr = NULL
+            if self.schema_ptr is not NULL:
+                free(self.schema_ptr)
+                self.schema_ptr = NULL
+        self.parent = None
+
+    @property
+    def format(self):
+        cdef const char* format_string = deref(self.schema_ptr).format
+        if format_string == NULL:
+            return None
+        else:
+            return format_string.decode('utf8')
+
+    @staticmethod
+    cdef Array _from_ptrs(ArrowArray* array_ptr, ArrowSchema* schema_ptr, bint 
own_ptrs=False):
+        cdef Array self = Array.__new__(Array)
+        self.init(array_ptr, schema_ptr)
+        self.own_ptrs = own_ptrs
+        return self
+
+    @classmethod
+    def from_pointers(cls, array_ptr, schema_ptr):
+        cdef:
+            ArrowArray* c_array_ptr = <ArrowArray*> <uintptr_t > array_ptr
+            ArrowSchema* c_schema_ptr = <ArrowSchema*> <uintptr_t > schema_ptr
+        return Array._from_ptrs(c_array_ptr, c_schema_ptr)
+
+    @classmethod
+    def from_pyarrow(cls, arr):
+        cdef ArrowSchema *schema = <ArrowSchema *>malloc(sizeof(ArrowSchema))
+        cdef ArrowArray *array = <ArrowArray *>malloc(sizeof(ArrowArray))
+        arr._export_to_c(<uintptr_t> array, <uintptr_t> schema)
+        self = Array._from_ptrs(array, schema, own_ptrs=True)
+        self.parent = arr

Review Comment:
   You probably don't need this? (I think the release callback takes care of 
keeping the `shared_ptr` alive for the lifecycle of the schema/array)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #62: [Python] Basic Array class wrapping C struct (with conversion to numpy)

Reply via email to