[GitHub] [arrow-nanoarrow] jorisvandenbossche commented on a diff in pull request #117: feat(python): Python schema, array, and array view skeleton

via GitHub Thu, 09 Mar 2023 12:49:14 -0800


jorisvandenbossche commented on code in PR #117:
URL: https://github.com/apache/arrow-nanoarrow/pull/117#discussion_r1131521826



##########
python/.gitignore:
##########
@@ -18,7 +18,8 @@
 
 src/nanoarrow/nanoarrow.c
 src/nanoarrow/nanoarrow.h
-src/nanoarrow/*.cpp
+src/nanoarrow/nanoarrow_c.pxd

Review Comment:
   Another option could be to actually include / commit this file? Although 
that might be annoying if a non-python related PR updates the c files, this 
gets out of sync (if we only update it in PRs that touch python). And in the 
end maybe not much different than the .c and .h files we also ignore
   



##########
python/bootstrap.py:
##########
@@ -0,0 +1,197 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+import os
+import shutil
+
+# Generate the nanoarrow_c.pxd file used by the Cython extension
+class NanoarrowPxdGenerator:
+
+    def __init__(self):
+       self._define_regexes()
+
+    def generate_nanoarrow_pxd(self, file_in, file_out):
+        file_in_name = os.path.basename(file_in)
+
+        # Read the nanoarrow.h header
+        content = None
+        with open(file_in, 'r') as input:
+            content = input.read()
+
+        # Strip comments
+        content = self.re_comment.sub('', content)
+
+        # Find types and function definitions
+        types = self._find_types(content)
+        func_defs = self._find_func_defs(content)
+
+        # Make corresponding cython definitions
+        types_cython = [self._type_to_cython(t, '    ') for t in types]
+        func_defs_cython = [self._func_def_to_cython(d, '    ') for d in 
func_defs]
+
+        # Unindent the header
+        header = self.re_newline_plus_indent.sub('\n', self._pxd_header())
+
+        # Write nanoarrow_c.pxd
+        with open(file_out, 'wb') as output:
+            output.write(header.encode('UTF-8'))
+
+            output.write(f'\ncdef extern from 
"{file_in_name}":\n'.encode("UTF-8"))
+
+            # A few things we add in manually
+            output.write(b'\n')
+            output.write(b'    ctypedef int ArrowErrorCode\n')
+            output.write(b'    cdef int NANOARROW_OK\n')
+            output.write(b'\n')
+
+            for type in types_cython:
+                output.write(type.encode('UTF-8'))
+                output.write(b'\n\n')
+
+            for func_def in func_defs_cython:
+                output.write(func_def.encode('UTF-8'))
+                output.write(b'\n')
+
+            output.write(b'\n')
+
+    def _define_regexes(self):
+        self.re_comment = re.compile(r'\s*//[^\n]*')
+        self.re_type = re.compile(r'(?P<type>struct|union|enum) 
(?P<name>Arrow[^ ]+) {(?P<body>[^}]*)}')
+        self.re_func_def = re.compile(r'\n(static inline )?(?P<const>const 
)?(struct|enum )?(?P<return_type>[A-Za-z0-9_*]+) 
(?P<name>Arrow[A-Za-z]+)\((?P<args>[^\)]*)\);')
+        self.re_tagged_type = re.compile(r'(?P<type>struct|union|enum) 
(?P<name>Arrow[A-Za-z]+)')
+        self.re_struct_delim = re.compile(r';\s*')
+        self.re_enum_delim = re.compile(r',\s*')
+        self.re_whitespace = re.compile(r'\s+')
+        self.re_newline_plus_indent = re.compile(r'\n +')
+
+    def _strip_comments(self, content):
+        return self.re_comment.sub('', content)
+
+    def _find_types(self, content):
+        return [m.groupdict() for m in self.re_type.finditer(content)]
+
+    def _find_func_defs(self, content):
+        return [m.groupdict() for m in self.re_func_def.finditer(content)]
+
+    def _type_to_cython(self, t, indent=''):
+        type = t['type']
+        name = t['name']
+        body = self.re_tagged_type.sub(r'\2', t['body'].strip())
+        if type == 'enum':
+            items = [item for item in self.re_enum_delim.split(body) if item]
+        else:
+            items = [item for item in self.re_struct_delim.split(body) if item]
+
+        cython_body = f'\n{indent}    '.join([''] + items)
+        return f'{indent}cdef {type} {name}:{cython_body}'

Review Comment:
   Strictly speaking the `cdef` is unnecessary here (and the same for func 
below)
   
   The `cdef extern from ..` ensures cython already assumes this



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -84,3 +85,329 @@ def as_numpy_array(arr):
     # TODO set base
 
     return result
+
+
+def version():
+    return ArrowNanoarrowVersion().decode("UTF-8")
+
+cdef class CSchemaHolder:
+    cdef ArrowSchema c_schema
+
+    def __init__(self):
+        self.c_schema.release = NULL
+
+    def __del__(self):
+        if self.c_schema.release != NULL:
+          self.c_schema.release(&self.c_schema)
+
+    def _addr(self):
+        return <uintptr_t>&self.c_schema
+
+cdef class CArrayHolder:
+    cdef ArrowArray c_array
+
+    def __init__(self):
+        self.c_array.release = NULL
+
+    def __del__(self):
+        if self.c_array.release != NULL:
+          self.c_array.release(&self.c_array)
+
+    def _addr(self):
+        return <uintptr_t>&self.c_array
+
+cdef class CArrayViewHolder:
+    cdef ArrowArrayView c_array_view
+
+    def __init__(self):
+        ArrowArrayViewInitFromType(&self.c_array_view, 
NANOARROW_TYPE_UNINITIALIZED)
+
+    def __del__(self):
+        ArrowArrayViewReset(&self.c_array_view)
+
+    def _addr(self):
+        return <uintptr_t>&self.c_array_view
+
+cdef class CSchema:

Review Comment:
   Is the reason you call it CSchema (with C prefix) because this represents a 
"C" Data Interface schema?



##########
python/setup.py:
##########
@@ -17,29 +17,23 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import shutil
-from pathlib import Path
-
+import os
+import sys
+import subprocess
 from setuptools import Extension, setup
-
 import numpy as np
 
-
-# setuptools gets confused by relative paths that extend above the project root
-target = Path(__file__).parent / "src" / "nanoarrow"
-shutil.copy(
-    Path(__file__).parent / "../dist/nanoarrow.c", target / "nanoarrow.c"
-)
-shutil.copy(
-    Path(__file__).parent / "../dist/nanoarrow.h", target / "nanoarrow.h"
-)
+# Run bootstrap.py to run cmake generating a fresh bundle based on this
+# checkout or copy from ../dist if the caller doesn't have cmake available
+this_dir = os.path.dirname(__file__)
+subprocess.run([sys.executable, os.path.join(this_dir, 'bootstrap.py')])

Review Comment:
   I would maybe move the full content of `bootstrap.py` into `setup.py`, and 
then there is no need to call it here with a subprocess. 
   In the end, bootstrap.py is ran every time we build/install the project, and 
that is essentially the logic that typically belongs in setup.py (I also don't 
think there is a need to be able to call the bootstrap separately?)



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -84,3 +85,329 @@ def as_numpy_array(arr):
     # TODO set base
 
     return result
+
+
+def version():
+    return ArrowNanoarrowVersion().decode("UTF-8")
+
+cdef class CSchemaHolder:

Review Comment:
   Those "Holder" classes are only for when we "own" the pointer, and then to 
ensure t is properly released?  (to serve as the "base" object in that case, 
while if you create the CSchema/CArray class from another object that owns the 
pointer, this other object is the `_base`)



##########
python/setup.py:
##########
@@ -17,29 +17,23 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import shutil
-from pathlib import Path
-
+import os
+import sys
+import subprocess
 from setuptools import Extension, setup
-
 import numpy as np
 
-
-# setuptools gets confused by relative paths that extend above the project root
-target = Path(__file__).parent / "src" / "nanoarrow"
-shutil.copy(
-    Path(__file__).parent / "../dist/nanoarrow.c", target / "nanoarrow.c"
-)
-shutil.copy(
-    Path(__file__).parent / "../dist/nanoarrow.h", target / "nanoarrow.h"
-)
+# Run bootstrap.py to run cmake generating a fresh bundle based on this
+# checkout or copy from ../dist if the caller doesn't have cmake available
+this_dir = os.path.dirname(__file__)
+subprocess.run([sys.executable, os.path.join(this_dir, 'bootstrap.py')])

Review Comment:
   Eventually (later), we will also want to have some better logic to determine 
if this step needs to be run or not, for example when distributing sdists (see 
eg 
https://github.com/apache/arrow-adbc/blob/923e0408fe5a32cc6501b997fafa8316ace25fe0/python/adbc_driver_manager/setup.py#L33-L50)



##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -84,3 +85,329 @@ def as_numpy_array(arr):
     # TODO set base
 
     return result
+
+
+def version():
+    return ArrowNanoarrowVersion().decode("UTF-8")
+
+cdef class CSchemaHolder:
+    cdef ArrowSchema c_schema
+
+    def __init__(self):
+        self.c_schema.release = NULL
+
+    def __del__(self):
+        if self.c_schema.release != NULL:
+          self.c_schema.release(&self.c_schema)
+
+    def _addr(self):
+        return <uintptr_t>&self.c_schema
+
+cdef class CArrayHolder:
+    cdef ArrowArray c_array
+
+    def __init__(self):
+        self.c_array.release = NULL
+
+    def __del__(self):
+        if self.c_array.release != NULL:
+          self.c_array.release(&self.c_array)
+
+    def _addr(self):
+        return <uintptr_t>&self.c_array
+
+cdef class CArrayViewHolder:
+    cdef ArrowArrayView c_array_view
+
+    def __init__(self):
+        ArrowArrayViewInitFromType(&self.c_array_view, 
NANOARROW_TYPE_UNINITIALIZED)
+
+    def __del__(self):
+        ArrowArrayViewReset(&self.c_array_view)
+
+    def _addr(self):
+        return <uintptr_t>&self.c_array_view
+
+cdef class CSchema:
+    cdef object _base
+    cdef ArrowSchema* _ptr
+
+    @staticmethod
+    def Empty():
+        base = CSchemaHolder()
+        return CSchema(base, base._addr())
+
+    def __init__(self, object base, uintptr_t addr):
+        self._base = base,
+        self._ptr = <ArrowSchema*>addr
+
+    def _addr(self):
+        return <uintptr_t>self._ptr
+
+    def is_valid(self):
+        return self._ptr.release != NULL
+
+    def _assert_valid(self):
+        if self._ptr.release == NULL:
+            raise RuntimeError("schema is released")
+
+    def __repr__(self):
+        cdef int64_t n_chars = ArrowSchemaToString(self._ptr, NULL, 0, True)
+        cdef char* out = <char*>PyMem_Malloc(n_chars + 1)
+        if not out:
+            raise MemoryError()
+
+        ArrowSchemaToString(self._ptr, out, n_chars + 1, True)
+        out_str = out.decode("UTF-8")
+        PyMem_Free(out)
+
+        return out_str
+
+    @property
+    def format(self):
+        self._assert_valid()
+        if self._ptr.format != NULL:
+            return self._ptr.format.decode("UTF-8")
+
+    @property
+    def name(self):
+        self._assert_valid()
+        if self._ptr.name != NULL:
+            return self._ptr.name.decode("UTF-8")
+        else:
+            return None
+
+    @property
+    def flags(self):
+        return self._ptr.flags
+
+    @property
+    def children(self):
+        self._assert_valid()
+        return CSchemaChildren(self)
+
+    def parse(self):
+        self._assert_valid()
+
+        cdef ArrowError error
+        cdef ArrowSchemaView schema_view
+
+        cdef int result = ArrowSchemaViewInit(&schema_view, self._ptr, &error)
+        if result != NANOARROW_OK:
+            raise ValueError(ArrowErrorMessage(&error))
+
+        out = {
+            'name': self._ptr.name.decode('UTF-8') if self._ptr.name else None,
+            'type': ArrowTypeString(schema_view.type).decode('UTF-8'),
+            'storage_type': 
ArrowTypeString(schema_view.storage_type).decode('UTF-8')
+        }
+
+        if schema_view.storage_type in (NANOARROW_TYPE_FIXED_SIZE_LIST,
+                                        NANOARROW_TYPE_FIXED_SIZE_BINARY):
+            out['fixed_size'] = schema_view.fixed_size
+
+        if schema_view.storage_type in (NANOARROW_TYPE_DECIMAL128,
+                                        NANOARROW_TYPE_DECIMAL256):
+            out['decimal_bitwidth'] = schema_view.decimal_bitwidth
+            out['decimal_precision'] = schema_view.decimal_precision
+            out['decimal_scale'] = schema_view.decimal_scale
+
+        return out
+
+cdef class CArray:
+    cdef object _base
+    cdef ArrowArray* _ptr
+    cdef CSchema _schema
+
+    @staticmethod
+    def Empty(CSchema schema):
+        base = CArrayHolder()
+        return CArray(base, base._addr(), schema)
+
+    def __init__(self, object base, uintptr_t addr, CSchema schema):
+        self._base = base,
+        self._ptr = <ArrowArray*>addr
+        self._schema = schema
+
+    def _addr(self):
+        return <uintptr_t>self._ptr
+
+    def is_valid(self):
+        return self._ptr.release != NULL
+
+    def _assert_valid(self):
+        if self._ptr.release == NULL:
+            raise RuntimeError("Array is released")
+
+    @property
+    def schema(self):
+        return self._schema
+
+    @property
+    def children(self):
+        return CArrayChildren(self)
+
+    def validate(self):
+        cdef CArrayViewHolder holder = CArrayViewHolder()
+
+        cdef ArrowError error
+        cdef int result = ArrowArrayViewInitFromSchema(&holder.c_array_view,
+                                                       self._schema._ptr, 
&error)
+        if result != NANOARROW_OK:
+            raise ValueError(ArrowErrorMessage(&error))
+
+        result = ArrowArrayViewSetArray(&holder.c_array_view, self._ptr, 
&error)
+        if result != NANOARROW_OK:
+            raise ValueError(ArrowErrorMessage(&error))
+
+        return CArrayView(holder, holder._addr(), self)

Review Comment:
   It sounds a bit strange that "validate" returns a array view object.
   
   (I would personally also add all properties of CArrayView like length, 
buffers, etc to the CArray class itself)



##########
python/tests/test_nanoarrow.py:
##########
@@ -1,27 +1,155 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import numpy as np
 import pyarrow as pa
 
-import nanoarrow
+import nanoarrow as na
 
 import pytest
 
+def test_version():
+    assert(na.version() == "0.2.0-SNAPSHOT")
 
 def test_as_numpy_array():
-    
+
     arr = pa.array([1, 2, 3])
-    result = nanoarrow.as_numpy_array(arr)
+    result = na.as_numpy_array(arr)
     expected = arr.to_numpy()
     np.testing.assert_array_equal(result, expected)
 
     arr = pa.array([1, 2, 3], pa.uint8())
-    result = nanoarrow.as_numpy_array(arr)
+    result = na.as_numpy_array(arr)
     expected = arr.to_numpy()
     np.testing.assert_array_equal(result, expected)
 
     arr = pa.array([1, 2, None])
     with pytest.raises(ValueError, match="Cannot convert array with nulls"):
-        nanoarrow.as_numpy_array(arr)
+        na.as_numpy_array(arr)
 
     arr = pa.array([[1], [2, 3]])
     with pytest.raises(TypeError, match="Cannot convert a non-primitive 
array"):
-        nanoarrow.as_numpy_array(arr)
+        na.as_numpy_array(arr)
+
+def test_schema_basic():
+    # Blank invalid schema
+    schema = na.CSchema.Empty()
+    assert(schema.is_valid() is False)

Review Comment:
   ```suggestion
       assert schema.is_valid() is False
   ```
   
   Styling nitpick: those brackets are not needed
   
   (we should add black as pre-commit hook like in adbc, and that should also 
take care of this, I think: 
https://github.com/apache/arrow-adbc/blob/main/.pre-commit-config.yaml#L65)



##########
python/tests/test_nanoarrow.py:
##########
@@ -1,27 +1,155 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import numpy as np
 import pyarrow as pa
 
-import nanoarrow
+import nanoarrow as na
 
 import pytest
 
+def test_version():
+    assert(na.version() == "0.2.0-SNAPSHOT")
 
 def test_as_numpy_array():
-    
+
     arr = pa.array([1, 2, 3])
-    result = nanoarrow.as_numpy_array(arr)
+    result = na.as_numpy_array(arr)
     expected = arr.to_numpy()
     np.testing.assert_array_equal(result, expected)
 
     arr = pa.array([1, 2, 3], pa.uint8())
-    result = nanoarrow.as_numpy_array(arr)
+    result = na.as_numpy_array(arr)
     expected = arr.to_numpy()
     np.testing.assert_array_equal(result, expected)
 
     arr = pa.array([1, 2, None])
     with pytest.raises(ValueError, match="Cannot convert array with nulls"):
-        nanoarrow.as_numpy_array(arr)
+        na.as_numpy_array(arr)
 
     arr = pa.array([[1], [2, 3]])
     with pytest.raises(TypeError, match="Cannot convert a non-primitive 
array"):
-        nanoarrow.as_numpy_array(arr)
+        na.as_numpy_array(arr)
+
+def test_schema_basic():
+    # Blank invalid schema
+    schema = na.CSchema.Empty()
+    assert(schema.is_valid() is False)
+    assert(repr(schema) == "[invalid: schema is released]")
+
+    pa_schema = pa.schema([pa.field("some_name", pa.int32())])
+    pa_schema._export_to_c(schema._addr())
+
+    assert(schema.format == "+s")
+    assert(schema.flags == 0)
+    assert(len(schema.children) == 1)
+    assert(schema.children[0].format == "i")
+    assert(schema.children[0].name == "some_name")
+    assert(repr(schema.children[0]) == "int32")
+
+    with pytest.raises(IndexError):
+        schema.children[1]
+
+def test_schema_parse():
+    schema = na.CSchema.Empty()
+    with pytest.raises(RuntimeError):
+        schema.parse()
+
+    pa.schema([pa.field("col1", pa.int32())])._export_to_c(schema._addr())
+
+    info = schema.parse()
+    assert(info['type'] == 'struct')
+    assert(info['storage_type'] == 'struct')
+    assert(info['name'] == '')
+
+    # Check on the child
+    child = schema.children[0]
+    child_info = child.parse()
+    assert(child_info['type'] == 'int32')
+    assert(child_info['storage_type'] == 'int32')
+    assert(child_info['name'] == 'col1')
+
+def test_schema_info_params():
+    schema = na.CSchema.Empty()
+    pa.binary(12)._export_to_c(schema._addr())

Review Comment:
   It might be nice to add a `from_pyarrow`-like helper to avoid this constant 
pattern of creating an empty and calling export_to_c (this could also be just 
an internal helper for the tests if we don't want to expose that publicly (yet))



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow-nanoarrow] jorisvandenbossche commented on a diff in pull request #117: feat(python): Python schema, array, and array view skeleton

Reply via email to