jorisvandenbossche commented on code in PR #38008:
URL: https://github.com/apache/arrow/pull/38008#discussion_r1423729406
##########
python/pyarrow/array.pxi:
##########
@@ -3586,6 +3586,160 @@ class FixedShapeTensorArray(ExtensionArray):
)
+cdef class VariableShapeTensorArray(ExtensionArray):
+ """
+ Concrete class for variable shape tensor extension arrays.
+
+ Examples
+ --------
+ Define the extension type for tensor array
+
+ >>> import pyarrow as pa
+ >>> tensor_type = pa.variable_shape_tensor(pa.float64(), 2)
+
+ Create an extension array
+
+ >>> shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.int32(), 2))
+ >>> values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.float64()))
+ >>> arr = pa.StructArray.from_arrays([shapes, values], names=["shape",
"data"])
+ >>> pa.ExtensionArray.from_storage(tensor_type, arr)
+ <pyarrow.lib.VariableShapeTensorArray object at ...>
+ -- is_valid: all not null
+ -- child 0 type: fixed_size_list<item: int32>[2]
+ [
+ [
+ 2,
+ 3
+ ],
+ [
+ 1,
+ 2
+ ]
+ ]
+ -- child 1 type: list<item: double>
+ [
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6
+ ],
+ [
+ 7,
+ 8
+ ]
+ ]
+ """
+
+ def to_numpy_ndarray(self):
+ """
+ Convert variable shape tensor extension array to list of numpy arrays.
+ If permutation is non-trivial a strided numpy arrays are produced.
+ """
+ tensors = []
+ for i in range(len(self.storage)):
+ tensors.append(self.get_tensor(i).to_numpy())
+
+ return tensors
+
+ def get_tensor(self, int64_t i):
+ """
+ Get i-th tensor from variable shape tensor extension array.
+
+ Parameters
+ ----------
+ i : int64_t
+ The index of the tensor to get.
+
+ Returns
+ -------
+ tensor : pyarrow.Tensor
+ """
+ cdef:
+ CVariableShapeTensorArray* ext_array =
<CVariableShapeTensorArray*>(self.ap)
+ CResult[shared_ptr[CTensor]] ctensor
+ with nogil:
+ ctensor = ext_array.GetTensor(i)
+ return pyarrow_wrap_tensor(GetResultValue(ctensor))
+
+ @staticmethod
+ def from_numpy_ndarray(obj):
+ """
+ Convert a list of numpy arrays ndarrays to a variable shape tensor
extension array.
+ The length of the list will become the length of the variable shape
tensor array.
+
+ Parameters
+ ----------
+ obj : list(numpy.ndarray)
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> import numpy as np
+ >>> ndarray_list = [
+ ... np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32),
+ ... np.array([[7, 8]], dtype=np.float32),
+ ... ]
+ >>> pa.VariableShapeTensorArray.from_numpy_ndarray(ndarray_list)
+ <pyarrow.lib.VariableShapeTensorArray object at ...>
+ -- is_valid: all not null
+ -- child 0 type: fixed_size_list<item: int32>[2]
+ [
+ [
+ 2,
+ 3
+ ],
+ [
+ 1,
+ 2
+ ]
+ ]
+ -- child 1 type: list<item: float>
+ [
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6
+ ],
+ [
+ 7,
+ 8
+ ]
+ ]
+ """
+ assert isinstance(obj, list), 'obj must be a list of numpy arrays'
+ numpy_type = obj[0].dtype
+ ndim = obj[0].ndim
+ bw = numpy_type.itemsize
+
+ permutation = (-np.array(obj[0].strides)).argsort()
+
+ if not all([o.dtype == numpy_type for o in obj]):
+ raise TypeError('All numpy arrays need to have the same dtype.')
+
+ if not all([o.ndim == ndim for o in obj]):
+ raise ValueError('All numpy arrays need to have the same ndim.')
+
+ if not all([np.array_equal((-np.array(o.strides)).argsort(),
permutation) for o in obj]):
+ raise ValueError('All numpy arrays need to have the same
permutation.')
+
+ arrow_type=from_numpy_dtype(numpy_type)
+ values=array([np.lib.stride_tricks.as_strided(
+ o, shape=(np.prod(o.shape),), strides=(bw,)) for o in obj],
list_(arrow_type))
+ shapes=array([o.shape for o in obj], list_(int32(), list_size=ndim))
+ struct_arr=StructArray.from_arrays([shapes, values], names=["shape",
"data"])
Review Comment:
Can you apply some PEP8 formatting here?
##########
python/pyarrow/array.pxi:
##########
@@ -3586,6 +3586,160 @@ class FixedShapeTensorArray(ExtensionArray):
)
+cdef class VariableShapeTensorArray(ExtensionArray):
+ """
+ Concrete class for variable shape tensor extension arrays.
+
+ Examples
+ --------
+ Define the extension type for tensor array
+
+ >>> import pyarrow as pa
+ >>> tensor_type = pa.variable_shape_tensor(pa.float64(), 2)
+
+ Create an extension array
+
+ >>> shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.int32(), 2))
+ >>> values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.float64()))
+ >>> arr = pa.StructArray.from_arrays([shapes, values], names=["shape",
"data"])
+ >>> pa.ExtensionArray.from_storage(tensor_type, arr)
+ <pyarrow.lib.VariableShapeTensorArray object at ...>
+ -- is_valid: all not null
+ -- child 0 type: fixed_size_list<item: int32>[2]
+ [
+ [
+ 2,
+ 3
+ ],
+ [
+ 1,
+ 2
+ ]
+ ]
+ -- child 1 type: list<item: double>
+ [
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6
+ ],
+ [
+ 7,
+ 8
+ ]
+ ]
+ """
+
+ def to_numpy_ndarray(self):
+ """
+ Convert variable shape tensor extension array to list of numpy arrays.
+ If permutation is non-trivial a strided numpy arrays are produced.
+ """
+ tensors = []
+ for i in range(len(self.storage)):
+ tensors.append(self.get_tensor(i).to_numpy())
+
+ return tensors
+
+ def get_tensor(self, int64_t i):
+ """
+ Get i-th tensor from variable shape tensor extension array.
+
+ Parameters
+ ----------
+ i : int64_t
+ The index of the tensor to get.
+
+ Returns
+ -------
+ tensor : pyarrow.Tensor
+ """
+ cdef:
+ CVariableShapeTensorArray* ext_array =
<CVariableShapeTensorArray*>(self.ap)
+ CResult[shared_ptr[CTensor]] ctensor
+ with nogil:
+ ctensor = ext_array.GetTensor(i)
+ return pyarrow_wrap_tensor(GetResultValue(ctensor))
+
+ @staticmethod
+ def from_numpy_ndarray(obj):
+ """
+ Convert a list of numpy arrays ndarrays to a variable shape tensor
extension array.
+ The length of the list will become the length of the variable shape
tensor array.
+
+ Parameters
+ ----------
+ obj : list(numpy.ndarray)
Review Comment:
```suggestion
obj : list of numpy.ndarray
```
##########
python/pyarrow/types.pxi:
##########
@@ -4948,6 +5028,122 @@ def fixed_shape_tensor(DataType value_type, shape,
dim_names=None, permutation=N
return out
+def variable_shape_tensor(DataType value_type, ndim, dim_names=None,
permutation=None,
+ uniform_shape=None):
+ """
+ Create instance of variable shape tensor extension type with number of
+ dimensions and optional names of tensor dimensions and indices of the
+ desired logical ordering of dimensions.
+
+ Parameters
+ ----------
+ value_type : DataType
+ Data type of individual tensor elements.
+ ndim : integer
+ The number of dimensions of the contained tensors.
+ dim_names : tuple or list of strings, default None
+ Explicit names to tensor dimensions.
+ permutation : tuple or list integers, default None
+ Indices of the desired ordering of the original dimensions.
+ The indices contain a permutation of the values ``[0, 1, .., N-1]``
where
+ N is the number of dimensions. The permutation indicates which
dimension
+ of the logical layout corresponds to which dimension of the physical
tensor.
+ For more information on this parameter see
+ :ref:`fixed_shape_tensor_extension`.
+ uniform_shape : tuple or list of integers, default None
+ Shape of dimensions that are guaranteed to stay constant over all
tensors
+ in the array if all their non-dimensions sizes were replaced by None.
Review Comment:
```suggestion
in the array if all their non-uniform sizes were replaced by None.
```
##########
python/pyarrow/array.pxi:
##########
@@ -3586,6 +3586,156 @@ class FixedShapeTensorArray(ExtensionArray):
)
+cdef class VariableShapeTensorArray(ExtensionArray):
+ """
+ Concrete class for variable shape tensor extension arrays.
+
+ Examples
+ --------
+ Define the extension type for tensor array
+
+ >>> import pyarrow as pa
+ >>> tensor_type = pa.variable_shape_tensor(pa.int32(), 2)
+
+ Create an extension array
+
+ >>> shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.uint32(), 2))
+ >>> values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.int32()))
+ >>> arr = pa.StructArray.from_arrays([shapes, values], names=["shape",
"data"])
+ >>> pa.ExtensionArray.from_storage(tensor_type, arr)
+ <pyarrow.lib.VariableShapeTensorArray object at ...>
+ -- is_valid: all not null
+ -- child 0 type: fixed_size_list<item: uint32>[2]
+ [
+ [
+ 2,
+ 3
+ ],
+ [
+ 1,
+ 2
+ ]
+ ]
+ -- child 1 type: list<item: int32>
+ [
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6
+ ],
+ [
+ 7,
+ 8
+ ]
+ ]
+ """
+
+ def to_numpy_ndarray(self):
+ """
+ Convert variable shape tensor extension array to list of numpy arrays.
+ """
+ tensors = []
+ for i in range(len(self.storage)):
+ tensors.append(self.get_tensor(i).to_numpy())
+
+ return tensors
+
+ def get_tensor(self, int64_t i):
+ """
+ Get i-th tensor from variable shape tensor extension array.
+
+ Parameters
+ ----------
+ i : int64_t
+ The index of the tensor to get.
+
+ Returns
+ -------
+ tensor : pyarrow.Tensor
+ """
+ cdef:
+ CVariableShapeTensorArray* ext_array =
<CVariableShapeTensorArray*>(self.ap)
+ CResult[shared_ptr[CTensor]] ctensor
+ with nogil:
+ ctensor = ext_array.GetTensor(i)
Review Comment:
> > What if the i-th value is null?
>
> It returns a tensor with all dimension sizes == 0. Added a test.
Should it return None instead on the Python side? (a dim=0 tensor could in
theory be an actual value)
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1317,6 +1317,54 @@ def test_tensor_type():
assert tensor_type.dim_names == ['C', 'H', 'W']
assert tensor_type.permutation is None
+ tensor_type = pa.variable_shape_tensor(pa.int8(), 2)
Review Comment:
Can you make this its own test (and rename the one above to be specific
about fixed shape tensor)?
##########
python/pyarrow/types.pxi:
##########
@@ -4948,6 +5028,122 @@ def fixed_shape_tensor(DataType value_type, shape,
dim_names=None, permutation=N
return out
+def variable_shape_tensor(DataType value_type, ndim, dim_names=None,
permutation=None,
+ uniform_shape=None):
+ """
+ Create instance of variable shape tensor extension type with number of
+ dimensions and optional names of tensor dimensions and indices of the
+ desired logical ordering of dimensions.
+
+ Parameters
+ ----------
+ value_type : DataType
+ Data type of individual tensor elements.
+ ndim : integer
+ The number of dimensions of the contained tensors.
+ dim_names : tuple or list of strings, default None
+ Explicit names to tensor dimensions.
+ permutation : tuple or list integers, default None
+ Indices of the desired ordering of the original dimensions.
+ The indices contain a permutation of the values ``[0, 1, .., N-1]``
where
+ N is the number of dimensions. The permutation indicates which
dimension
+ of the logical layout corresponds to which dimension of the physical
tensor.
+ For more information on this parameter see
+ :ref:`fixed_shape_tensor_extension`.
+ uniform_shape : tuple or list of integers, default None
+ Shape of dimensions that are guaranteed to stay constant over all
tensors
+ in the array if all their non-dimensions sizes were replaced by None.
+
+ Examples
+ --------
+ Create an instance of variable shape tensor extension type:
+
+ >>> import pyarrow as pa
+ >>> tensor_type = pa.variable_shape_tensor(pa.int32(), 2)
+ >>> tensor_type
+ VariableShapeTensorType(extension<arrow.variable_shape_tensor>)
+
+ Inspect the data type:
+
+ >>> tensor_type.value_type
+ DataType(int32)
+ >>> tensor_type.ndim
+ 2
+
+ Create a table with variable shape tensor extension array:
+
+ >>> fields = [pa.field("shape", pa.list_(pa.int32(), 2)), pa.field("data",
pa.list_(pa.int32()))]
+ >>> storage = pa.array([([2, 3], [1, 2, 3, 4, 5, 6]), ([1, 2], [7, 8])],
type=pa.struct(fields))
+ >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage)
+ >>> pa.table([tensor], names=["tensor_array"])
+ pyarrow.Table
+ tensor_array: extension<arrow.variable_shape_tensor>
+ ----
+ tensor_array: [ -- is_valid: all not null
+ -- child 0 type: fixed_size_list<item: int32>[2]
+ [[2,3],[1,2]]
+ -- child 1 type: list<item: int32>
+ [[1,2,3,4,5,6],[7,8]]]
+
+ Create an instance of variable shape tensor extension type with names
+ of tensor dimensions:
+
+ >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3,
+ ... dim_names=['C', 'H', 'W'])
Review Comment:
```suggestion
... dim_names=['C', 'H', 'W'])
```
##########
python/pyarrow/array.pxi:
##########
@@ -3586,6 +3586,160 @@ class FixedShapeTensorArray(ExtensionArray):
)
+cdef class VariableShapeTensorArray(ExtensionArray):
+ """
+ Concrete class for variable shape tensor extension arrays.
+
+ Examples
+ --------
+ Define the extension type for tensor array
+
+ >>> import pyarrow as pa
+ >>> tensor_type = pa.variable_shape_tensor(pa.float64(), 2)
+
+ Create an extension array
+
+ >>> shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.int32(), 2))
+ >>> values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.float64()))
+ >>> arr = pa.StructArray.from_arrays([shapes, values], names=["shape",
"data"])
+ >>> pa.ExtensionArray.from_storage(tensor_type, arr)
+ <pyarrow.lib.VariableShapeTensorArray object at ...>
+ -- is_valid: all not null
+ -- child 0 type: fixed_size_list<item: int32>[2]
+ [
+ [
+ 2,
+ 3
+ ],
+ [
+ 1,
+ 2
+ ]
+ ]
+ -- child 1 type: list<item: double>
+ [
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6
+ ],
+ [
+ 7,
+ 8
+ ]
+ ]
+ """
+
+ def to_numpy_ndarray(self):
+ """
+ Convert variable shape tensor extension array to list of numpy arrays.
+ If permutation is non-trivial a strided numpy arrays are produced.
+ """
+ tensors = []
+ for i in range(len(self.storage)):
+ tensors.append(self.get_tensor(i).to_numpy())
+
+ return tensors
+
+ def get_tensor(self, int64_t i):
+ """
+ Get i-th tensor from variable shape tensor extension array.
+
+ Parameters
+ ----------
+ i : int64_t
+ The index of the tensor to get.
+
+ Returns
+ -------
+ tensor : pyarrow.Tensor
+ """
+ cdef:
+ CVariableShapeTensorArray* ext_array =
<CVariableShapeTensorArray*>(self.ap)
+ CResult[shared_ptr[CTensor]] ctensor
+ with nogil:
+ ctensor = ext_array.GetTensor(i)
+ return pyarrow_wrap_tensor(GetResultValue(ctensor))
+
+ @staticmethod
+ def from_numpy_ndarray(obj):
+ """
+ Convert a list of numpy arrays ndarrays to a variable shape tensor
extension array.
+ The length of the list will become the length of the variable shape
tensor array.
+
+ Parameters
+ ----------
+ obj : list(numpy.ndarray)
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> import numpy as np
+ >>> ndarray_list = [
+ ... np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32),
+ ... np.array([[7, 8]], dtype=np.float32),
+ ... ]
+ >>> pa.VariableShapeTensorArray.from_numpy_ndarray(ndarray_list)
Review Comment:
Can you maybe also show the type of the resulting array? (which shows the
inferred dimension)
##########
python/pyarrow/types.pxi:
##########
@@ -1591,6 +1591,100 @@ cdef class ExtensionType(BaseExtensionType):
return ExtensionScalar
+cdef class VariableShapeTensorType(BaseExtensionType):
+ """
+ Concrete class for variable shape tensor extension type.
+
+ Examples
+ --------
+ Create an instance of variable shape tensor extension type:
+
+ >>> import pyarrow as pa
+ >>> pa.variable_shape_tensor(pa.int32(), 2)
+ VariableShapeTensorType(extension<arrow.variable_shape_tensor>)
Review Comment:
Can you add a repr similar to the one we have for FixedShapeTensorType (that
includes the parametrization)?
##########
python/pyarrow/types.pxi:
##########
@@ -4948,6 +5028,122 @@ def fixed_shape_tensor(DataType value_type, shape,
dim_names=None, permutation=N
return out
+def variable_shape_tensor(DataType value_type, ndim, dim_names=None,
permutation=None,
+ uniform_shape=None):
+ """
+ Create instance of variable shape tensor extension type with number of
+ dimensions and optional names of tensor dimensions and indices of the
+ desired logical ordering of dimensions.
+
+ Parameters
+ ----------
+ value_type : DataType
+ Data type of individual tensor elements.
+ ndim : integer
+ The number of dimensions of the contained tensors.
+ dim_names : tuple or list of strings, default None
+ Explicit names to tensor dimensions.
+ permutation : tuple or list integers, default None
+ Indices of the desired ordering of the original dimensions.
+ The indices contain a permutation of the values ``[0, 1, .., N-1]``
where
+ N is the number of dimensions. The permutation indicates which
dimension
+ of the logical layout corresponds to which dimension of the physical
tensor.
+ For more information on this parameter see
+ :ref:`fixed_shape_tensor_extension`.
Review Comment:
Update link to variable shape tensor
##########
python/pyarrow/types.pxi:
##########
@@ -4948,6 +5028,122 @@ def fixed_shape_tensor(DataType value_type, shape,
dim_names=None, permutation=N
return out
+def variable_shape_tensor(DataType value_type, ndim, dim_names=None,
permutation=None,
+ uniform_shape=None):
+ """
+ Create instance of variable shape tensor extension type with number of
+ dimensions and optional names of tensor dimensions and indices of the
+ desired logical ordering of dimensions.
+
+ Parameters
+ ----------
+ value_type : DataType
+ Data type of individual tensor elements.
+ ndim : integer
+ The number of dimensions of the contained tensors.
+ dim_names : tuple or list of strings, default None
+ Explicit names to tensor dimensions.
+ permutation : tuple or list integers, default None
+ Indices of the desired ordering of the original dimensions.
+ The indices contain a permutation of the values ``[0, 1, .., N-1]``
where
+ N is the number of dimensions. The permutation indicates which
dimension
+ of the logical layout corresponds to which dimension of the physical
tensor.
+ For more information on this parameter see
+ :ref:`fixed_shape_tensor_extension`.
+ uniform_shape : tuple or list of integers, default None
+ Shape of dimensions that are guaranteed to stay constant over all
tensors
+ in the array if all their non-dimensions sizes were replaced by None.
+
+ Examples
+ --------
+ Create an instance of variable shape tensor extension type:
+
+ >>> import pyarrow as pa
+ >>> tensor_type = pa.variable_shape_tensor(pa.int32(), 2)
+ >>> tensor_type
+ VariableShapeTensorType(extension<arrow.variable_shape_tensor>)
+
+ Inspect the data type:
+
+ >>> tensor_type.value_type
+ DataType(int32)
+ >>> tensor_type.ndim
+ 2
+
+ Create a table with variable shape tensor extension array:
+
+ >>> fields = [pa.field("shape", pa.list_(pa.int32(), 2)), pa.field("data",
pa.list_(pa.int32()))]
+ >>> storage = pa.array([([2, 3], [1, 2, 3, 4, 5, 6]), ([1, 2], [7, 8])],
type=pa.struct(fields))
+ >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage)
+ >>> pa.table([tensor], names=["tensor_array"])
+ pyarrow.Table
+ tensor_array: extension<arrow.variable_shape_tensor>
+ ----
+ tensor_array: [ -- is_valid: all not null
+ -- child 0 type: fixed_size_list<item: int32>[2]
+ [[2,3],[1,2]]
+ -- child 1 type: list<item: int32>
+ [[1,2,3,4,5,6],[7,8]]]
+
+ Create an instance of variable shape tensor extension type with names
+ of tensor dimensions:
+
+ >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3,
+ ... dim_names=['C', 'H', 'W'])
+ >>> tensor_type.dim_names
+ ['C', 'H', 'W']
+
+ Create an instance of variable shape tensor extension type with
+ permutation:
+
+ >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3,
+ ... permutation=[0, 2, 1])
Review Comment:
```suggestion
... permutation=[0, 2, 1])
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]