rok commented on code in PR #38008:
URL: https://github.com/apache/arrow/pull/38008#discussion_r1426742968
##########
python/pyarrow/array.pxi:
##########
@@ -3586,6 +3586,160 @@ class FixedShapeTensorArray(ExtensionArray):
)
+cdef class VariableShapeTensorArray(ExtensionArray):
+ """
+ Concrete class for variable shape tensor extension arrays.
+
+ Examples
+ --------
+ Define the extension type for tensor array
+
+ >>> import pyarrow as pa
+ >>> tensor_type = pa.variable_shape_tensor(pa.float64(), 2)
+
+ Create an extension array
+
+ >>> shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.int32(), 2))
+ >>> values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.float64()))
+ >>> arr = pa.StructArray.from_arrays([shapes, values], names=["shape", "data"])
+ >>> pa.ExtensionArray.from_storage(tensor_type, arr)
+ <pyarrow.lib.VariableShapeTensorArray object at ...>
+ -- is_valid: all not null
+ -- child 0 type: fixed_size_list<item: int32>[2]
+ [
+ [
+ 2,
+ 3
+ ],
+ [
+ 1,
+ 2
+ ]
+ ]
+ -- child 1 type: list<item: double>
+ [
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6
+ ],
+ [
+ 7,
+ 8
+ ]
+ ]
+ """
+
+ def to_numpy_ndarray(self):
+ """
+ Convert variable shape tensor extension array to list of numpy arrays.
+ If permutation is non-trivial, strided numpy arrays are produced.
+ """
+ tensors = []
+ for i in range(len(self.storage)):
+ tensors.append(self.get_tensor(i).to_numpy())
+
+ return tensors
+
+ def get_tensor(self, int64_t i):
+ """
+ Get i-th tensor from variable shape tensor extension array.
+
+ Parameters
+ ----------
+ i : int64_t
+ The index of the tensor to get.
+
+ Returns
+ -------
+ tensor : pyarrow.Tensor
+ """
+ cdef:
+ CVariableShapeTensorArray* ext_array = <CVariableShapeTensorArray*>(self.ap)
+ CResult[shared_ptr[CTensor]] ctensor
+ with nogil:
+ ctensor = ext_array.GetTensor(i)
+ return pyarrow_wrap_tensor(GetResultValue(ctensor))
+
+ @staticmethod
+ def from_numpy_ndarray(obj):
+ """
+ Convert a list of numpy ndarrays to a variable shape tensor extension array.
+ The length of the list will become the length of the variable shape tensor array.
+
+ Parameters
+ ----------
+ obj : list(numpy.ndarray)
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> import numpy as np
+ >>> ndarray_list = [
+ ... np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32),
+ ... np.array([[7, 8]], dtype=np.float32),
+ ... ]
+ >>> pa.VariableShapeTensorArray.from_numpy_ndarray(ndarray_list)
+ <pyarrow.lib.VariableShapeTensorArray object at ...>
+ -- is_valid: all not null
+ -- child 0 type: fixed_size_list<item: int32>[2]
+ [
+ [
+ 2,
+ 3
+ ],
+ [
+ 1,
+ 2
+ ]
+ ]
+ -- child 1 type: list<item: float>
+ [
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6
+ ],
+ [
+ 7,
+ 8
+ ]
+ ]
+ """
+ assert isinstance(obj, list), 'obj must be a list of numpy arrays'
+ numpy_type = obj[0].dtype
+ ndim = obj[0].ndim
+ bw = numpy_type.itemsize
+
+ permutation = (-np.array(obj[0].strides)).argsort()
+
+ if not all([o.dtype == numpy_type for o in obj]):
+ raise TypeError('All numpy arrays need to have the same dtype.')
+
+ if not all([o.ndim == ndim for o in obj]):
+ raise ValueError('All numpy arrays need to have the same ndim.')
+
+ if not all([np.array_equal((-np.array(o.strides)).argsort(), permutation) for o in obj]):
+ raise ValueError('All numpy arrays need to have the same permutation.')
+
+ arrow_type=from_numpy_dtype(numpy_type)
+ values=array([np.lib.stride_tricks.as_strided(
+ o, shape=(np.prod(o.shape),), strides=(bw,)) for o in obj], list_(arrow_type))
+ shapes=array([o.shape for o in obj], list_(int32(), list_size=ndim))
+ struct_arr=StructArray.from_arrays([shapes, values], names=["shape", "data"])
Review Comment:
Done.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]