jorisvandenbossche commented on code in PR #34883:
URL: https://github.com/apache/arrow/pull/34883#discussion_r1158076126
##########
python/pyarrow/array.pxi:
##########
@@ -3090,6 +3090,71 @@ cdef class ExtensionArray(Array):
return self.storage.to_numpy(**kwargs)
+class FixedShapeTensorArray(ExtensionArray):
+ """
+ Concrete class for fixed shape tensor extension arrays.
+
+ Examples
+ --------
+ Define the extension type for tensor array
+
+ >>> import pyarrow as pa
+ >>> tensor_type = FixedShapeTensorType(pa.int32(), [2, 2])
+
+ Create an extension array
+
+ >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
+ >>> storage = pa.array(arr, pa.list_(pa.int32(), 4))
+ >>> pa.ExtensionArray.from_storage(tensor_type, storage)
+ <pyarrow.lib.FixedShapeTensorArray object at ...>
+ [
+ [
+ 1,
+ 2,
+ 3,
+ 4
+ ],
+ [
+ 10,
+ 20,
+ 30,
+ 40
+ ],
+ [
+ 100,
+ 200,
+ 300,
+ 400
+ ]
+ ]
+ """
+
+ def to_numpy_ndarray(self):
+ """
+ Convert fixed shape tensor extension array to a numpy array (with dim+1).
+ """
+ np_flat = np.asarray(self.storage.values)
+ numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape),
+ order='C')
+
+ return numpy_tensor
+
+ def from_numpy_ndarray(obj):
+ """
+ Convert numpy tensors (ndarrays) to a fixed shape tensor extension array.
+ """
+ numpy_type = obj.flatten().dtype
+ arrow_type = from_numpy_dtype(numpy_type)
Review Comment:
```suggestion
arrow_type = from_numpy_dtype(obj.dtype)
```
You don't need to flatten the array to get the dtype: `obj.dtype` is the same as `obj.flatten().dtype`, and it avoids the copy that `flatten()` makes.
##########
python/pyarrow/array.pxi:
##########
@@ -3090,6 +3090,71 @@ cdef class ExtensionArray(Array):
return self.storage.to_numpy(**kwargs)
+class FixedShapeTensorArray(ExtensionArray):
+ """
+ Concrete class for fixed shape tensor extension arrays.
+
+ Examples
+ --------
+ Define the extension type for tensor array
+
+ >>> import pyarrow as pa
+ >>> tensor_type = FixedShapeTensorType(pa.int32(), [2, 2])
+
+ Create an extension array
+
+ >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
+ >>> storage = pa.array(arr, pa.list_(pa.int32(), 4))
+ >>> pa.ExtensionArray.from_storage(tensor_type, storage)
+ <pyarrow.lib.FixedShapeTensorArray object at ...>
+ [
+ [
+ 1,
+ 2,
+ 3,
+ 4
+ ],
+ [
+ 10,
+ 20,
+ 30,
+ 40
+ ],
+ [
+ 100,
+ 200,
+ 300,
+ 400
+ ]
+ ]
+ """
+
+ def to_numpy_ndarray(self):
+ """
+ Convert fixed shape tensor extension array to a numpy array (with dim+1).
+ """
+ np_flat = np.asarray(self.storage.values)
+ numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape),
+ order='C')
+
+ return numpy_tensor
+
+ def from_numpy_ndarray(obj):
+ """
+ Convert numpy tensors (ndarrays) to a fixed shape tensor extension array.
+ """
+ numpy_type = obj.flatten().dtype
+ arrow_type = from_numpy_dtype(numpy_type)
+ shape = obj.shape[1:]
+ size = obj.size / obj.shape[0]
+
+ return ExtensionArray.from_storage(
+ FixedShapeTensorType(arrow_type, shape),
+ array([t.flatten() for t in obj],
+ list_(arrow_type, size))
Review Comment:
Can you try to use `pa.FixedSizeListArray.from_arrays(..)` to do this
zero-copy?
##########
python/pyarrow/types.pxi:
##########
@@ -4543,6 +4615,100 @@ def run_end_encoded(run_end_type, value_type):
return pyarrow_wrap_data_type(ree_type)
+def fixedshapetensor(DataType value_type, shape, dim_names=None,
permutation=None):
Review Comment:
```suggestion
def fixed_shape_tensor(DataType value_type, shape, dim_names=None,
permutation=None):
```
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1127,3 +1127,86 @@ def test_cpp_extension_in_python(tmpdir):
reconstructed_array = batch.column(0)
assert reconstructed_array.type == uuid_type
assert reconstructed_array == array
+
+
+def test_tensor_type():
+ tensor_type = pa.FixedShapeTensorType(pa.int8(), (2, 3))
+ assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
+ assert tensor_type.storage_type == pa.list_(pa.int8(), 6)
+
+
+def test_tensor_class_methods():
+ tensor_type = pa.FixedShapeTensorType(pa.float32(), (2, 3))
+ storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]],
+ pa.list_(pa.float32(), 6))
+ arr = pa.ExtensionArray.from_storage(tensor_type, storage)
+ expected = np.array(
+ [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32)
+
+ result = arr.to_numpy_ndarray()
+ np.testing.assert_array_equal(result, expected)
+
+ tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(expected)
+ assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType)
+ assert tensor_array_from_numpy.type.value_type == pa.float32()
+ assert tensor_array_from_numpy.type.shape == [2, 3]
Review Comment:
Can you add some more tests for `from_numpy_ndarray`? (For example, starting
from a numpy array that isn't C-contiguous.)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]