jorisvandenbossche commented on code in PR #34883:
URL: https://github.com/apache/arrow/pull/34883#discussion_r1158473676
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1127,3 +1127,89 @@ def test_cpp_extension_in_python(tmpdir):
reconstructed_array = batch.column(0)
assert reconstructed_array.type == uuid_type
assert reconstructed_array == array
+
+
+def test_tensor_type():
+ tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3])
+ assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
+ assert tensor_type.storage_type == pa.list_(pa.int8(), 6)
+ assert tensor_type.shape == [2, 3]
+ assert tensor_type.dim_names is None
+ assert tensor_type.permutation is None
+
+ tensor_type = pa.fixed_shape_tensor(pa.float64(), [2, 2, 3],
+ permutation=[0, 2, 1])
+ assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
+ assert tensor_type.storage_type == pa.list_(pa.float64(), 12)
+ assert tensor_type.shape == [2, 2, 3]
+ assert tensor_type.dim_names is None
+ assert tensor_type.permutation == [0, 2, 1]
+
+ tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3],
+ dim_names=['C', 'H', 'W'])
+ assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
+ assert tensor_type.storage_type == pa.list_(pa.bool_(), 12)
+ assert tensor_type.shape == [2, 2, 3]
+ assert tensor_type.dim_names == ['C', 'H', 'W']
+ assert tensor_type.permutation is None
+
+
+@pytest.mark.parametrize("numpy_order", ('C', 'F'))
+def test_tensor_class_methods(numpy_order):
+ tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3])
+ storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]],
+ pa.list_(pa.float32(), 6))
+ arr = pa.ExtensionArray.from_storage(tensor_type, storage)
+ expected = np.array(
+ [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32)
+
+ result = arr.to_numpy_ndarray()
+ np.testing.assert_array_equal(result, expected)
+
+ arr = np.array(
+ [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
+ dtype=np.float32, order=numpy_order)
+ tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
+ assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType)
+ assert tensor_array_from_numpy.type.value_type == pa.float32()
+ assert tensor_array_from_numpy.type.shape == [2, 3]
Review Comment:
Can you also test the roundtrip here? That is, convert back to an ndarray
(with `to_numpy_ndarray`) and compare with the original.
I am also not sure this is currently correct for non-C-contiguous arrays. In
`from_numpy_ndarray`, I think we currently just assume we get a C-contiguous
(row-major) array, since that is what the spec assumes. Passing a
non-C-contiguous array as is might give unexpected results.
##########
python/pyarrow/array.pxi:
##########
@@ -3090,6 +3090,69 @@ cdef class ExtensionArray(Array):
return self.storage.to_numpy(**kwargs)
+class FixedShapeTensorArray(ExtensionArray):
+ """
+ Concrete class for fixed shape tensor extension arrays.
+
+ Examples
+ --------
+ Define the extension type for tensor array
+
+ >>> import pyarrow as pa
+ >>> tensor_type = FixedShapeTensorType(pa.int32(), [2, 2])
Review Comment:
This needs to be updated now
##########
python/pyarrow/array.pxi:
##########
@@ -3090,6 +3090,69 @@ cdef class ExtensionArray(Array):
return self.storage.to_numpy(**kwargs)
+class FixedShapeTensorArray(ExtensionArray):
+ """
+ Concrete class for fixed shape tensor extension arrays.
+
+ Examples
+ --------
+ Define the extension type for tensor array
+
+ >>> import pyarrow as pa
+ >>> tensor_type = FixedShapeTensorType(pa.int32(), [2, 2])
+
+ Create an extension array
+
+ >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
+ >>> storage = pa.array(arr, pa.list_(pa.int32(), 4))
+ >>> pa.ExtensionArray.from_storage(tensor_type, storage)
+ <pyarrow.lib.FixedShapeTensorArray object at ...>
+ [
+ [
+ 1,
+ 2,
+ 3,
+ 4
+ ],
+ [
+ 10,
+ 20,
+ 30,
+ 40
+ ],
+ [
+ 100,
+ 200,
+ 300,
+ 400
+ ]
+ ]
+ """
+
+ def to_numpy_ndarray(self):
+ """
+ Convert fixed shape tensor extension array to a numpy array (with
dim+1).
+ """
+ np_flat = np.asarray(self.storage.values)
+ numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape),
+ order='C')
+
+ return numpy_tensor
+
+ def from_numpy_ndarray(obj):
+ """
+ Convert numpy tensors (ndarrays) to a fixed shape tensor extension
array.
Review Comment:
Need to add a Parameters section to make linting happy.
I would also add a bit more explanation, e.g. that the first dimension
becomes the length of the tensor array, etc.
##########
python/pyarrow/types.pxi:
##########
@@ -4543,6 +4623,100 @@ def run_end_encoded(run_end_type, value_type):
return pyarrow_wrap_data_type(ree_type)
+def fixed_shape_tensor(DataType value_type, shape, dim_names=None,
permutation=None):
+ """
+ Create instance of fixed shape tensor extension type with shape and
optional
+ names of tensor dimensions and indices of the desired ordering.
+
+ Parameters
+ ----------
+ value_type : DataType
+ Data type of individual tensor elements.
+ shape : tuple or list of integers
+ The physical shape of the contained tensors.
+ dim_names : tuple or list of strings, default None
+ Explicit names to tensor dimensions.
+ permutation : tuple or list integers, default None
+ Indices of the desired ordering of the original dimensions.
+
+ Examples
+ --------
+ Create an instance of fixed shape tensor extension type:
+
+ >>> import pyarrow as pa
+ >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2])
+ >>> tensor_type
+ FixedShapeTensorType(extension<arrow.fixed_shape_tensor>)
+
+ Inspect the data type:
+
+ >>> tensor_type.value_type
+ DataType(int32)
+ >>> tensor_type.shape
+ [2, 2]
+
+ Create a table with fixed shape tensor extension array:
+
+ >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
+ >>> storage = pa.array(arr, pa.list_(pa.int32(), 4))
+ >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage)
+ >>> pa.table([tensor], names=["tensor_array"])
+ pyarrow.Table
+ tensor_array: extension<arrow.fixed_shape_tensor>
+ ----
+ tensor_array: [[[1,2,3,4],[10,20,30,40],[100,200,300,400]]]
+
+ Create an instance of fixed shape tensor extension type with names
+ of tensor dimensions:
+
+ >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3),
+ ... dim_names=['C', 'H', 'W'])
+ >>> tensor_type.dim_names
+ [b'C', b'H', b'W']
Review Comment:
```suggestion
['C', 'H', 'W']
```
##########
python/pyarrow/includes/libarrow.pxd:
##########
@@ -2619,6 +2619,31 @@ cdef extern from "arrow/extension_type.h" namespace
"arrow":
shared_ptr[CArray] storage()
+cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace
"arrow::extension":
+ cdef cppclass CFixedShapeTensorType \
+ " arrow::extension::FixedShapeTensorType"(CExtensionType):
+
+ @staticmethod
+ CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]&
value_type,
+ const vector[int64_t]& shape,
+ const vector[int64_t]& permutation,
+ const vector[c_string]& dim_names)
+
+ CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType]
storage_type,
+ const c_string&
serialized_data) const
+
+ c_string Serialize() const
+
+ const shared_ptr[CDataType] value_type()
+ const vector[int64_t] shape()
+ const vector[int64_t] permutation()
+ const vector[c_string] dim_names()
+
+ CFixedShapeTensorType(shared_ptr[CDataType]& value_type, int32_t& size,
+ vector[int64_t]& shape, vector[int64_t]&
permutation,
+ vector[c_string]& dim_names)
Review Comment:
This is not actually used at the moment?
##########
python/pyarrow/types.pxi:
##########
@@ -4543,6 +4623,100 @@ def run_end_encoded(run_end_type, value_type):
return pyarrow_wrap_data_type(ree_type)
+def fixed_shape_tensor(DataType value_type, shape, dim_names=None,
permutation=None):
+ """
+ Create instance of fixed shape tensor extension type with shape and
optional
+ names of tensor dimensions and indices of the desired ordering.
+
+ Parameters
+ ----------
+ value_type : DataType
+ Data type of individual tensor elements.
+ shape : tuple or list of integers
+ The physical shape of the contained tensors.
+ dim_names : tuple or list of strings, default None
+ Explicit names to tensor dimensions.
+ permutation : tuple or list integers, default None
+ Indices of the desired ordering of the original dimensions.
Review Comment:
I would expand this explanation a bit (I think it's fine to copy part of the
explanation of this parameter from the spec).
##########
python/pyarrow/public-api.pxi:
##########
@@ -118,6 +119,8 @@ cdef api object pyarrow_wrap_data_type(
cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type)
if cpy_ext_type != nullptr:
return cpy_ext_type.GetInstance()
+ elif ext_type.extension_name() == tensor_name:
Review Comment:
Just wondering, does it also work to just do `.. ==
b"arrow.fixed_shape_tensor"` here? That would avoid having to define the name
earlier in the code, which would be a bit easier to read.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]