Re: [PR] GH-37484: [Python] Add a FixedSizeTensorScalar class [arrow]

via GitHub Wed, 07 Feb 2024 02:42:50 -0800


pitrou commented on code in PR #37533:
URL: https://github.com/apache/arrow/pull/37533#discussion_r1481255596



##########
cpp/src/arrow/extension/fixed_shape_tensor.cc:
##########
@@ -199,10 +201,52 @@ std::shared_ptr<Array> FixedShapeTensorType::MakeArray(
     std::shared_ptr<ArrayData> data) const {
   DCHECK_EQ(data->type->id(), Type::EXTENSION);
   DCHECK_EQ("arrow.fixed_shape_tensor",
-            static_cast<const ExtensionType&>(*data->type).extension_name());
+            dynamic_cast<const ExtensionType&>(*data->type).extension_name());

Review Comment:
   Same question, I suppose the answer will be similar :-)



##########
cpp/src/arrow/extension/fixed_shape_tensor.cc:
##########
@@ -18,7 +18,9 @@
 #include <numeric>
 #include <sstream>
 
+#include <arrow/scalar.h>

Review Comment:
   Can we avoid the gratuitous style inconsistency?
   ```suggestion
   #include "arrow/scalar.h"
   ```
   
   (will need to reformat after this)



##########
cpp/src/arrow/extension/fixed_shape_tensor_test.cc:
##########
@@ -462,4 +543,121 @@ TEST_F(TestExtensionType, ToString) {
   ASSERT_EQ(expected_3, result_3);
 }
 
+TEST_F(TestExtensionType, GetScalar) {
+  auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, 
dim_names_);
+
+  auto expected_data =
+      ArrayFromJSON(element_type_, "[[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 
22, 23]]");
+  auto storage_array = ArrayFromJSON(element_type_,
+                                     "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],"
+                                     "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 
22, 23]]");
+
+  auto sub_array = ExtensionType::WrapArray(ext_type, expected_data);
+  auto array = ExtensionType::WrapArray(ext_type, storage_array);
+
+  ASSERT_OK_AND_ASSIGN(auto expected_scalar, sub_array->GetScalar(0));
+  ASSERT_OK_AND_ASSIGN(auto actual_scalar, array->GetScalar(1));
+
+  ASSERT_OK(actual_scalar->ValidateFull());
+  ASSERT_TRUE(actual_scalar->type->Equals(*ext_type));
+  ASSERT_TRUE(actual_scalar->is_valid);
+
+  ASSERT_OK(expected_scalar->ValidateFull());
+  ASSERT_TRUE(expected_scalar->type->Equals(*ext_type));
+  ASSERT_TRUE(expected_scalar->is_valid);
+
+  AssertTypeEqual(actual_scalar->type, ext_type);
+  ASSERT_TRUE(actual_scalar->Equals(*expected_scalar));
+}
+
+TEST_F(TestExtensionType, GetTensor) {
+  auto arr = ArrayFromJSON(element_type_,
+                           "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],"
+                           "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 
23]]");
+  auto element_values =
+      std::vector<std::vector<int64_t>>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+                                        {12, 13, 14, 15, 16, 17, 18, 19, 20, 
21, 22, 23}};
+
+  auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, 
dim_names_);
+  auto permuted_ext_type = fixed_shape_tensor(value_type_, {3, 4}, {1, 0}, 
{"x", "y"});
+  auto exact_ext_type = 
internal::checked_pointer_cast<FixedShapeTensorType>(ext_type);
+  auto exact_permuted_ext_type =
+      internal::checked_pointer_cast<FixedShapeTensorType>(permuted_ext_type);
+
+  auto array = std::static_pointer_cast<FixedShapeTensorArray>(
+      ExtensionType::WrapArray(ext_type, arr));
+  auto permuted_array = std::static_pointer_cast<FixedShapeTensorArray>(
+      ExtensionType::WrapArray(permuted_ext_type, arr));
+
+  for (size_t i = 0; i < element_values.size(); i++) {
+    // Get tensor from extension array with trivial permutation
+    ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(i));
+    auto actual_ext_scalar = 
internal::checked_pointer_cast<ExtensionScalar>(scalar);
+    ASSERT_OK_AND_ASSIGN(auto actual_tensor,
+                         exact_ext_type->MakeTensor(actual_ext_scalar));
+    ASSERT_OK_AND_ASSIGN(auto expected_tensor,
+                         Tensor::Make(value_type_, 
Buffer::Wrap(element_values[i]),
+                                      {3, 4}, {}, {"x", "y"}));
+    ASSERT_EQ(expected_tensor->shape(), actual_tensor->shape());
+    ASSERT_EQ(expected_tensor->dim_names(), actual_tensor->dim_names());
+    ASSERT_EQ(expected_tensor->strides(), actual_tensor->strides());
+    ASSERT_EQ(actual_tensor->strides(), std::vector<int64_t>({32, 8}));
+    ASSERT_EQ(expected_tensor->type(), actual_tensor->type());
+    ASSERT_TRUE(expected_tensor->Equals(*actual_tensor));
+
+    // Get tensor from extension array with non-trivial permutation
+    ASSERT_OK_AND_ASSIGN(auto expected_permuted_tensor,
+                         Tensor::Make(value_type_, 
Buffer::Wrap(element_values[i]),
+                                      {4, 3}, {8, 24}, {"y", "x"}));
+    ASSERT_OK_AND_ASSIGN(scalar, permuted_array->GetScalar(i));
+    ASSERT_OK_AND_ASSIGN(auto actual_permuted_tensor,
+                         exact_permuted_ext_type->MakeTensor(
+                             
internal::checked_pointer_cast<ExtensionScalar>(scalar)));
+    ASSERT_EQ(expected_permuted_tensor->strides(), 
actual_permuted_tensor->strides());
+    ASSERT_EQ(expected_permuted_tensor->shape(), 
actual_permuted_tensor->shape());
+    ASSERT_EQ(expected_permuted_tensor->dim_names(), 
actual_permuted_tensor->dim_names());
+    ASSERT_EQ(expected_permuted_tensor->type(), 
actual_permuted_tensor->type());
+    ASSERT_EQ(expected_permuted_tensor->is_contiguous(),
+              actual_permuted_tensor->is_contiguous());
+    ASSERT_EQ(expected_permuted_tensor->is_column_major(),
+              actual_permuted_tensor->is_column_major());
+    ASSERT_TRUE(expected_permuted_tensor->Equals(*actual_permuted_tensor));
+  }
+
+  // Test null values fail
+  auto element_type = fixed_size_list(int64(), 1);
+  auto fsla_arr = ArrayFromJSON(element_type, "[[1], [null], null]");
+  ext_type = fixed_shape_tensor(int64(), {1});
+  exact_ext_type = 
internal::checked_pointer_cast<FixedShapeTensorType>(ext_type);
+  auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr);
+  auto tensor_array = std::static_pointer_cast<FixedShapeTensorArray>(ext_arr);

Review Comment:
   Not sure if it's an issue either, but it would be worth stepping through 
with a debugger to understand exactly what happens :-)



##########
cpp/src/arrow/extension/fixed_shape_tensor.cc:
##########
@@ -86,7 +88,7 @@ bool FixedShapeTensorType::ExtensionEquals(const 
ExtensionType& other) const {
   if (extension_name() != other.extension_name()) {
     return false;
   }
-  const auto& other_ext = static_cast<const FixedShapeTensorType&>(other);
+  const auto& other_ext = dynamic_cast<const FixedShapeTensorType&>(other);

Review Comment:
   Did it fail otherwise? I'm a bit curious to know if the change to 
`dynamic_cast` was necessary.
   
   (ideally this should be a `checked_cast`: i.e. dynamically checked in debug 
mode, not in release mode)



##########
cpp/src/arrow/extension/tensor_internal.h:
##########
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/print.h"

Review Comment:
   Should also include the necessary standard and Arrow headers, e.g.:
   ```suggestion
   #include <cstdint>
   #include <vector>
   
   #include "arrow/status.h"
   #include "arrow/util/print.h"
   ```
   



##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1318,39 +1318,101 @@ def test_tensor_type():
     assert tensor_type.permutation is None
 
 
-def test_tensor_class_methods():
-    tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3])
-    storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]],
-                       pa.list_(pa.float32(), 6))
+@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32()))
+def test_tensor_class_methods(value_type):
+    from numpy.lib.stride_tricks import as_strided
+    arrow_type = pa.from_numpy_dtype(value_type)
+
+    tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 3])
+    storage = pa.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
+                       pa.list_(arrow_type, 6))
     arr = pa.ExtensionArray.from_storage(tensor_type, storage)
     expected = np.array(
-        [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32)
-    result = arr.to_numpy_ndarray()
+        [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=value_type)
+    np.testing.assert_array_equal(arr.to_tensor(), expected)
+    np.testing.assert_array_equal(arr.to_numpy_ndarray(), expected)
+
+    expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=value_type)
+    result = arr[1:].to_numpy_ndarray()
     np.testing.assert_array_equal(result, expected)
 
-    expected = np.array([[[1, 2, 3], [4, 5, 6]]], dtype=np.float32)
-    result = arr[:1].to_numpy_ndarray()
+    values = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]
+    flat_arr = np.array(values[0], dtype=value_type)
+    bw = value_type.itemsize
+    storage = pa.array(values, pa.list_(arrow_type, 12))
+
+    tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 
1, 2])
+    result = pa.ExtensionArray.from_storage(tensor_type, storage)
+    expected = np.array(
+        [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], 
dtype=value_type)
+    np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
+
+    result = flat_arr.reshape(1, 2, 3, 2)
+    expected = np.array(
+        [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], 
dtype=value_type)
     np.testing.assert_array_equal(result, expected)
 
-    arr = np.array(
-        [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
-        dtype=np.float32, order="C")
+    tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 
2, 1])
+    result = pa.ExtensionArray.from_storage(tensor_type, storage)
+    expected = as_strided(flat_arr, shape=(1, 2, 3, 2),
+                          strides=(bw * 12, bw * 6, bw, bw * 3))
+    np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
+
+    tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[2, 
0, 1])
+    result = pa.ExtensionArray.from_storage(tensor_type, storage)
+    expected = as_strided(flat_arr, shape=(1, 3, 2, 2),
+                          strides=(bw * 12, bw, bw * 6, bw * 2))
+    np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
+
+    assert result.type.permutation == [2, 0, 1]
+    assert result.type.shape == [2, 2, 3]
+    assert result.to_tensor().shape == (1, 3, 2, 2)
+    assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw)
+
+
+@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32()))
+def test_tensor_array_from_numpy(value_type):
+    from numpy.lib.stride_tricks import as_strided
+    arrow_type = pa.from_numpy_dtype(value_type)
+
+    arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+                   dtype=value_type, order="C")
     tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
     assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType)
-    assert tensor_array_from_numpy.type.value_type == pa.float32()
+    assert tensor_array_from_numpy.type.value_type == arrow_type
     assert tensor_array_from_numpy.type.shape == [2, 3]
 
-    arr = np.array(
-        [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
-        dtype=np.float32, order="F")
-    with pytest.raises(ValueError, match="C-style contiguous segment"):
+    arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
+                   dtype=value_type, order="F")
+    with pytest.raises(ValueError, match="First stride needs to be largest"):
         pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
 
-    tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 
2, 1])
-    storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], 
pa.list_(pa.int8(), 12))
-    arr = pa.ExtensionArray.from_storage(tensor_type, storage)
-    with pytest.raises(ValueError, match="non-permuted tensors"):
-        arr.to_numpy_ndarray()
+    flat_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 
dtype=value_type)
+    bw = value_type.itemsize
+
+    arr = flat_arr.reshape(1, 3, 4)
+    tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
+    assert tensor_array_from_numpy.type.shape == [3, 4]
+    assert tensor_array_from_numpy.type.permutation == [0, 1]
+    assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr)
+
+    arr = as_strided(flat_arr, shape=(1, 2, 3, 2),
+                     strides=(bw * 12, bw * 6, bw, bw * 3))
+    tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
+    assert tensor_array_from_numpy.type.shape == [2, 2, 3]
+    assert tensor_array_from_numpy.type.permutation == [0, 2, 1]
+    assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr)
+
+    arr = flat_arr.reshape(1, 2, 3, 2)
+    result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
+    expected = np.array(
+        [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], 
dtype=value_type)
+    np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
+
+    arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], 
dtype=value_type)
+    expected = arr[1:]
+    result = 
pa.FixedShapeTensorArray.from_numpy_ndarray(arr)[1:].to_numpy_ndarray()
+    np.testing.assert_array_equal(result, expected)

Review Comment:
   Thank you!



##########
python/pyarrow/array.pxi:
##########
@@ -3573,17 +3602,19 @@ class FixedShapeTensorArray(ExtensionArray):
           ]
         ]
         """
-        if not obj.flags["C_CONTIGUOUS"]:
-            raise ValueError('The data in the numpy array need to be in a 
single, '
-                             'C-style contiguous segment.')
+
+        permutation = (-np.array(obj.strides)).argsort(kind='stable')
+        if permutation[0] != 0:
+            raise ValueError('First stride needs to be largest to ensure that '
+                             'individual tensor data is contiguous in memory.')
 
         arrow_type = from_numpy_dtype(obj.dtype)
-        shape = obj.shape[1:]
-        size = obj.size / obj.shape[0]
+        shape = np.take(obj.shape, permutation)
+        values = np.ravel(obj, order="K")

Review Comment:
   Hmm, I wouldn't be so sure, example:
   ```python
   >>> obj = np.arange(24).reshape(2,3,4)[:,::2]
   >>> obj
   array([[[ 0,  1,  2,  3],
           [ 8,  9, 10, 11]],
   
          [[12, 13, 14, 15],
           [20, 21, 22, 23]]])
   >>> obj.strides
   (96, 64, 8)
   >>> permutation = (-np.array(obj.strides)).argsort(kind='stable')
   >>> permutation
   array([0, 1, 2])   # permutation is ok
   >>> values = np.ravel(obj, order="K")
   >>> values
   array([ 0,  1,  2,  3,  8,  9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23])
   >>> values[0] = 999
   >>> values
   array([999,   1,   2,   3,   8,   9,  10,  11,  12,  13,  14,  15,  20,
           21,  22,  23])
   >>> obj
   array([[[ 0,  1,  2,  3],
           [ 8,  9, 10, 11]],
   
          [[12, 13, 14, 15],
           [20, 21, 22, 23]]])
   
   # values is a copy! which we can also check using:
   >>> values.ctypes.data == obj.ctypes.data
   False   # base addresses are different
   ```
   
   Of course, we may not really care about this, since the conversion is 
probably correct anyway. But we may want to add a mention in the docstring that 
the conversion is only zero-copy if the input array is contiguous.



##########
cpp/src/arrow/extension/fixed_shape_tensor.cc:
##########
@@ -293,53 +337,71 @@ const Result<std::shared_ptr<Tensor>> 
FixedShapeTensorArray::ToTensor() const {
   // To convert an array of n dimensional tensors to a n+1 dimensional tensor 
we
   // interpret the array's length as the first dimension the new tensor.
 
-  auto ext_arr = std::static_pointer_cast<FixedSizeListArray>(this->storage());
-  auto ext_type = 
internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
-  ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()),
-                  Status::Invalid(ext_arr->value_type()->ToString(),
-                                  " is not valid data type for a tensor"));
-  auto permutation = ext_type->permutation();
-
-  std::vector<std::string> dim_names;
-  if (!ext_type->dim_names().empty()) {
-    for (auto i : permutation) {
-      dim_names.emplace_back(ext_type->dim_names()[i]);
-    }
-    dim_names.insert(dim_names.begin(), 1, "");
+  const auto ext_type =
+      internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
+  const auto value_type = ext_type->value_type();
+  ARROW_RETURN_IF(
+      !is_fixed_width(*value_type),
+      Status::TypeError(value_type->ToString(), " is not valid data type for a 
tensor"));
+
+  // ext_type->permutation() gives us permutation for a single row with values 
in
+  // range [0, ndim). Here want to create a ndim + 1 dimensional tensor from 
the entire
+  // array and we assume the first dimension will always have the greatest 
stride, so it
+  // will get permutation index 0 and remaining values from 
ext_type->permutation() need
+  // to be shifted to fill the [1, ndim+1) range. Computed permutation will be 
used to
+  // generate the new tensor's shape, strides and dim_names.

Review Comment:
   +1, thank you!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-37484: [Python] Add a FixedSizeTensorScalar class [arrow]

Reply via email to