pitrou commented on code in PR #37533: URL: https://github.com/apache/arrow/pull/37533#discussion_r1463307867
########## cpp/src/arrow/extension/fixed_shape_tensor.h: ########## @@ -94,6 +94,15 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// Create a FixedShapeTensorArray from ArrayData std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override; + /// \brief Create a Tensor from a Scalar value from a FixedShapeTensorArray + /// + /// This method will return a Tensor from FixedShapeTensorArray with strides Review Comment: It takes an ExtensionScalar, not a FixedShapeTensorArray? ########## cpp/src/arrow/extension/fixed_shape_tensor.cc: ########## @@ -203,6 +205,48 @@ std::shared_ptr<Array> FixedShapeTensorType::MakeArray( return std::make_shared<ExtensionArray>(data); } +Result<std::shared_ptr<Tensor>> FixedShapeTensorType::MakeTensor( + const std::shared_ptr<ExtensionScalar>& scalar) { + const auto ext_scalar = internal::checked_pointer_cast<ExtensionScalar>(scalar); + const auto ext_type = + internal::checked_pointer_cast<FixedShapeTensorType>(scalar->type); + if (!is_fixed_width(*ext_type->value_type())) { + return Status::TypeError("Cannot convert non-fixed-width values to Tensor."); + } + const auto array = + internal::checked_pointer_cast<const FixedSizeListScalar>(ext_scalar->value)->value; + if (array->null_count() > 0) { + return Status::Invalid("Cannot convert data with nulls to Tensor."); + } + const auto value_type = + internal::checked_pointer_cast<FixedWidthType>(ext_type->value_type()); + const auto byte_width = value_type->byte_width(); + + std::vector<int64_t> permutation = ext_type->permutation(); + if (permutation.empty()) { + permutation.resize(ext_type->ndim()); + std::iota(permutation.begin(), permutation.end(), 0); + } + + std::vector<int64_t> shape = ext_type->shape(); + internal::Permute<int64_t>(permutation, &shape); + + std::vector<std::string> dim_names = ext_type->dim_names(); + if (!dim_names.empty()) { + internal::Permute<std::string>(permutation, &dim_names); + } + + std::vector<int64_t> strides; + ARROW_CHECK_OK(ComputeStrides(*value_type.get(), shape, permutation, &strides)); Review Comment: `ARROW_CHECK_OK` will abort on error, can you please instead propagate it using `RETURN_NOT_OK`? ########## cpp/src/arrow/extension/fixed_shape_tensor.cc: ########## @@ -293,53 +337,83 @@ const Result<std::shared_ptr<Tensor>> FixedShapeTensorArray::ToTensor() const { // To convert an array of n dimensional tensors to a n+1 dimensional tensor we // interpret the array's length as the first dimension the new tensor. - auto ext_arr = std::static_pointer_cast<FixedSizeListArray>(this->storage()); - auto ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(this->type()); - ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()), - Status::Invalid(ext_arr->value_type()->ToString(), - " is not valid data type for a tensor")); - auto permutation = ext_type->permutation(); + const auto ext_type = + internal::checked_pointer_cast<FixedShapeTensorType>(this->type()); + const auto value_type = ext_type->value_type(); + ARROW_RETURN_IF( + !is_fixed_width(*value_type), + Status::TypeError(value_type->ToString(), " is not valid data type for a tensor")); - std::vector<std::string> dim_names; - if (!ext_type->dim_names().empty()) { - for (auto i : permutation) { - dim_names.emplace_back(ext_type->dim_names()[i]); + std::vector<int64_t> permutation = ext_type->permutation(); + if (permutation.empty()) { + for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) { + permutation.emplace_back(i); } - dim_names.insert(dim_names.begin(), 1, ""); - } else { - dim_names = {}; } + for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) { + permutation[i] += 1; + } + permutation.insert(permutation.begin(), 1, 0); - std::vector<int64_t> shape; - for (int64_t& i : permutation) { - shape.emplace_back(ext_type->shape()[i]); - ++i; + std::vector<std::string> dim_names = ext_type->dim_names(); + if (!dim_names.empty()) { + dim_names.insert(dim_names.begin(), 1, ""); + internal::Permute<std::string>(permutation, &dim_names); } + + std::vector<int64_t> shape = ext_type->shape(); + auto cell_size = std::accumulate(shape.begin(), shape.end(), static_cast<int64_t>(1), + std::multiplies<>()); shape.insert(shape.begin(), 1, this->length()); - permutation.insert(permutation.begin(), 1, 0); + internal::Permute<int64_t>(permutation, &shape); std::vector<int64_t> tensor_strides; - auto value_type = internal::checked_pointer_cast<FixedWidthType>(ext_arr->value_type()); + const auto fw_value_type = internal::checked_pointer_cast<FixedWidthType>(value_type); ARROW_RETURN_NOT_OK( - ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides)); - ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten()); + ComputeStrides(*fw_value_type.get(), shape, permutation, &tensor_strides)); + + const auto raw_buffer = this->storage()->data()->child_data[0]->buffers[1]; ARROW_ASSIGN_OR_RAISE( - auto tensor, Tensor::Make(ext_arr->value_type(), buffers->data()->buffers[1], shape, - tensor_strides, dim_names)); - return tensor; + const auto buffer, + SliceBufferSafe(raw_buffer, this->offset() * cell_size * value_type->byte_width())); + + return Tensor::Make(value_type, buffer, shape, tensor_strides, dim_names); } Result<std::shared_ptr<DataType>> FixedShapeTensorType::Make( const std::shared_ptr<DataType>& value_type, const std::vector<int64_t>& shape, const std::vector<int64_t>& permutation, const std::vector<std::string>& dim_names) { - if (!permutation.empty() && shape.size() != permutation.size()) { - return Status::Invalid("permutation size must match shape size. Expected: ", - shape.size(), " Got: ", permutation.size()); + const auto ndim = shape.size(); + if (!permutation.empty() && ndim != permutation.size()) { + return Status::Invalid("permutation size must match shape size. Expected: ", ndim, + " Got: ", permutation.size()); } - if (!dim_names.empty() && shape.size() != dim_names.size()) { - return Status::Invalid("dim_names size must match shape size. Expected: ", - shape.size(), " Got: ", dim_names.size()); + if (!dim_names.empty() && ndim != dim_names.size()) { + return Status::Invalid("dim_names size must match shape size. Expected: ", ndim, + " Got: ", dim_names.size()); + } + if (!permutation.empty()) { + std::vector<int64_t> sorted_permutation = permutation; + std::sort(sorted_permutation.begin(), sorted_permutation.end()); Review Comment: Thanks for the update here. Two things: 1. this could probably be an internal helper function somewhere, since validation a permutation may pop up elsewhere? 2. there's no need to sort, you could simply check that the permutation is both injective and bijective. For example (untested): ```c++ std::vector<uint8_t> dim_seen(size, 0); for (const auto p : permutation) { if (p < 0 || p >= static_cast<int64_t>(size) || dim_seen[p] != 0) { return Status::Invalid("Invalid permutation: (some explanation)"); } dim_seen[p] = 1; } ``` ########## cpp/src/arrow/extension/fixed_shape_tensor.cc: ########## @@ -293,53 +337,83 @@ const Result<std::shared_ptr<Tensor>> FixedShapeTensorArray::ToTensor() const { // To convert an array of n dimensional tensors to a n+1 dimensional tensor we // interpret the array's length as the first dimension the new tensor. - auto ext_arr = std::static_pointer_cast<FixedSizeListArray>(this->storage()); - auto ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(this->type()); - ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()), - Status::Invalid(ext_arr->value_type()->ToString(), - " is not valid data type for a tensor")); - auto permutation = ext_type->permutation(); + const auto ext_type = + internal::checked_pointer_cast<FixedShapeTensorType>(this->type()); + const auto value_type = ext_type->value_type(); + ARROW_RETURN_IF( + !is_fixed_width(*value_type), + Status::TypeError(value_type->ToString(), " is not valid data type for a tensor")); - std::vector<std::string> dim_names; - if (!ext_type->dim_names().empty()) { - for (auto i : permutation) { - dim_names.emplace_back(ext_type->dim_names()[i]); + std::vector<int64_t> permutation = ext_type->permutation(); + if (permutation.empty()) { + for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) { + permutation.emplace_back(i); } - dim_names.insert(dim_names.begin(), 1, ""); - } else { - dim_names = {}; } + for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) { Review Comment: This is going to be executed also if permutation is empty, can you instead give it the right values directly? ########## cpp/src/arrow/extension/fixed_shape_tensor_test.cc: ########## @@ -308,19 +319,89 @@ TEST_F(TestExtensionType, TestFromTensorType) { auto dim_names = std::vector<std::vector<std::string>>{ {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"y", "z"}, {"y", "z"}, {"y", "z"}, {"y", "z"}}; - auto cell_shapes = std::vector<std::vector<int64_t>>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; + auto element_shapes = std::vector<std::vector<int64_t>>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; auto permutations = std::vector<std::vector<int64_t>>{{0, 1}, {1, 0}, {0, 1}, {1, 0}}; for (size_t i = 0; i < shapes.size(); i++) { ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, values, shapes[i], strides[i], tensor_dim_names[i])); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); auto ext_type = - fixed_shape_tensor(value_type_, cell_shapes[i], permutations[i], dim_names[i]); + fixed_shape_tensor(value_type_, element_shapes[i], permutations[i], dim_names[i]); CheckFromTensorType(tensor, ext_type); } } +template <typename T> +void CheckToTensor(const std::vector<T>& values, const std::shared_ptr<DataType> typ, + const int32_t& element_size, const std::vector<int64_t>& element_shape, + const std::vector<int64_t>& element_permutation, + const std::vector<std::string>& element_dim_names, + const std::vector<int64_t>& tensor_shape, + const std::vector<std::string>& tensor_dim_names, + const std::vector<int64_t>& tensor_strides) { + auto buffer = Buffer::Wrap(values); + const std::shared_ptr<DataType> element_type = fixed_size_list(typ, element_size); + std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, buffer}; + auto arr_data = std::make_shared<ArrayData>(typ, values.size(), buffers); + auto arr = std::make_shared<Int64Array>(arr_data); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type)); + + ASSERT_OK_AND_ASSIGN( + auto expected_tensor, + Tensor::Make(typ, buffer, tensor_shape, tensor_strides, tensor_dim_names)); + const auto ext_type = + fixed_shape_tensor(typ, element_shape, element_permutation, element_dim_names); + + auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); + const auto tensor_array = std::static_pointer_cast<FixedShapeTensorArray>(ext_arr); + ASSERT_OK_AND_ASSIGN(const auto actual_tensor, tensor_array->ToTensor()); + + ASSERT_EQ(actual_tensor->type(), expected_tensor->type()); + ASSERT_EQ(actual_tensor->shape(), expected_tensor->shape()); + ASSERT_EQ(actual_tensor->strides(), expected_tensor->strides()); + ASSERT_EQ(actual_tensor->dim_names(), expected_tensor->dim_names()); + ASSERT_TRUE(actual_tensor->data()->Equals(*expected_tensor->data())); + ASSERT_TRUE(actual_tensor->Equals(*expected_tensor)); +} + +TEST_F(TestExtensionType, ToTensor) { + std::vector<float> values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; + + auto element_sizes = std::vector<int32_t>{6, 6, 18, 18, 18, 18}; + + auto element_shapes = std::vector<std::vector<int64_t>>{{2, 3}, {3, 2}, {3, 6}, + {6, 3}, {3, 2, 3}, {3, 2, 3}}; + auto tensor_shapes = std::vector<std::vector<int64_t>>{ + {6, 2, 3}, {6, 2, 3}, {2, 3, 6}, {2, 3, 6}, {2, 3, 2, 3}, {2, 3, 2, 3}}; + + auto element_permutations = std::vector<std::vector<int64_t>>{ + {0, 1}, {1, 0}, {0, 1}, {1, 0}, {0, 1, 2}, {2, 1, 0}}; + auto tensor_strides_32 = + std::vector<std::vector<int64_t>>{{24, 12, 4}, {24, 4, 8}, {72, 24, 4}, + {72, 4, 12}, {72, 24, 12, 4}, {72, 4, 12, 24}}; + auto tensor_strides_64 = + std::vector<std::vector<int64_t>>{{48, 24, 8}, {48, 8, 16}, {144, 48, 8}, + {144, 8, 24}, {144, 48, 24, 8}, {144, 8, 24, 48}}; + + auto element_dim_names = std::vector<std::vector<std::string>>{ + {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"H", "W", "C"}, {"H", "W", "C"}}; + auto tensor_dim_names = std::vector<std::vector<std::string>>{ + {"", "y", "z"}, {"", "y", "z"}, {"", "y", "z"}, + {"", "y", "z"}, {"", "H", "W", "C"}, {"", "C", "W", "H"}}; + + for (size_t i = 0; i < element_shapes.size(); i++) { + CheckToTensor<float_t>(values, float32(), element_sizes[i], element_shapes[i], + element_permutations[i], element_dim_names[i], + tensor_shapes[i], tensor_dim_names[i], tensor_strides_32[i]); + CheckToTensor(values_, int64(), element_sizes[i], element_shapes[i], Review Comment: Readability: can we avoid having both `values` and `values_`? Also, why are you explicitly specifying `float_t` above? ########## cpp/src/arrow/extension/fixed_shape_tensor_test.cc: ########## @@ -462,4 +543,121 @@ TEST_F(TestExtensionType, ToString) { ASSERT_EQ(expected_3, result_3); } +TEST_F(TestExtensionType, GetScalar) { Review Comment: Is this test exercising anything specific to fixed_shape_tensor? `GetScalar` for extension types presumably already works, not sure this is worth testing again here. ########## cpp/src/arrow/extension/fixed_shape_tensor_test.cc: ########## @@ -308,19 +319,89 @@ TEST_F(TestExtensionType, TestFromTensorType) { auto dim_names = std::vector<std::vector<std::string>>{ {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"y", "z"}, {"y", "z"}, {"y", "z"}, {"y", "z"}}; - auto cell_shapes = std::vector<std::vector<int64_t>>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; + auto element_shapes = std::vector<std::vector<int64_t>>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; auto permutations = std::vector<std::vector<int64_t>>{{0, 1}, {1, 0}, {0, 1}, {1, 0}}; for (size_t i = 0; i < shapes.size(); i++) { ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, values, shapes[i], strides[i], tensor_dim_names[i])); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); auto ext_type = - fixed_shape_tensor(value_type_, cell_shapes[i], permutations[i], dim_names[i]); + fixed_shape_tensor(value_type_, element_shapes[i], permutations[i], dim_names[i]); CheckFromTensorType(tensor, ext_type); } } +template <typename T> +void CheckToTensor(const std::vector<T>& values, const std::shared_ptr<DataType> typ, + const int32_t& element_size, const std::vector<int64_t>& element_shape, + const std::vector<int64_t>& element_permutation, + const std::vector<std::string>& element_dim_names, + const std::vector<int64_t>& tensor_shape, + const std::vector<std::string>& tensor_dim_names, + const std::vector<int64_t>& tensor_strides) { + auto buffer = Buffer::Wrap(values); + const std::shared_ptr<DataType> element_type = fixed_size_list(typ, element_size); + std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, buffer}; + auto arr_data = std::make_shared<ArrayData>(typ, values.size(), buffers); + auto arr = std::make_shared<Int64Array>(arr_data); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type)); + + ASSERT_OK_AND_ASSIGN( + auto expected_tensor, + Tensor::Make(typ, buffer, tensor_shape, tensor_strides, tensor_dim_names)); + const auto ext_type = + fixed_shape_tensor(typ, element_shape, element_permutation, element_dim_names); + + auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); + const auto tensor_array = std::static_pointer_cast<FixedShapeTensorArray>(ext_arr); + ASSERT_OK_AND_ASSIGN(const auto actual_tensor, tensor_array->ToTensor()); Review Comment: Could you perhaps add validation here? Something like: ```suggestion ASSERT_OK_AND_ASSIGN(const auto actual_tensor, tensor_array->ToTensor()); ASSERT_OK(actual_tensor->Validate()); ``` ########## python/pyarrow/array.pxi: ########## @@ -3573,17 +3602,19 @@ class FixedShapeTensorArray(ExtensionArray): ] ] """ - if not obj.flags["C_CONTIGUOUS"]: - raise ValueError('The data in the numpy array need to be in a single, ' - 'C-style contiguous segment.') + + permutation = (-np.array(obj.strides)).argsort(kind='stable') + if permutation[0] != 0: + raise ValueError('First stride needs to be largest to ensure that ' + 'individual tensor data is contiguous in memory.') arrow_type = from_numpy_dtype(obj.dtype) - shape = obj.shape[1:] - size = obj.size / obj.shape[0] + shape = np.take(obj.shape, permutation) + values = np.ravel(obj, order="K") Review Comment: Note that `ravel` can make a copy of the input if it is not contiguous, is this something that we want? ########## python/pyarrow/scalar.pxi: ########## @@ -1027,6 +1027,32 @@ cdef class ExtensionScalar(Scalar): return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar) +cdef class FixedShapeTensorScalar(ExtensionScalar): + """ + Concrete class for fixed shape tensor extension scalar. + """ + + def to_numpy_ndarray(self): Review Comment: Well, the `Scalar` base class doesn't have a `to_numpy` method... ########## cpp/src/arrow/extension/fixed_shape_tensor.cc: ########## @@ -293,40 +335,49 @@ const Result<std::shared_ptr<Tensor>> FixedShapeTensorArray::ToTensor() const { // To convert an array of n dimensional tensors to a n+1 dimensional tensor we // interpret the array's length as the first dimension the new tensor. - auto ext_arr = std::static_pointer_cast<FixedSizeListArray>(this->storage()); - auto ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(this->type()); - ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()), - Status::Invalid(ext_arr->value_type()->ToString(), - " is not valid data type for a tensor")); - auto permutation = ext_type->permutation(); + const auto ext_type = + internal::checked_pointer_cast<FixedShapeTensorType>(this->type()); + const auto value_type = ext_type->value_type(); + ARROW_RETURN_IF( + !is_fixed_width(*value_type), + Status::Invalid(value_type->ToString(), " is not valid data type for a tensor")); - std::vector<std::string> dim_names; - if (!ext_type->dim_names().empty()) { - for (auto i : permutation) { - dim_names.emplace_back(ext_type->dim_names()[i]); + std::vector<int64_t> permutation = ext_type->permutation(); + if (permutation.empty()) { + for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) { + permutation.emplace_back(i); } - dim_names.insert(dim_names.begin(), 1, ""); - } else { - dim_names = {}; } + for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) { + permutation[i] += 1; Review Comment: I see, can you add a quick explanation as a comment? ########## cpp/src/arrow/extension/fixed_shape_tensor.cc: ########## @@ -293,53 +337,83 @@ const Result<std::shared_ptr<Tensor>> FixedShapeTensorArray::ToTensor() const { // To convert an array of n dimensional tensors to a n+1 dimensional tensor we // interpret the array's length as the first dimension the new tensor. - auto ext_arr = std::static_pointer_cast<FixedSizeListArray>(this->storage()); - auto ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(this->type()); - ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()), - Status::Invalid(ext_arr->value_type()->ToString(), - " is not valid data type for a tensor")); - auto permutation = ext_type->permutation(); + const auto ext_type = + internal::checked_pointer_cast<FixedShapeTensorType>(this->type()); + const auto value_type = ext_type->value_type(); + ARROW_RETURN_IF( + !is_fixed_width(*value_type), + Status::TypeError(value_type->ToString(), " is not valid data type for a tensor")); - std::vector<std::string> dim_names; - if (!ext_type->dim_names().empty()) { - for (auto i : permutation) { - dim_names.emplace_back(ext_type->dim_names()[i]); + std::vector<int64_t> permutation = ext_type->permutation(); + if (permutation.empty()) { + for (int64_t i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) { Review Comment: This is `std::iota` like above, right? ########## cpp/src/arrow/extension/fixed_shape_tensor_test.cc: ########## @@ -462,4 +543,121 @@ TEST_F(TestExtensionType, ToString) { ASSERT_EQ(expected_3, result_3); } +TEST_F(TestExtensionType, GetScalar) { + auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); + + auto expected_data = + ArrayFromJSON(element_type_, "[[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]"); + auto storage_array = ArrayFromJSON(element_type_, + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]," + "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]"); + + auto sub_array = ExtensionType::WrapArray(ext_type, expected_data); + auto array = ExtensionType::WrapArray(ext_type, storage_array); + + ASSERT_OK_AND_ASSIGN(auto expected_scalar, sub_array->GetScalar(0)); + ASSERT_OK_AND_ASSIGN(auto actual_scalar, array->GetScalar(1)); + + ASSERT_OK(actual_scalar->ValidateFull()); + ASSERT_TRUE(actual_scalar->type->Equals(*ext_type)); + ASSERT_TRUE(actual_scalar->is_valid); + + ASSERT_OK(expected_scalar->ValidateFull()); + ASSERT_TRUE(expected_scalar->type->Equals(*ext_type)); + ASSERT_TRUE(expected_scalar->is_valid); + + AssertTypeEqual(actual_scalar->type, ext_type); + ASSERT_TRUE(actual_scalar->Equals(*expected_scalar)); +} + +TEST_F(TestExtensionType, GetTensor) { + auto arr = ArrayFromJSON(element_type_, + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]," + "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]"); + auto element_values = + std::vector<std::vector<int64_t>>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}}; + + auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); + auto permuted_ext_type = fixed_shape_tensor(value_type_, {3, 4}, {1, 0}, {"x", "y"}); + auto exact_ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(ext_type); + auto exact_permuted_ext_type = + internal::checked_pointer_cast<FixedShapeTensorType>(permuted_ext_type); + + auto array = std::static_pointer_cast<FixedShapeTensorArray>( + ExtensionType::WrapArray(ext_type, arr)); + auto permuted_array = std::static_pointer_cast<FixedShapeTensorArray>( + ExtensionType::WrapArray(permuted_ext_type, arr)); + + for (size_t i = 0; i < element_values.size(); i++) { + // Get tensor from extension array with trivial permutation + ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(i)); + auto actual_ext_scalar = internal::checked_pointer_cast<ExtensionScalar>(scalar); + ASSERT_OK_AND_ASSIGN(auto actual_tensor, + exact_ext_type->MakeTensor(actual_ext_scalar)); + ASSERT_OK_AND_ASSIGN(auto expected_tensor, Review Comment: Can you validate the actual tensore here and below as well? ########## python/pyarrow/array.pxi: ########## @@ -3573,17 +3602,19 @@ class FixedShapeTensorArray(ExtensionArray): ] ] """ - if not obj.flags["C_CONTIGUOUS"]: - raise ValueError('The data in the numpy array need to be in a single, ' - 'C-style contiguous segment.') + + permutation = (-np.array(obj.strides)).argsort(kind='stable') + if permutation[0] != 0: + raise ValueError('First stride needs to be largest to ensure that ' + 'individual tensor data is contiguous in memory.') arrow_type = from_numpy_dtype(obj.dtype) - shape = obj.shape[1:] - size = obj.size / obj.shape[0] + shape = np.take(obj.shape, permutation) + values = np.ravel(obj, order="K") return ExtensionArray.from_storage( - fixed_shape_tensor(arrow_type, shape), - FixedSizeListArray.from_arrays(np.ravel(obj, order='C'), size) + fixed_shape_tensor(arrow_type, shape[1:], permutation=permutation[1:] - 1), + FixedSizeListArray.from_arrays(values, obj[0].size) Review Comment: Note that `obj[0].size` will fail if `obj` is a 0-element array. Instead, you could use `shape[1:].prod()`. ########## cpp/src/arrow/extension/fixed_shape_tensor_test.cc: ########## @@ -462,4 +543,121 @@ TEST_F(TestExtensionType, ToString) { ASSERT_EQ(expected_3, result_3); } +TEST_F(TestExtensionType, GetScalar) { + auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); + + auto expected_data = + ArrayFromJSON(element_type_, "[[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]"); + auto storage_array = ArrayFromJSON(element_type_, + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]," + "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]"); + + auto sub_array = ExtensionType::WrapArray(ext_type, expected_data); + auto array = ExtensionType::WrapArray(ext_type, storage_array); + + ASSERT_OK_AND_ASSIGN(auto expected_scalar, sub_array->GetScalar(0)); + ASSERT_OK_AND_ASSIGN(auto actual_scalar, array->GetScalar(1)); + + ASSERT_OK(actual_scalar->ValidateFull()); + ASSERT_TRUE(actual_scalar->type->Equals(*ext_type)); + ASSERT_TRUE(actual_scalar->is_valid); + + ASSERT_OK(expected_scalar->ValidateFull()); + ASSERT_TRUE(expected_scalar->type->Equals(*ext_type)); + ASSERT_TRUE(expected_scalar->is_valid); + + AssertTypeEqual(actual_scalar->type, ext_type); + ASSERT_TRUE(actual_scalar->Equals(*expected_scalar)); +} + +TEST_F(TestExtensionType, GetTensor) { + auto arr = ArrayFromJSON(element_type_, + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]," + "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]"); + auto element_values = + std::vector<std::vector<int64_t>>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}}; + + auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); + auto permuted_ext_type = fixed_shape_tensor(value_type_, {3, 4}, {1, 0}, {"x", "y"}); + auto exact_ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(ext_type); + auto exact_permuted_ext_type = + internal::checked_pointer_cast<FixedShapeTensorType>(permuted_ext_type); + + auto array = std::static_pointer_cast<FixedShapeTensorArray>( + ExtensionType::WrapArray(ext_type, arr)); + auto permuted_array = std::static_pointer_cast<FixedShapeTensorArray>( + ExtensionType::WrapArray(permuted_ext_type, arr)); + + for (size_t i = 0; i < element_values.size(); i++) { + // Get tensor from extension array with trivial permutation + ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(i)); + auto actual_ext_scalar = internal::checked_pointer_cast<ExtensionScalar>(scalar); + ASSERT_OK_AND_ASSIGN(auto actual_tensor, + exact_ext_type->MakeTensor(actual_ext_scalar)); + ASSERT_OK_AND_ASSIGN(auto expected_tensor, + Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), + {3, 4}, {}, {"x", "y"})); + ASSERT_EQ(expected_tensor->shape(), actual_tensor->shape()); + ASSERT_EQ(expected_tensor->dim_names(), actual_tensor->dim_names()); + ASSERT_EQ(expected_tensor->strides(), actual_tensor->strides()); + ASSERT_EQ(actual_tensor->strides(), std::vector<int64_t>({32, 8})); + ASSERT_EQ(expected_tensor->type(), actual_tensor->type()); + ASSERT_TRUE(expected_tensor->Equals(*actual_tensor)); + + // Get tensor from extension array with non-trivial permutation + ASSERT_OK_AND_ASSIGN(auto expected_permuted_tensor, + Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), + {4, 3}, {8, 24}, {"y", "x"})); + ASSERT_OK_AND_ASSIGN(scalar, permuted_array->GetScalar(i)); + ASSERT_OK_AND_ASSIGN(auto actual_permuted_tensor, + exact_permuted_ext_type->MakeTensor( + internal::checked_pointer_cast<ExtensionScalar>(scalar))); + ASSERT_EQ(expected_permuted_tensor->strides(), actual_permuted_tensor->strides()); + ASSERT_EQ(expected_permuted_tensor->shape(), actual_permuted_tensor->shape()); + ASSERT_EQ(expected_permuted_tensor->dim_names(), actual_permuted_tensor->dim_names()); + ASSERT_EQ(expected_permuted_tensor->type(), actual_permuted_tensor->type()); + ASSERT_EQ(expected_permuted_tensor->is_contiguous(), + actual_permuted_tensor->is_contiguous()); + ASSERT_EQ(expected_permuted_tensor->is_column_major(), + actual_permuted_tensor->is_column_major()); + ASSERT_TRUE(expected_permuted_tensor->Equals(*actual_permuted_tensor)); + } + + // Test null values fail + auto element_type = fixed_size_list(int64(), 1); + auto fsla_arr = ArrayFromJSON(element_type, "[[1], [null], null]"); + ext_type = fixed_shape_tensor(int64(), {1}); + exact_ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(ext_type); + auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); + auto tensor_array = std::static_pointer_cast<FixedShapeTensorArray>(ext_arr); Review Comment: checked_pointer_cast perhaps? ########## python/pyarrow/tests/test_extension_type.py: ########## @@ -1318,39 +1318,101 @@ def test_tensor_type(): assert tensor_type.permutation is None -def test_tensor_class_methods(): - tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3]) - storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]], - pa.list_(pa.float32(), 6)) +@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32())) +def test_tensor_class_methods(value_type): + from numpy.lib.stride_tricks import as_strided + arrow_type = pa.from_numpy_dtype(value_type) + + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 3]) + storage = pa.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + pa.list_(arrow_type, 6)) arr = pa.ExtensionArray.from_storage(tensor_type, storage) expected = np.array( - [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32) - result = arr.to_numpy_ndarray() + [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=value_type) + np.testing.assert_array_equal(arr.to_tensor(), expected) + np.testing.assert_array_equal(arr.to_numpy_ndarray(), expected) + + expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=value_type) + result = arr[1:].to_numpy_ndarray() np.testing.assert_array_equal(result, expected) - expected = np.array([[[1, 2, 3], [4, 5, 6]]], dtype=np.float32) - result = arr[:1].to_numpy_ndarray() + values = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]] + flat_arr = np.array(values[0], dtype=value_type) + bw = value_type.itemsize + storage = pa.array(values, pa.list_(arrow_type, 12)) + + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 1, 2]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = np.array( + [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], dtype=value_type) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + result = flat_arr.reshape(1, 2, 3, 2) + expected = np.array( + [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type) np.testing.assert_array_equal(result, expected) - arr = np.array( - [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], - dtype=np.float32, order="C") + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 2, 1]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = as_strided(flat_arr, shape=(1, 2, 3, 2), + strides=(bw * 12, bw * 6, bw, bw * 3)) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[2, 0, 1]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = as_strided(flat_arr, shape=(1, 3, 2, 2), + strides=(bw * 12, bw, bw * 6, bw * 2)) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + assert result.type.permutation == [2, 0, 1] + assert result.type.shape == [2, 2, 3] + assert result.to_tensor().shape == (1, 3, 2, 2) + assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw) + + +@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32())) +def test_tensor_array_from_numpy(value_type): + from numpy.lib.stride_tricks import as_strided + arrow_type = pa.from_numpy_dtype(value_type) + + arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], + dtype=value_type, order="C") tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType) - assert tensor_array_from_numpy.type.value_type == pa.float32() + assert tensor_array_from_numpy.type.value_type == arrow_type assert tensor_array_from_numpy.type.shape == [2, 3] - arr = np.array( - [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], - dtype=np.float32, order="F") - with pytest.raises(ValueError, match="C-style contiguous segment"): + arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], + dtype=value_type, order="F") + with pytest.raises(ValueError, match="First stride needs to be largest"): pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]) - storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12)) - arr = pa.ExtensionArray.from_storage(tensor_type, storage) - with pytest.raises(ValueError, match="non-permuted tensors"): - arr.to_numpy_ndarray() + flat_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type) + bw = value_type.itemsize + + arr = flat_arr.reshape(1, 3, 4) + tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + assert tensor_array_from_numpy.type.shape == [3, 4] + assert tensor_array_from_numpy.type.permutation == [0, 1] + assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) + + arr = as_strided(flat_arr, shape=(1, 2, 3, 2), + strides=(bw * 12, bw * 6, bw, bw * 3)) + tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + assert tensor_array_from_numpy.type.shape == [2, 2, 3] + assert tensor_array_from_numpy.type.permutation == [0, 2, 1] + assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) + + arr = flat_arr.reshape(1, 2, 3, 2) + result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + expected = np.array( + [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], dtype=value_type) + expected = arr[1:] + result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)[1:].to_numpy_ndarray() + np.testing.assert_array_equal(result, expected) Review Comment: Can we add a few interesting cases? * 1-dimensional ndarray * 0-dimensional ndarray (should raise an error?) * n-dimensional ndarray with shape (0, x, y) * n-dimensional ndarray with shape (x, 0, y) ########## python/pyarrow/array.pxi: ########## @@ -3519,17 +3519,42 @@ class FixedShapeTensorArray(ExtensionArray): def to_numpy_ndarray(self): Review Comment: I really think we should change `to_numpy` in this case. The current behavior does not seem useful. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org