pitrou commented on code in PR #47586:
URL: https://github.com/apache/arrow/pull/47586#discussion_r2410104271


##########
cpp/src/arrow/tensor/converter.h:
##########
@@ -20,6 +20,9 @@
 #include "arrow/sparse_tensor.h"  // IWYU pragma: export
 
 #include <memory>
+#include <utility>
+
+#include "arrow/visit_type_inline.h"

Review Comment:
   Please, let's avoid including this here as it will transitively include many other files.



##########
cpp/src/arrow/tensor/converter.h:
##########
@@ -63,5 +66,56 @@ Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSCMatrix(
 Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSFTensor(
     MemoryPool* pool, const SparseCSFTensor* sparse_tensor);
 
+template <typename Converter>
+struct ConverterVisitor {

Review Comment:
   Why not move all this into `converter_internal.h` instead?
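   For illustration, a rough sketch of that split (file layout, include set, and namespace placement here are my assumptions, not what the PR currently does): the visitor machinery and the heavy include would live in an internal header that only the converter `.cc` files include, keeping `converter.h` light.
   ```cpp
   // Hypothetical cpp/src/arrow/tensor/converter_internal.h (sketch only)
   #pragma once

   #include <utility>

   #include "arrow/status.h"
   #include "arrow/tensor/converter.h"
   #include "arrow/visit_type_inline.h"  // heavy transitive include stays out of converter.h

   namespace arrow {
   namespace internal {

   // Adapter forwarding the visited value/index types to a converter's templated
   // Convert() method, mirroring ConverterVisitor from this PR.
   template <typename Converter>
   struct ConverterVisitor {
     explicit ConverterVisitor(Converter& converter) : converter(converter) {}

     template <typename ValueType, typename IndexType>
     Status operator()(const ValueType& value_type, const IndexType& index_type) {
       return converter.Convert(value_type, index_type);
     }

     Converter& converter;
   };

   }  // namespace internal
   }  // namespace arrow
   ```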



##########
cpp/src/arrow/tensor/coo_converter.cc:
##########
@@ -25,46 +25,48 @@
 
 #include "arrow/buffer.h"
 #include "arrow/status.h"
+#include "arrow/tensor.h"
 #include "arrow/type.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/macros.h"
-#include "arrow/visit_type_inline.h"
 
 namespace arrow {
 
 class MemoryPool;
 
 namespace internal {
+
 namespace {
 
-template <typename c_index_type>
-inline void IncrementRowMajorIndex(std::vector<c_index_type>& coord,
+template <typename IndexCType>
+inline void IncrementRowMajorIndex(std::vector<IndexCType>& coord,

Review Comment:
   Please let's avoid mutable refs
   ```suggestion
   inline void IncrementRowMajorIndex(const std::vector<IndexCType>& coord,
   ```



##########
cpp/src/arrow/tensor/csf_converter.cc:
##########
@@ -57,85 +57,74 @@ inline void IncrementIndex(std::vector<int64_t>& coord, const std::vector<int64_
 // ----------------------------------------------------------------------
 // SparseTensorConverter for SparseCSFIndex
 
-class SparseCSFTensorConverter : private SparseTensorConverterMixin {
-  using SparseTensorConverterMixin::AssignIndex;
-  using SparseTensorConverterMixin::IsNonZero;
-
+class SparseCSFTensorConverter {
  public:
   SparseCSFTensorConverter(const Tensor& tensor,
                            const std::shared_ptr<DataType>& index_value_type,
                            MemoryPool* pool)
       : tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {}
 
-  Status Convert() {
+  template <typename ValueType, typename IndexType>
+  Status Convert(const ValueType&, const IndexType&) {
+    using ValueCType = typename ValueType::c_type;
+    using IndexCType = typename IndexType::c_type;
     RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_,
                                                                   tensor_.shape()));
+    const int64_t ndim = tensor_.ndim();
+    if (ndim <= 1) {
+      return Status::NotImplemented("TODO for ndim <= 1");
+    }
 
-    const int index_elsize = index_value_type_->byte_width();
     const int value_elsize = tensor_.type()->byte_width();
-
-    const int64_t ndim = tensor_.ndim();
     // Axis order as ascending order of dimension size is a good heuristic but is not
     // necessarily optimal.
     std::vector<int64_t> axis_order = internal::ArgSort(tensor_.shape());
     ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero());
 
     ARROW_ASSIGN_OR_RAISE(auto values_buffer,
                           AllocateBuffer(value_elsize * nonzero_count, pool_));
-    auto* values = values_buffer->mutable_data();
 
     std::vector<int64_t> counts(ndim, 0);
     std::vector<int64_t> coord(ndim, 0);
     std::vector<int64_t> previous_coord(ndim, -1);
-    std::vector<BufferBuilder> indptr_buffer_builders(ndim - 1);
-    std::vector<BufferBuilder> indices_buffer_builders(ndim);
 
-    const auto* tensor_data = tensor_.raw_data();
-    uint8_t index_buffer[sizeof(int64_t)];
+    std::vector<TypedBufferBuilder<IndexCType>> indptr_buffer_builders(ndim - 1);
+    std::vector<TypedBufferBuilder<IndexCType>> indices_buffer_builders(ndim);
 
-    if (ndim <= 1) {
-      return Status::NotImplemented("TODO for ndim <= 1");
-    } else {
-      const auto& shape = tensor_.shape();
-      for (int64_t n = tensor_.size(); n > 0; n--) {
-        const auto offset = tensor_.CalculateValueOffset(coord);
-        const auto xp = tensor_data + offset;
-
-        if (std::any_of(xp, xp + value_elsize, IsNonZero)) {
-          bool tree_split = false;
-
-          std::copy_n(xp, value_elsize, values);
-          values += value_elsize;
-
-          for (int64_t i = 0; i < ndim; ++i) {
-            int64_t dimension = axis_order[i];
+    auto* values = values_buffer->mutable_data_as<ValueCType>();
 
-            tree_split = tree_split || (coord[dimension] != previous_coord[dimension]);
-            if (tree_split) {
-              if (i < ndim - 1) {
-                AssignIndex(index_buffer, counts[i + 1], index_elsize);
-                RETURN_NOT_OK(
-                    indptr_buffer_builders[i].Append(index_buffer, index_elsize));
-              }
+    const auto& shape = tensor_.shape();
+    for (int64_t n = tensor_.size(); n > 0; n--) {
+      const auto value = tensor_.Value<ValueType>(coord);
 
-              AssignIndex(index_buffer, coord[dimension], index_elsize);
-              RETURN_NOT_OK(
-                  indices_buffer_builders[i].Append(index_buffer, index_elsize));
+      if (is_not_zero<ValueType>(value)) {
+        bool tree_split = false;
+        *values++ = value;
+        for (int64_t i = 0; i < ndim; ++i) {
+          int64_t dimension = axis_order[i];
 
-              ++counts[i];
+          tree_split = tree_split || (coord[dimension] != previous_coord[dimension]);
+          if (tree_split) {
+            if (i < ndim - 1) {
+              RETURN_NOT_OK(indptr_buffer_builders[i].Append(
+                  static_cast<IndexCType>(counts[i + 1])));
             }
-          }
+            RETURN_NOT_OK(indices_buffer_builders[i].Append(
+                static_cast<IndexCType>(coord[dimension])));
 
-          previous_coord = coord;
+            ++counts[i];
+          }
         }
 
-        IncrementIndex(coord, shape, axis_order);
+        previous_coord = coord;
       }
+
+      IncrementIndex(coord, shape, axis_order);
     }

Review Comment:
   Can we check that `values` and `nonzero_count` are consistent at the end here?
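   For example, something along these lines at the end of the conversion loop could catch a mismatch (a sketch only, assuming `values_buffer`, `values` and `nonzero_count` as in the loop above):
   ```cpp
   // Sketch of a possible consistency check; not part of the PR as written.
   const int64_t values_written = values - values_buffer->mutable_data_as<ValueCType>();
   if (values_written != nonzero_count) {
     return Status::Invalid("Expected ", nonzero_count, " nonzero values, but wrote ",
                            values_written);
   }
   ```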



##########
cpp/src/arrow/tensor/csx_converter.cc:
##########
@@ -70,47 +74,34 @@ class SparseCSXMatrixConverter : private SparseTensorConverterMixin {
 
     ARROW_ASSIGN_OR_RAISE(auto values_buffer,
                           AllocateBuffer(value_elsize * nonzero_count, pool_));
-    auto* values = values_buffer->mutable_data();
-
-    const auto* tensor_data = tensor_.raw_data();
-
-    if (ndim <= 1) {
-      return Status::NotImplemented("TODO for ndim <= 1");
-    } else {
-      ARROW_ASSIGN_OR_RAISE(indptr_buffer,
-                            AllocateBuffer(index_elsize * (n_major + 1), pool_));
-      auto* indptr = indptr_buffer->mutable_data();
-
-      ARROW_ASSIGN_OR_RAISE(indices_buffer,
-                            AllocateBuffer(index_elsize * nonzero_count, pool_));
-      auto* indices = indices_buffer->mutable_data();
-
-      std::vector<int64_t> coords(2);
-      int64_t k = 0;
-      std::fill_n(indptr, index_elsize, 0);
-      indptr += index_elsize;
-      for (int64_t i = 0; i < n_major; ++i) {
-        for (int64_t j = 0; j < n_minor; ++j) {
-          if (axis_ == SparseMatrixCompressedAxis::ROW) {
-            coords = {i, j};
-          } else {
-            coords = {j, i};
-          }
-          const int64_t offset = tensor_.CalculateValueOffset(coords);
-          if (std::any_of(tensor_data + offset, tensor_data + offset + value_elsize,
-                          IsNonZero)) {
-            std::copy_n(tensor_data + offset, value_elsize, values);
-            values += value_elsize;
-
-            AssignIndex(indices, j, index_elsize);
-            indices += index_elsize;
-
-            k++;
-          }
+    ARROW_ASSIGN_OR_RAISE(indptr_buffer,
+                          AllocateBuffer(index_elsize * (n_major + 1), pool_));
+    ARROW_ASSIGN_OR_RAISE(indices_buffer,
+                          AllocateBuffer(index_elsize * nonzero_count, pool_));
+
+    auto* indptr = indptr_buffer->mutable_data_as<IndexCType>();
+    auto* values = values_buffer->mutable_data_as<ValueCType>();
+    auto* indices = indices_buffer->mutable_data_as<IndexCType>();
+
+    std::vector<int64_t> coords(2);
+    int64_t k = 0;
+    indptr[0] = 0;
+    ++indptr;
+    for (int64_t i = 0; i < n_major; ++i) {
+      for (int64_t j = 0; j < n_minor; ++j) {
+        if (axis_ == SparseMatrixCompressedAxis::ROW) {
+          coords = {i, j};
+        } else {
+          coords = {j, i};
+        }
+        auto value = tensor_.Value<ValueType>(coords);
+        if (is_not_zero<ValueType>(value)) {
+          *values++ = value;
+          *indices++ = static_cast<IndexCType>(j);
+          k++;
         }
-        AssignIndex(indptr, k, index_elsize);
-        indptr += index_elsize;
       }
+      *indptr++ = static_cast<IndexCType>(k);
     }

Review Comment:
   Same: can we check that `values`, `indices` and `nonzero_count` are consistent at the end here?
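   For instance (a sketch only, assuming `values_buffer`, `indices_buffer`, `k` and `nonzero_count` as in the loop above):
   ```cpp
   // Sketch of a possible consistency check; not part of the PR as written.
   const int64_t values_written = values - values_buffer->mutable_data_as<ValueCType>();
   const int64_t indices_written = indices - indices_buffer->mutable_data_as<IndexCType>();
   if (values_written != nonzero_count || indices_written != nonzero_count ||
       k != nonzero_count) {
     return Status::Invalid("Nonzero count mismatch: expected ", nonzero_count,
                            ", wrote ", values_written, " values and ",
                            indices_written, " indices");
   }
   ```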



##########
cpp/src/arrow/tensor/converter.h:
##########
@@ -63,5 +66,56 @@ Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSCMatrix(
 Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSFTensor(
     MemoryPool* pool, const SparseCSFTensor* sparse_tensor);
 
+template <typename Converter>
+struct ConverterVisitor {
+  explicit ConverterVisitor(Converter& converter) : converter(converter) {}
+  template <typename ValueType, typename IndexType>
+  Status operator()(const ValueType& value, const IndexType& index_type) {
+    return converter.Convert(value, index_type);
+  }
+
+  Converter& converter;
+};
+
+struct ValueTypeVisitor {
+  template <typename ValueType, typename IndexType, typename Function>
+  enable_if_number<ValueType, Status> Visit(const ValueType& value_type,
+                                            const IndexType& index_type,
+                                            Function&& function) {
+    return function(value_type, index_type);
+  }
+
+  template <typename IndexType, typename Function>
+  Status Visit(const DataType& value_type, const IndexType&, Function&&) {
+    return Status::Invalid("Invalid value type: ", value_type.name(),
+                           ". Expected a number.");
+  }
+};
+
+struct IndexAndValueTypeVisitor {
+  template <typename IndexType, typename Function>
+  enable_if_integer<IndexType, Status> Visit(const IndexType& index_type,
+                                             const std::shared_ptr<DataType>& 
value_type,
+                                             Function&& function) {
+    ValueTypeVisitor visitor;
+    return VisitTypeInline(*value_type, &visitor, index_type,
+                           std::forward<Function>(function));
+  }
+
+  template <typename Function>
+  Status Visit(const DataType& type, const std::shared_ptr<DataType>&, Function&&) {
+    return Status::Invalid("Invalid index type: ", type.name(), ". Expected integer.");
+  }
+};
+
+template <typename Function>
+Status VisitValueAndIndexType(const std::shared_ptr<DataType>& value_type,
+                              const std::shared_ptr<DataType>& index_type,

Review Comment:
   Why not pass both `const DataType&` here? We're not storing the types anyway.
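   For reference, the signature might then look roughly like this (a sketch, not a concrete patch; the visitor helpers above would take `const DataType&` as well):
   ```cpp
   // Sketch of the suggested signature taking const references to the types.
   template <typename Function>
   Status VisitValueAndIndexType(const DataType& value_type, const DataType& index_type,
                                 Function&& function) {
     IndexAndValueTypeVisitor visitor;
     return VisitTypeInline(index_type, &visitor, value_type,
                            std::forward<Function>(function));
   }
   ```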



##########
cpp/src/arrow/tensor/csf_converter.cc:
##########
@@ -57,85 +57,74 @@ inline void IncrementIndex(std::vector<int64_t>& coord, const std::vector<int64_
 // ----------------------------------------------------------------------
 // SparseTensorConverter for SparseCSFIndex
 
-class SparseCSFTensorConverter : private SparseTensorConverterMixin {
-  using SparseTensorConverterMixin::AssignIndex;
-  using SparseTensorConverterMixin::IsNonZero;
-
+class SparseCSFTensorConverter {
  public:
   SparseCSFTensorConverter(const Tensor& tensor,
                            const std::shared_ptr<DataType>& index_value_type,
                            MemoryPool* pool)
       : tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {}
 
-  Status Convert() {
+  template <typename ValueType, typename IndexType>
+  Status Convert(const ValueType&, const IndexType&) {
+    using ValueCType = typename ValueType::c_type;
+    using IndexCType = typename IndexType::c_type;
     RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_,
                                                                   tensor_.shape()));
+    const int64_t ndim = tensor_.ndim();
+    if (ndim <= 1) {
+      return Status::NotImplemented("TODO for ndim <= 1");
+    }
 
-    const int index_elsize = index_value_type_->byte_width();
     const int value_elsize = tensor_.type()->byte_width();
-
-    const int64_t ndim = tensor_.ndim();
     // Axis order as ascending order of dimension size is a good heuristic but is not
     // necessarily optimal.
     std::vector<int64_t> axis_order = internal::ArgSort(tensor_.shape());
     ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero());
 
     ARROW_ASSIGN_OR_RAISE(auto values_buffer,
                           AllocateBuffer(value_elsize * nonzero_count, pool_));
-    auto* values = values_buffer->mutable_data();
 
     std::vector<int64_t> counts(ndim, 0);
     std::vector<int64_t> coord(ndim, 0);
     std::vector<int64_t> previous_coord(ndim, -1);
-    std::vector<BufferBuilder> indptr_buffer_builders(ndim - 1);
-    std::vector<BufferBuilder> indices_buffer_builders(ndim);
 
-    const auto* tensor_data = tensor_.raw_data();
-    uint8_t index_buffer[sizeof(int64_t)];
+    std::vector<TypedBufferBuilder<IndexCType>> indptr_buffer_builders(ndim - 1);

Review Comment:
   Ideally these should also use the user-passed MemoryPool. This is not necessary for this PR though.
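   If picked up later, the builders could be constructed with the converter's pool, e.g. (a sketch, assuming `TypedBufferBuilder` is move-constructible and accepts a `MemoryPool*` in its constructor):
   ```cpp
   // Sketch for a possible follow-up: use the user-passed pool_ for the builders
   // instead of the default memory pool.
   std::vector<TypedBufferBuilder<IndexCType>> indptr_buffer_builders;
   indptr_buffer_builders.reserve(ndim - 1);
   for (int64_t i = 0; i < ndim - 1; ++i) {
     indptr_buffer_builders.emplace_back(pool_);
   }
   std::vector<TypedBufferBuilder<IndexCType>> indices_buffer_builders;
   indices_buffer_builders.reserve(ndim);
   for (int64_t i = 0; i < ndim; ++i) {
     indices_buffer_builders.emplace_back(pool_);
   }
   ```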



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
