This is an automated email from the ASF dual-hosted git repository.
mrkn pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 16e2667 ARROW-6624: [C++][Python] Add SparseTensor.ToTensor() method
16e2667 is described below
commit 16e2667ec3d4cb59e7d90488acdcb31ce6827d1e
Author: Rok <[email protected]>
AuthorDate: Sun Oct 20 14:33:12 2019 +0900
ARROW-6624: [C++][Python] Add SparseTensor.ToTensor() method
This resolves [ARROW-6624](https://issues.apache.org/jira/browse/ARROW-6624).
Closes #5539 from rok/ARROW-6624 and squashes the following commits:
0eb11c692 <Rok> Implementing review feedback.
89f3eca01 <Rok> Adding python interface.
e6e8dcfab <Rok> ARROW-6624 Add SparseTensor.ToTensor() method
Authored-by: Rok <[email protected]>
Signed-off-by: Kenta Murata <[email protected]>
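For reference, a minimal usage sketch of the new method from Python, assuming a
pyarrow build that includes this commit (the array values are illustrative):

    import numpy as np
    import pyarrow as pa

    dense = np.array([[4, 0, 9, 0],
                      [0, 7, 0, 0],
                      [0, 0, 0, 5]], dtype=np.int64)

    # Round-trip: dense ndarray -> SparseCOOTensor -> dense arrow::Tensor
    sparse = pa.SparseCOOTensor.from_dense_numpy(dense)
    tensor = sparse.to_tensor()  # method added by this commit
    assert np.array_equal(tensor.to_numpy(), dense)

The same to_tensor() method is added to SparseCSRMatrix below.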
---
cpp/src/arrow/python/numpy_convert.cc | 4 +-
cpp/src/arrow/sparse_tensor.cc | 129 +++++++++++++++++++++++++++++
cpp/src/arrow/sparse_tensor.h | 9 ++
cpp/src/arrow/sparse_tensor_test.cc | 29 +++++++
python/pyarrow/includes/libarrow.pxd | 2 +
python/pyarrow/tensor.pxi | 30 ++++++-
python/pyarrow/tests/test_sparse_tensor.py | 21 +++++
7 files changed, 218 insertions(+), 6 deletions(-)
diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index 6c1f3d7..792f47d 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -324,7 +324,7 @@ Status SparseCOOTensorToNdarray(const std::shared_ptr<SparseCOOTensor>& sparse_t
// Wrap tensor data
OwnedRef result_data;
RETURN_NOT_OK(SparseTensorDataToNdarray(
- *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+ *sparse_tensor, {sparse_tensor->non_zero_length(), 1}, base, result_data.ref()));
// Wrap indices
PyObject* result_coords;
@@ -344,7 +344,7 @@ Status SparseCSRMatrixToNdarray(const std::shared_ptr<SparseCSRMatrix>& sparse_t
// Wrap tensor data
OwnedRef result_data;
RETURN_NOT_OK(SparseTensorDataToNdarray(
- *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+ *sparse_tensor, {sparse_tensor->non_zero_length(), 1}, base, result_data.ref()));
// Wrap indices
OwnedRef result_indptr;
diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc
index b6fe2f3..3fd7008 100644
--- a/cpp/src/arrow/sparse_tensor.cc
+++ b/cpp/src/arrow/sparse_tensor.cc
@@ -364,6 +364,131 @@ void MakeSparseTensorFromTensor(const Tensor& tensor,
}
}
+template <typename TYPE, typename IndexValueType>
+Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
+ std::shared_ptr<Tensor>* out) {
+ using c_index_value_type = typename IndexValueType::c_type;
+ using NumericTensorType = NumericTensor<TYPE>;
+ using value_type = typename NumericTensorType::value_type;
+
+ std::shared_ptr<Buffer> values_buffer;
+ RETURN_NOT_OK(
+ AllocateBuffer(pool, sizeof(value_type) * sparse_tensor->size(), &values_buffer));
+ auto values = reinterpret_cast<value_type*>(values_buffer->mutable_data());
+
+ std::fill_n(values, sparse_tensor->size(), static_cast<value_type>(0));
+
+ switch (sparse_tensor->format_id()) {
+ case SparseTensorFormat::COO: {
+ const auto& sparse_index =
+ internal::checked_cast<const SparseCOOIndex&>(*sparse_tensor->sparse_index());
+ const std::shared_ptr<const Tensor> coords = sparse_index.indices();
+ const auto raw_data =
+ reinterpret_cast<const value_type*>(sparse_tensor->raw_data());
+ std::vector<int64_t> strides(sparse_tensor->ndim(), 1);
+
+ for (int i = sparse_tensor->ndim() - 1; i > 0; --i) {
+ strides[i - 1] *= strides[i] * sparse_tensor->shape()[i];
+ }
+ for (int64_t i = 0; i < sparse_tensor->non_zero_length(); ++i) {
+ std::vector<c_index_value_type> coord(sparse_tensor->ndim());
+ int64_t offset = 0;
+ for (int64_t j = 0; j < static_cast<int>(coord.size()); ++j) {
+ coord[j] = coords->Value<IndexValueType>({i, j});
+ offset += coord[j] * strides[j];
+ }
+ values[offset] = raw_data[i];
+ }
+ *out = std::make_shared<Tensor>(sparse_tensor->type(), values_buffer,
+ sparse_tensor->shape());
+ return Status::OK();
+ }
+
+ case SparseTensorFormat::CSR: {
+ const auto& sparse_index =
+ internal::checked_cast<const SparseCSRIndex&>(*sparse_tensor->sparse_index());
+ const std::shared_ptr<const Tensor> indptr = sparse_index.indptr();
+ const std::shared_ptr<const Tensor> indices = sparse_index.indices();
+ const auto raw_data =
+ reinterpret_cast<const value_type*>(sparse_tensor->raw_data());
+
+ int64_t offset;
+ for (int64_t i = 0; i < indptr->size() - 1; ++i) {
+ const int64_t start = indptr->Value<IndexValueType>({i});
+ const int64_t stop = indptr->Value<IndexValueType>({i + 1});
+ for (int64_t j = start; j < stop; ++j) {
+ offset = indices->Value<IndexValueType>({j}) + i * sparse_tensor->shape()[1];
+ values[offset] = raw_data[j];
+ }
+ }
+ *out = std::make_shared<Tensor>(sparse_tensor->type(), values_buffer,
+ sparse_tensor->shape());
+ return Status::OK();
+ }
+ }
+ return Status::NotImplemented("Unsupported SparseIndex format type");
+}
+
+#define MAKE_TENSOR_FROM_SPARSE_TENSOR_INDEX_TYPE(IndexValueType) \
+ case IndexValueType##Type::type_id: \
+ return MakeTensorFromSparseTensor<TYPE, IndexValueType##Type>(pool, sparse_tensor, \
+ out); \
+ break;
+template <typename TYPE>
+Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
+ std::shared_ptr<Tensor>* out) {
+ std::shared_ptr<DataType> type;
+ switch (sparse_tensor->format_id()) {
+ case SparseTensorFormat::COO: {
+ const auto& sparse_index =
+ internal::checked_cast<const SparseCOOIndex&>(*sparse_tensor->sparse_index());
+ const std::shared_ptr<const Tensor> indices = sparse_index.indices();
+ type = indices->type();
+ break;
+ }
+ case SparseTensorFormat::CSR: {
+ const auto& sparse_index =
+ internal::checked_cast<const SparseCSRIndex&>(*sparse_tensor->sparse_index());
+ const std::shared_ptr<const Tensor> indices = sparse_index.indices();
+ type = indices->type();
+ break;
+ }
+ // LCOV_EXCL_START: ignore program failure
+ default:
+ ARROW_LOG(FATAL) << "Unsupported SparseIndex format";
+ break;
+ // LCOV_EXCL_STOP
+ }
+
+ switch (type->id()) {
+ ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(MAKE_TENSOR_FROM_SPARSE_TENSOR_INDEX_TYPE);
+ // LCOV_EXCL_START: ignore program failure
+ default:
+ ARROW_LOG(FATAL) << "Unsupported SparseIndex value type";
+ return Status::NotImplemented("Unsupported SparseIndex value type");
+ // LCOV_EXCL_STOP
+ }
+}
+#undef MAKE_TENSOR_FROM_SPARSE_TENSOR_INDEX_TYPE
+
+#define MAKE_TENSOR_FROM_SPARSE_TENSOR_VALUE_TYPE(TYPE) \
+ case TYPE##Type::type_id: \
+ return MakeTensorFromSparseTensor<TYPE##Type>(pool, sparse_tensor, out);
+
+Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
+ std::shared_ptr<Tensor>* out) {
+ switch (sparse_tensor->type()->id()) {
+ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(MAKE_TENSOR_FROM_SPARSE_TENSOR_VALUE_TYPE);
+ // LCOV_EXCL_START: ignore program failure
+ default:
+ ARROW_LOG(FATAL) << "Unsupported SparseTensor value type";
+ return Status::NotImplemented("Unsupported SparseTensor data value
type");
+ // LCOV_EXCL_STOP
+ }
+}
+#undef MAKE_TENSOR_FROM_SPARSE_TENSOR_VALUE_TYPE
+
} // namespace internal
// ----------------------------------------------------------------------
@@ -429,4 +554,8 @@ bool SparseTensor::Equals(const SparseTensor& other) const {
return SparseTensorEquals(*this, other);
}
+Status SparseTensor::ToTensor(MemoryPool* pool, std::shared_ptr<Tensor>* out) const {
+ return internal::MakeTensorFromSparseTensor(pool, this, out);
+}
+
} // namespace arrow
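The COO branch above computes row-major strides from the shape and scatters each
nonzero into a zero-filled buffer; the CSR branch walks indptr row by row and
places each value at indices[j] + i * ncols. A rough Python sketch of that index
arithmetic (illustrative only, not part of the commit):

    def coo_to_dense(shape, coords, data):
        """Scatter COO nonzeros (coords[i] -> data[i]) into a flat row-major list."""
        ndim = len(shape)
        strides = [1] * ndim
        for i in range(ndim - 1, 0, -1):
            strides[i - 1] = strides[i] * shape[i]  # stride[j] = prod(shape[j+1:])
        out = [0] * (strides[0] * shape[0])  # zero-filled dense buffer
        for coord, value in zip(coords, data):
            out[sum(c * s for c, s in zip(coord, strides))] = value
        return out

    def csr_to_dense(shape, indptr, indices, data):
        """Expand CSR triplets into a flat row-major list."""
        nrows, ncols = shape
        out = [0] * (nrows * ncols)
        for i in range(nrows):
            # Nonzeros of row i live in data[indptr[i]:indptr[i+1]].
            for j in range(indptr[i], indptr[i + 1]):
                out[i * ncols + indices[j]] = data[j]
        return out

    # coo_to_dense((2, 2), [(0, 0), (1, 1)], [4, 5]) == [4, 0, 0, 5]
    # csr_to_dense((2, 2), [0, 1, 2], [0, 1], [4, 5]) == [4, 0, 0, 5]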
diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h
index 47df011..d24a680 100644
--- a/cpp/src/arrow/sparse_tensor.h
+++ b/cpp/src/arrow/sparse_tensor.h
@@ -204,6 +204,15 @@ class ARROW_EXPORT SparseTensor {
/// \brief Return whether sparse tensors are equal
bool Equals(const SparseTensor& other) const;
+ /// \brief Return dense representation of sparse tensor as tensor
+ Status ToTensor(std::shared_ptr<Tensor>* out) const {
+ return ToTensor(default_memory_pool(), out);
+ }
+
+ /// \brief Return dense representation of sparse tensor as tensor
+ /// using specified memory pool
+ Status ToTensor(MemoryPool* pool, std::shared_ptr<Tensor>* out) const;
+
protected:
// Constructor with all attributes
SparseTensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc
index e37f3e4..5fcae47 100644
--- a/cpp/src/arrow/sparse_tensor_test.cc
+++ b/cpp/src/arrow/sparse_tensor_test.cc
@@ -202,6 +202,21 @@ TEST_F(TestSparseCOOTensor, TensorEquality) {
ASSERT_FALSE(st1.Equals(st2));
}
+TEST_F(TestSparseCOOTensor, TestToTensor) {
+ std::vector<int64_t> values = {1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
+ std::vector<int64_t> shape({4, 3, 2, 1});
+ std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
+ Tensor tensor(int64(), buffer, shape, {}, this->dim_names_);
+ SparseTensorImpl<SparseCOOIndex> sparse_tensor(tensor);
+
+ ASSERT_EQ(5, sparse_tensor.non_zero_length());
+ ASSERT_TRUE(sparse_tensor.is_mutable());
+ std::shared_ptr<Tensor> dense_tensor;
+ ASSERT_OK(sparse_tensor.ToTensor(&dense_tensor));
+ ASSERT_TRUE(tensor.Equals(*dense_tensor));
+}
+
template <typename IndexValueType>
class TestSparseCOOTensorForIndexValueType
: public TestSparseCOOTensorBase<IndexValueType> {
@@ -469,4 +484,18 @@ TEST_F(TestSparseCSRMatrix, TensorEquality) {
ASSERT_FALSE(st1.Equals(st2));
}
+TEST_F(TestSparseCSRMatrix, TestToTensor) {
+ std::vector<int64_t> values = {1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1,
+ 0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1};
+ std::vector<int64_t> shape({6, 4});
+ std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
+ Tensor tensor(int64(), buffer, shape, {}, this->dim_names_);
+ SparseTensorImpl<SparseCSRIndex> sparse_tensor(tensor);
+
+ ASSERT_EQ(7, sparse_tensor.non_zero_length());
+ ASSERT_TRUE(sparse_tensor.is_mutable());
+ std::shared_ptr<Tensor> dense_tensor;
+ ASSERT_OK(sparse_tensor.ToTensor(&dense_tensor));
+ ASSERT_TRUE(tensor.Equals(*dense_tensor));
+}
} // namespace arrow
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index fd130f8..dc29c10 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -663,6 +663,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CSparseCOOTensor" arrow::SparseCOOTensor":
shared_ptr[CDataType] type()
shared_ptr[CBuffer] data()
+ CStatus ToTensor(shared_ptr[CTensor]*)
const vector[int64_t]& shape()
int64_t size()
@@ -679,6 +680,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CSparseCSRMatrix" arrow::SparseCSRMatrix":
shared_ptr[CDataType] type()
shared_ptr[CBuffer] data()
+ CStatus ToTensor(shared_ptr[CTensor]*)
const vector[int64_t]& shape()
int64_t size()
diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi
index fb2c3c0..4b93676 100644
--- a/python/pyarrow/tensor.pxi
+++ b/python/pyarrow/tensor.pxi
@@ -175,7 +175,8 @@ shape: {0.shape}""".format(self)
"SparseCOOTensor indices")
check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(),
- data, coords, c_shape, c_dim_names, &csparse_tensor))
+ data, coords, c_shape,
+ c_dim_names, &csparse_tensor))
return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)
@staticmethod
@@ -202,6 +203,16 @@ shape: {0.shape}""".format(self)
&out_data, &out_coords))
return PyObject_to_object(out_data), PyObject_to_object(out_coords)
+ def to_tensor(self):
+ """
+ Convert arrow::SparseCOOTensor to arrow::Tensor
+ """
+
+ cdef shared_ptr[CTensor] ctensor
+ check_status(self.stp.ToTensor(&ctensor))
+
+ return pyarrow_wrap_tensor(ctensor)
+
def equals(self, SparseCOOTensor other):
"""
Return true if sparse tensors contain exactly equal data
@@ -296,8 +307,8 @@ shape: {0.shape}""".format(self)
"SparseCSRMatrix indices")
check_status(NdarraysToSparseCSRMatrix(c_default_memory_pool(),
- data, indptr, indices, c_shape, c_dim_names,
- &csparse_tensor))
+ data, indptr, indices, c_shape,
+ c_dim_names, &csparse_tensor))
return pyarrow_wrap_sparse_csr_matrix(csparse_tensor)
@staticmethod
@@ -322,10 +333,21 @@ shape: {0.shape}""".format(self)
cdef PyObject* out_indices
check_status(SparseCSRMatrixToNdarray(self.sp_sparse_tensor, self,
- &out_data, &out_indptr, &out_indices))
+ &out_data, &out_indptr,
+ &out_indices))
return (PyObject_to_object(out_data), PyObject_to_object(out_indptr),
PyObject_to_object(out_indices))
+ def to_tensor(self):
+ """
+ Convert arrow::SparseCSRMatrix to arrow::Tensor
+ """
+
+ cdef shared_ptr[CTensor] ctensor
+ check_status(self.stp.ToTensor(&ctensor))
+
+ return pyarrow_wrap_tensor(ctensor)
+
def equals(self, SparseCSRMatrix other):
"""
Return true if sparse tensors contain exactly equal data
diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py
index 225bbbf..aaf0468 100644
--- a/python/pyarrow/tests/test_sparse_tensor.py
+++ b/python/pyarrow/tests/test_sparse_tensor.py
@@ -219,3 +219,24 @@ def test_sparse_tensor_csr_numpy_roundtrip(dtype_str, arrow_type):
assert np.array_equal(indptr, result_indptr)
assert np.array_equal(indices, result_indices)
assert sparse_tensor.dim_names == dim_names
+
+
+@pytest.mark.parametrize('sparse_tensor_type', [
+ pa.SparseCSRMatrix,
+ pa.SparseCOOTensor,
+])
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
+def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type):
+ dtype = np.dtype(dtype_str)
+ array = np.array([[4, 0, 9, 0],
+ [0, 7, 0, 0],
+ [0, 0, 0, 0],
+ [0, 0, 0, 5]]).astype(dtype)
+
+ sparse_tensor = sparse_tensor_type.from_dense_numpy(array)
+ tensor = sparse_tensor.to_tensor()
+ result_array = tensor.to_numpy()
+
+ assert sparse_tensor.type == arrow_type
+ assert tensor.type == arrow_type
+ assert np.array_equal(array, result_array)