This is an automated email from the ASF dual-hosted git repository.
mrkn pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 16e2667 ARROW-6624: [C++][Python] Add SparseTensor.ToTensor() method
16e2667 is described below
commit 16e2667ec3d4cb59e7d90488acdcb31ce6827d1e
Author: Rok <[email protected]>
AuthorDate: Sun Oct 20 14:33:12 2019 +0900
ARROW-6624: [C++][Python] Add SparseTensor.ToTensor() method
This resolves [ARROW-6624](https://issues.apache.org/jira/browse/ARROW-6624).
Closes #5539 from rok/ARROW-6624 and squashes the following commits:
0eb11c692 <Rok> Implementing review feedback.
89f3eca01 <Rok> Adding python interface.
e6e8dcfab <Rok> ARROW-6624 Add SparseTensor.ToTensor() method
Authored-by: Rok <[email protected]>
Signed-off-by: Kenta Murata <[email protected]>
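For reference, a minimal usage sketch of the new method from Python, assuming a
pyarrow build that includes this commit (the array values are illustrative):

    import numpy as np
    import pyarrow as pa

    dense = np.array([[4, 0, 9, 0],
                      [0, 7, 0, 0],
                      [0, 0, 0, 5]], dtype=np.int64)

    # Round-trip: dense ndarray -> SparseCOOTensor -> dense arrow::Tensor
    sparse = pa.SparseCOOTensor.from_dense_numpy(dense)
    tensor = sparse.to_tensor()  # method added by this commit
    assert np.array_equal(tensor.to_numpy(), dense)

The same to_tensor() method is added to SparseCSRMatrix below.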
---
cpp/src/arrow/python/numpy_convert.cc | 4 +-
cpp/src/arrow/sparse_tensor.cc | 129 +++++++++++++++++++++++++++++
cpp/src/arrow/sparse_tensor.h | 9 ++
cpp/src/arrow/sparse_tensor_test.cc | 29 +++++++
python/pyarrow/includes/libarrow.pxd | 2 +
python/pyarrow/tensor.pxi | 30 ++++++-
python/pyarrow/tests/test_sparse_tensor.py | 21 +++++
7 files changed, 218 insertions(+), 6 deletions(-)
diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index 6c1f3d7..792f47d 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -324,7 +324,7 @@ Status SparseCOOTensorToNdarray(const std::shared_ptr<SparseCOOTensor>& sparse_t
// Wrap tensor data
OwnedRef result_data;
RETURN_NOT_OK(SparseTensorDataToNdarray(
- *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+ *sparse_tensor, {sparse_tensor->non_zero_length(), 1}, base, result_data.ref()));
// Wrap indices
PyObject* result_coords;
@@ -344,7 +344,7 @@ Status SparseCSRMatrixToNdarray(const std::shared_ptr<SparseCSRMatrix>& sparse_t
// Wrap tensor data
OwnedRef result_data;
RETURN_NOT_OK(SparseTensorDataToNdarray(
- *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+ *sparse_tensor, {sparse_tensor->non_zero_length(), 1}, base, result_data.ref()));
// Wrap indices
OwnedRef result_indptr;
diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc
index b6fe2f3..3fd7008 100644
--- a/cpp/src/arrow/sparse_tensor.cc
+++ b/cpp/src/arrow/sparse_tensor.cc
@@ -364,6 +364,131 @@ void MakeSparseTensorFromTensor(const Tensor& tensor,
}
}
+template <typename TYPE, typename IndexValueType>
+Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
+ std::shared_ptr<Tensor>* out) {
+ using c_index_value_type = typename IndexValueType::c_type;
+ using NumericTensorType = NumericTensor<TYPE>;
+ using value_type = typename NumericTensorType::value_type;
+
+ std::shared_ptr<Buffer> values_buffer;
+ RETURN_NOT_OK(
+ AllocateBuffer(pool, sizeof(value_type) * sparse_tensor->size(), &values_buffer));
+ auto values = reinterpret_cast<value_type*>(values_buffer->mutable_data());
+
+ std::fill_n(values, sparse_tensor->size(), static_cast<value_type>(0));
+
+ switch (sparse_tensor->format_id()) {
+ case SparseTensorFormat::COO: {
+ const auto& sparse_index =
+ internal::checked_cast<const SparseCOOIndex&>(*sparse_tensor->sparse_index());
+ const std::shared_ptr<const Tensor> coords = sparse_index.indices();
+ const auto raw_data =
+ reinterpret_cast<const value_type*>(sparse_tensor->raw_data());
+ std::vector<int64_t> strides(sparse_tensor->ndim(), 1);
+
+ for (int i = sparse_tensor->ndim() - 1; i > 0; --i) {
+ strides[i - 1] *= strides[i] * sparse_tensor->shape()[i];
+ }
+ for (int64_t i = 0; i < sparse_tensor->non_zero_length(); ++i) {
+ std::vector<c_index_value_type> coord(sparse_tensor->ndim());
+ int64_t offset = 0;
+ for (int64_t j = 0; j < static_cast<int>(coord.size()); ++j) {
+ coord[j] = coords->Value<IndexValueType>({i, j});
+ offset += coord[j] * strides[j];
+ }
+ values[offset] = raw_data[i];
+ }
+ *out = std::make_shared<Tensor>(sparse_tensor->type(), values_buffer,
+ sparse_tensor->shape());
+ return Status::OK();
+ }
+
+ case SparseTensorFormat::CSR: {
+ const auto& sparse_index =
+ internal::checked_cast<const SparseCSRIndex&>(*sparse_tensor->sparse_index());
+ const std::shared_ptr<const Tensor> indptr = sparse_index.indptr();
+ const std::shared_ptr<const Tensor> indices = sparse_index.indices();
+ const auto raw_data =
+ reinterpret_cast<const value_type*>(sparse_tensor->raw_data());
+
+ int64_t offset;
+ for (int64_t i = 0; i < indptr->size() - 1; ++i) {
+ const int64_t start = indptr->Value<IndexValueType>({i});
+ const int64_t stop = indptr->Value<IndexValueType>({i + 1});
+ for (int64_t j = start; j < stop; ++j) {
+ offset = indices->Value<IndexValueType>({j}) + i * sparse_tensor->shape()[1];
+ values[offset] = raw_data[j];
+ }
+ }
+ *out = std::make_shared<Tensor>(sparse_tensor->type(), values_buffer,
+ sparse_tensor->shape());
+ return Status::OK();
+ }
+ }
+ return Status::NotImplemented("Unsupported SparseIndex format type");
+}
+
+#define MAKE_TENSOR_FROM_SPARSE_TENSOR_INDEX_TYPE(IndexValueType) \
+ case IndexValueType##Type::type_id: \
+ return MakeTensorFromSparseTensor<TYPE, IndexValueType##Type>(pool, sparse_tensor, \
+ out); \
+ break;
+template <typename TYPE>
+Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
+ std::shared_ptr<Tensor>* out) {
+ std::shared_ptr<DataType> type;
+ switch (sparse_tensor->format_id()) {
+ case SparseTensorFormat::COO: {
+ const auto& sparse_index =
+ internal::checked_cast<const SparseCOOIndex&>(*sparse_tensor->sparse_index());
+ const std::shared_ptr<const Tensor> indices = sparse_index.indices();
+ type = indices->type();
+ break;
+ }
+ case SparseTensorFormat::CSR: {
+ const auto& sparse_index =
+ internal::checked_cast<const SparseCSRIndex&>(*sparse_tensor->sparse_index());
+ const std::shared_ptr<const Tensor> indices = sparse_index.indices();
+ type = indices->type();
+ break;
+ }
+ // LCOV_EXCL_START: ignore program failure
+ default:
+ ARROW_LOG(FATAL) << "Unsupported SparseIndex format";
+ break;
+ // LCOV_EXCL_STOP
+ }
+
+ switch (type->id()) {
+ ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(MAKE_TENSOR_FROM_SPARSE_TENSOR_INDEX_TYPE);
+ // LCOV_EXCL_START: ignore program failure
+ default:
+ ARROW_LOG(FATAL) << "Unsupported SparseIndex value type";
+ return Status::NotImplemented("Unsupported SparseIndex value type");
+ // LCOV_EXCL_STOP
+ }
+}
+#undef MAKE_TENSOR_FROM_SPARSE_TENSOR_INDEX_TYPE
+
+#define MAKE_TENSOR_FROM_SPARSE_TENSOR_VALUE_TYPE(TYPE) \
+ case TYPE##Type::type_id: \
+ return MakeTensorFromSparseTensor<TYPE##Type>(pool, sparse_tensor, out);
+
+Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
+ std::shared_ptr<Tensor>* out) {
+ switch (sparse_tensor->type()->id()) {
+ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(MAKE_TENSOR_FROM_SPARSE_TENSOR_VALUE_TYPE);
+ // LCOV_EXCL_START: ignore program failure
+ default:
+ ARROW_LOG(FATAL) << "Unsupported SparseTensor value type";
+ return Status::NotImplemented("Unsupported SparseTensor data value
type");
+ // LCOV_EXCL_STOP
+ }
+}
+#undef MAKE_TENSOR_FROM_SPARSE_TENSOR_VALUE_TYPE
+
} // namespace internal
// ----------------------------------------------------------------------
@@ -429,4 +554,8 @@ bool SparseTensor::Equals(const SparseTensor& other) const {
return SparseTensorEquals(*this, other);
}
+Status SparseTensor::ToTensor(MemoryPool* pool, std::shared_ptr<Tensor>* out) const {
+ return internal::MakeTensorFromSparseTensor(pool, this, out);
+}
+
} // namespace arrow
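The COO branch above computes row-major strides from the shape and scatters each
nonzero into a zero-filled buffer; the CSR branch walks indptr row by row and
places each value at indices[j] + i * ncols. A rough Python sketch of that index
arithmetic (illustrative only, not part of the commit):

    def coo_to_dense(shape, coords, data):
        """Scatter COO nonzeros (coords[i] -> data[i]) into a flat row-major list."""
        ndim = len(shape)
        strides = [1] * ndim
        for i in range(ndim - 1, 0, -1):
            strides[i - 1] = strides[i] * shape[i]  # stride[j] = prod(shape[j+1:])
        out = [0] * (strides[0] * shape[0])  # zero-filled dense buffer
        for coord, value in zip(coords, data):
            out[sum(c * s for c, s in zip(coord, strides))] = value
        return out

    def csr_to_dense(shape, indptr, indices, data):
        """Expand CSR triplets into a flat row-major list."""
        nrows, ncols = shape
        out = [0] * (nrows * ncols)
        for i in range(nrows):
            # Nonzeros of row i live in data[indptr[i]:indptr[i+1]].
            for j in range(indptr[i], indptr[i + 1]):
                out[i * ncols + indices[j]] = data[j]
        return out

    # coo_to_dense((2, 2), [(0, 0), (1, 1)], [4, 5]) == [4, 0, 0, 5]
    # csr_to_dense((2, 2), [0, 1, 2], [0, 1], [4, 5]) == [4, 0, 0, 5]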
diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h
index 47df011..d24a680 100644
--- a/cpp/src/arrow/sparse_tensor.h
+++ b/cpp/src/arrow/sparse_tensor.h
@@ -204,6 +204,15 @@ class ARROW_EXPORT SparseTensor {
/// \brief Return whether sparse tensors are equal
bool Equals(const SparseTensor& other) const;
+ /// \brief Return dense representation of sparse tensor as tensor
+ Status ToTensor(std::shared_ptr<Tensor>* out) const {
+ return ToTensor(default_memory_pool(), out);
+ }
+
+ /// \brief Return dense representation of sparse tensor as tensor
+ /// using specified memory pool
+ Status ToTensor(MemoryPool* pool, std::shared_ptr<Tensor>* out) const;
+
protected:
// Constructor with all attributes
SparseTensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc
index e37f3e4..5fcae47 100644
--- a/cpp/src/arrow/sparse_tensor_test.cc
+++ b/cpp/src/arrow/sparse_tensor_test.cc
@@ -202,6 +202,21 @@ TEST_F(TestSparseCOOTensor, TensorEquality) {
ASSERT_FALSE(st1.Equals(st2));
}
+TEST_F(TestSparseCOOTensor, TestToTensor) {
+ std::vector<int64_t> values = {1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
+ std::vector<int64_t> shape({4, 3, 2, 1});
+ std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
+ Tensor tensor(int64(), buffer, shape, {}, this->dim_names_);
+ SparseTensorImpl<SparseCOOIndex> sparse_tensor(tensor);
+
+ ASSERT_EQ(5, sparse_tensor.non_zero_length());
+ ASSERT_TRUE(sparse_tensor.is_mutable());
+ std::shared_ptr<Tensor> dense_tensor;
+ ASSERT_OK(sparse_tensor.ToTensor(&dense_tensor));
+ ASSERT_TRUE(tensor.Equals(*dense_tensor));
+}
+
template <typename IndexValueType>
class TestSparseCOOTensorForIndexValueType
: public TestSparseCOOTensorBase<IndexValueType> {
@@ -469,4 +484,18 @@ TEST_F(TestSparseCSRMatrix, TensorEquality) {
ASSERT_FALSE(st1.Equals(st2));
}
+TEST_F(TestSparseCSRMatrix, TestToTensor) {
+ std::vector<int64_t> values = {1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1,
+ 0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1};
+ std::vector<int64_t> shape({6, 4});
+ std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
+ Tensor tensor(int64(), buffer, shape, {}, this->dim_names_);
+ SparseTensorImpl<SparseCSRIndex> sparse_tensor(tensor);
+
+ ASSERT_EQ(7, sparse_tensor.non_zero_length());
+ ASSERT_TRUE(sparse_tensor.is_mutable());
+ std::shared_ptr<Tensor> dense_tensor;
+ ASSERT_OK(sparse_tensor.ToTensor(&dense_tensor));
+ ASSERT_TRUE(tensor.Equals(*dense_tensor));
+}
} // namespace arrow
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index fd130f8..dc29c10 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -663,6 +663,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CSparseCOOTensor" arrow::SparseCOOTensor":
shared_ptr[CDataType] type()
shared_ptr[CBuffer] data()
+ CStatus ToTensor(shared_ptr[CTensor]*)
const vector[int64_t]& shape()
int64_t size()
@@ -679,6 +680,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CSparseCSRMatrix" arrow::SparseCSRMatrix":
shared_ptr[CDataType] type()
shared_ptr[CBuffer] data()
+ CStatus ToTensor(shared_ptr[CTensor]*)
const vector[int64_t]& shape()
int64_t size()
diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi
index fb2c3c0..4b93676 100644
--- a/python/pyarrow/tensor.pxi
+++ b/python/pyarrow/tensor.pxi
@@ -175,7 +175,8 @@ shape: {0.shape}""".format(self)
"SparseCOOTensor indices")
check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(),
- data, coords, c_shape, c_dim_names, &csparse_tensor))
+ data, coords, c_shape,
+ c_dim_names, &csparse_tensor))
return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)
@staticmethod
@@ -202,6 +203,16 @@ shape: {0.shape}""".format(self)
&out_data, &out_coords))
return PyObject_to_object(out_data), PyObject_to_object(out_coords)
+ def to_tensor(self):
+ """
+ Convert arrow::SparseCOOTensor to arrow::Tensor
+ """
+
+ cdef shared_ptr[CTensor] ctensor
+ check_status(self.stp.ToTensor(&ctensor))
+
+ return pyarrow_wrap_tensor(ctensor)
+
def equals(self, SparseCOOTensor other):
"""
Return true if sparse tensors contain exactly equal data
@@ -296,8 +307,8 @@ shape: {0.shape}""".format(self)
"SparseCSRMatrix indices")
check_status(NdarraysToSparseCSRMatrix(c_default_memory_pool(),
- data, indptr, indices, c_shape, c_dim_names,
- &csparse_tensor))
+ data, indptr, indices, c_shape,
+ c_dim_names, &csparse_tensor))
return pyarrow_wrap_sparse_csr_matrix(csparse_tensor)
@staticmethod
@@ -322,10 +333,21 @@ shape: {0.shape}""".format(self)
cdef PyObject* out_indices
check_status(SparseCSRMatrixToNdarray(self.sp_sparse_tensor, self,
- &out_data, &out_indptr, &out_indices))
+ &out_data, &out_indptr,
+ &out_indices))
return (PyObject_to_object(out_data), PyObject_to_object(out_indptr),
PyObject_to_object(out_indices))
+ def to_tensor(self):
+ """
+ Convert arrow::SparseCSRMatrix to arrow::Tensor
+ """
+
+ cdef shared_ptr[CTensor] ctensor
+ check_status(self.stp.ToTensor(&ctensor))
+
+ return pyarrow_wrap_tensor(ctensor)
+
def equals(self, SparseCSRMatrix other):
"""
Return true if sparse tensors contain exactly equal data
diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py
index 225bbbf..aaf0468 100644
--- a/python/pyarrow/tests/test_sparse_tensor.py
+++ b/python/pyarrow/tests/test_sparse_tensor.py
@@ -219,3 +219,24 @@ def test_sparse_tensor_csr_numpy_roundtrip(dtype_str, arrow_type):
assert np.array_equal(indptr, result_indptr)
assert np.array_equal(indices, result_indices)
assert sparse_tensor.dim_names == dim_names
+
+
+@pytest.mark.parametrize('sparse_tensor_type', [
+ pa.SparseCSRMatrix,
+ pa.SparseCOOTensor,
+])
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
+def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type):
+ dtype = np.dtype(dtype_str)
+ array = np.array([[4, 0, 9, 0],
+ [0, 7, 0, 0],
+ [0, 0, 0, 0],
+ [0, 0, 0, 5]]).astype(dtype)
+
+ sparse_tensor = sparse_tensor_type.from_dense_numpy(array)
+ tensor = sparse_tensor.to_tensor()
+ result_array = tensor.to_numpy()
+
+ assert sparse_tensor.type == arrow_type
+ assert tensor.type == arrow_type
+ assert np.array_equal(array, result_array)