This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 74b9294 ARROW-4453: [Python] Cython wrappers for SparseTensor
74b9294 is described below
commit 74b9294bf63ff49818f2d6a72877139a1a540f60
Author: Rok <[email protected]>
AuthorDate: Tue Jul 2 10:21:41 2019 +0200
ARROW-4453: [Python] Cython wrappers for SparseTensor
Creating Cython wrappers for SparseTensor.
This resolves
[ARROW-4453](https://issues.apache.org/jira/browse/ARROW-4453).
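For reference, a minimal usage sketch of the Python API this adds (mirroring the
tests below; the values are illustrative):

```python
import numpy as np
import pyarrow as pa

dense = np.array([[1, 0, 2],
                  [0, 0, 3],
                  [4, 5, 6]])

# Both sparse formats are built from (and convert back to) numpy arrays.
coo = pa.SparseTensorCOO.from_dense_numpy(dense)
csr = pa.SparseTensorCSR.from_dense_numpy(dense)

data, coords = coo.to_numpy()           # non-zero values and their coordinates
data, indptr, indices = csr.to_numpy()  # the classic CSR triplet

assert coo.shape == (3, 3)
assert coo.non_zero_length == 6
```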
Author: Rok <[email protected]>
Author: Antoine Pitrou <[email protected]>
Closes #4446 from rok/ARROW-4453 and squashes the following commits:
db5d620fe <Rok> Typo.
9e0363afe <Antoine Pitrou> Polish code
c31b8eb32 <Rok> Enabling SparseTensor.Equals checks.
654002afe <Rok> Partial review feedback implementation.
e89edc620 <Rok> Refactoring to_numpy methods.
3fcc1929e <Rok> Add equality methods.
4a30487fc <Rok> Set base object in to_numpy methods.
4eeae02d8 <Rok> Cython wrapper for SparseTensor.
---
cpp/src/arrow/compare.cc | 3 +-
cpp/src/arrow/python/numpy_convert.cc | 173 ++++++++++++--
cpp/src/arrow/python/numpy_convert.h | 29 +++
cpp/src/arrow/python/pyarrow.cc | 38 +++
cpp/src/arrow/python/pyarrow.h | 14 ++
cpp/src/arrow/python/pyarrow_api.h | 18 ++
cpp/src/arrow/python/pyarrow_lib.h | 4 +
cpp/src/arrow/python/serialize.cc | 2 +-
cpp/src/arrow/sparse_tensor-test.cc | 39 +++
docs/source/python/extending.rst | 42 ++++
python/pyarrow/__init__.pxd | 9 +-
python/pyarrow/__init__.py | 1 +
python/pyarrow/array.pxi | 98 --------
python/pyarrow/includes/libarrow.pxd | 60 +++++
python/pyarrow/lib.pxd | 30 +++
python/pyarrow/lib.pyx | 3 +
python/pyarrow/public-api.pxi | 50 +++-
python/pyarrow/tensor.pxi | 367 +++++++++++++++++++++++++++++
python/pyarrow/tests/test_sparse_tensor.py | 221 +++++++++++++++++
python/pyarrow/tests/test_tensor.py | 46 ++--
20 files changed, 1101 insertions(+), 146 deletions(-)
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 12991b9..4ae5d89 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -1026,9 +1026,8 @@ struct SparseTensorEqualsImpl<SparseIndexType, SparseIndexType> {
     const uint8_t* left_data = left.data()->data();
     const uint8_t* right_data = right.data()->data();
-
     return memcmp(left_data, right_data,
-                  static_cast<size_t>(byte_width * left.non_zero_length()));
+                  static_cast<size_t>(byte_width * left.non_zero_length())) == 0;
   }
 };
diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index f7068b3..515864a 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -25,8 +25,10 @@
#include <vector>
#include "arrow/buffer.h"
+#include "arrow/sparse_tensor.h"
#include "arrow/tensor.h"
#include "arrow/type.h"
+#include "arrow/util/logging.h"
#include "arrow/python/common.h"
#include "arrow/python/pyarrow.h"
@@ -186,7 +188,9 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) {
#undef TO_ARROW_TYPE_CASE
-Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr<Tensor>* out) {
+Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                       const std::vector<std::string>& dim_names,
+                       std::shared_ptr<Tensor>* out) {
if (!PyArray_Check(ao)) {
return Status::TypeError("Did not pass ndarray object");
}
@@ -197,35 +201,29 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr<Tensor>*
int ndim = PyArray_NDIM(ndarray);
- // This is also holding the GIL, so don't already draw it.
std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
std::vector<int64_t> shape(ndim);
std::vector<int64_t> strides(ndim);
- {
- PyAcquireGIL lock;
- npy_intp* array_strides = PyArray_STRIDES(ndarray);
- npy_intp* array_shape = PyArray_SHAPE(ndarray);
- for (int i = 0; i < ndim; ++i) {
- if (array_strides[i] < 0) {
- return Status::Invalid("Negative ndarray strides not supported");
- }
- shape[i] = array_shape[i];
- strides[i] = array_strides[i];
+ npy_intp* array_strides = PyArray_STRIDES(ndarray);
+ npy_intp* array_shape = PyArray_SHAPE(ndarray);
+ for (int i = 0; i < ndim; ++i) {
+ if (array_strides[i] < 0) {
+ return Status::Invalid("Negative ndarray strides not supported");
}
-
- std::shared_ptr<DataType> type;
- RETURN_NOT_OK(
-        GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
- *out = std::make_shared<Tensor>(type, data, shape, strides);
- return Status::OK();
+ shape[i] = array_shape[i];
+ strides[i] = array_strides[i];
}
+
+ std::shared_ptr<DataType> type;
+ RETURN_NOT_OK(
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
+ *out = std::make_shared<Tensor>(type, data, shape, strides, dim_names);
+ return Status::OK();
}
Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
PyObject** out) {
- PyAcquireGIL lock;
-
int type_num;
RETURN_NOT_OK(GetNumPyType(*tensor->type(), &type_num));
PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num);
@@ -274,5 +272,140 @@ Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
return Status::OK();
}
+// Wrap the dense data of a sparse tensor in an ndarray
+static Status SparseTensorDataToNdarray(const SparseTensor& sparse_tensor,
+                                        std::vector<npy_intp> data_shape, PyObject* base,
+                                        PyObject** out_data) {
+ int type_num_data;
+ RETURN_NOT_OK(GetNumPyType(*sparse_tensor.type(), &type_num_data));
+ PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
+ RETURN_IF_PYERROR();
+
+ const void* immutable_data = sparse_tensor.data()->data();
+  // NumPy wants a mutable pointer, so cast the const away
+ void* mutable_data = const_cast<void*>(immutable_data);
+ int array_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS;
+ if (sparse_tensor.is_mutable()) {
+ array_flags |= NPY_ARRAY_WRITEABLE;
+ }
+
+  *out_data = PyArray_NewFromDescr(&PyArray_Type, dtype_data,
+                                   static_cast<int>(data_shape.size()), data_shape.data(),
+                                   nullptr, mutable_data, array_flags, nullptr);
+ RETURN_IF_PYERROR()
+ Py_XINCREF(base);
+ PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(*out_data), base);
+ return Status::OK();
+}
+
+Status SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
+ PyObject* base, PyObject** out_data,
+ PyObject** out_coords) {
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCOOIndex&>(
+      *sparse_tensor->sparse_index());
+
+ // Wrap tensor data
+ OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+
+ // Wrap indices
+ PyObject* result_coords;
+ RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, &result_coords));
+
+ *out_data = result_data.detach();
+ *out_coords = result_coords;
+ return Status::OK();
+}
+
+Status SparseTensorCSRToNdarray(const std::shared_ptr<SparseTensorCSR>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_indptr, PyObject** out_indices) {
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCSRIndex&>(
+      *sparse_tensor->sparse_index());
+
+ // Wrap tensor data
+ OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+
+ // Wrap indices
+ OwnedRef result_indptr;
+ OwnedRef result_indices;
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref()));
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref()));
+
+ *out_data = result_data.detach();
+ *out_indptr = result_indptr.detach();
+ *out_indices = result_indices.detach();
+ return Status::OK();
+}
+
+Status NdarraysToSparseTensorCOO(MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+ const std::vector<int64_t>& shape,
+ const std::vector<std::string>& dim_names,
+ std::shared_ptr<SparseTensorCOO>* out) {
+ if (!PyArray_Check(data_ao) || !PyArray_Check(coords_ao)) {
+ return Status::TypeError("Did not pass ndarray object");
+ }
+
+ PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+ std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+ std::shared_ptr<DataType> type_data;
+
+  RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
+                              &type_data));
+
+ std::shared_ptr<Tensor> coords;
+ RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords));
+  ARROW_CHECK_EQ(coords->type_id(), Type::INT64);  // Should be ensured by caller
+
+  std::shared_ptr<SparseCOOIndex> sparse_index = std::make_shared<SparseCOOIndex>(
+      std::static_pointer_cast<NumericTensor<Int64Type>>(coords));
+  *out = std::make_shared<SparseTensorImpl<SparseCOOIndex>>(sparse_index, type_data, data,
+                                                            shape, dim_names);
+ return Status::OK();
+}
+
+Status NdarraysToSparseTensorCSR(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
+                                 PyObject* indices_ao, const std::vector<int64_t>& shape,
+ const std::vector<std::string>& dim_names,
+ std::shared_ptr<SparseTensorCSR>* out) {
+ if (!PyArray_Check(data_ao) || !PyArray_Check(indptr_ao) ||
+ !PyArray_Check(indices_ao)) {
+ return Status::TypeError("Did not pass ndarray object");
+ }
+
+ PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+ std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+ std::shared_ptr<DataType> type_data;
+
+  RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
+                              &type_data));
+
+ std::shared_ptr<Tensor> indptr, indices;
+ RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr));
+ RETURN_NOT_OK(NdarrayToTensor(pool, indices_ao, {}, &indices));
+  ARROW_CHECK_EQ(indptr->type_id(), Type::INT64);   // Should be ensured by caller
+  ARROW_CHECK_EQ(indices->type_id(), Type::INT64);  // Should be ensured by caller
+
+ auto sparse_index = std::make_shared<SparseCSRIndex>(
+ std::static_pointer_cast<NumericTensor<Int64Type>>(indptr),
+ std::static_pointer_cast<NumericTensor<Int64Type>>(indices));
+  *out = std::make_shared<SparseTensorImpl<SparseCSRIndex>>(sparse_index, type_data, data,
+                                                            shape, dim_names);
+ return Status::OK();
+}
+
+Status TensorToSparseTensorCOO(const std::shared_ptr<Tensor>& tensor,
+ std::shared_ptr<SparseTensorCOO>* out) {
+ *out = std::make_shared<SparseTensorCOO>(*tensor);
+ return Status::OK();
+}
+
+Status TensorToSparseTensorCSR(const std::shared_ptr<Tensor>& tensor,
+ std::shared_ptr<SparseTensorCSR>* out) {
+ *out = std::make_shared<SparseTensorCSR>(*tensor);
+ return Status::OK();
+}
+
} // namespace py
} // namespace arrow
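At the Python level, the `PyArray_SetBaseObject` calls above make the ndarrays
returned by `to_numpy` zero-copy views that keep their owner alive. A minimal
sketch, with illustrative values, mirroring the base-object tests further down:

```python
import numpy as np
import pyarrow as pa

dense = np.array([[4, 0],
                  [0, 5]])
sparse = pa.SparseTensorCOO.from_dense_numpy(dense)

data, coords = sparse.to_numpy()  # views whose base object is `sparse`
sparse = None                     # the views keep the underlying buffers alive

assert np.array_equal(data, np.array([[4], [5]]))
assert np.array_equal(coords, np.array([[0, 0], [1, 1]]))
```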
diff --git a/cpp/src/arrow/python/numpy_convert.h b/cpp/src/arrow/python/numpy_convert.h
index dce5fe5..5fa1326 100644
--- a/cpp/src/arrow/python/numpy_convert.h
+++ b/cpp/src/arrow/python/numpy_convert.h
@@ -25,9 +25,11 @@
#include <memory>
#include <string>
+#include <vector>
#include "arrow/buffer.h"
#include "arrow/python/visibility.h"
+#include "arrow/sparse_tensor.h"
namespace arrow {
@@ -63,11 +65,38 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out);
Status GetNumPyType(const DataType& type, int* type_num);
 ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                                           const std::vector<std::string>& dim_names,
                                            std::shared_ptr<Tensor>* out);
 ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor,
                                            PyObject* base, PyObject** out);
+ARROW_PYTHON_EXPORT Status
+SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
+                         PyObject* base, PyObject** out_data, PyObject** out_coords);
+
+ARROW_PYTHON_EXPORT Status SparseTensorCSRToNdarray(
+ const std::shared_ptr<SparseTensorCSR>& sparse_tensor, PyObject* base,
+ PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCOO(
+ MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+ std::shared_ptr<SparseTensorCOO>* out);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCSR(
+    MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+ std::shared_ptr<SparseTensorCSR>* out);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseTensorCOO(const std::shared_ptr<Tensor>& tensor,
+ std::shared_ptr<SparseTensorCOO>* csparse_tensor);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseTensorCSR(const std::shared_ptr<Tensor>& tensor,
+ std::shared_ptr<SparseTensorCSR>* csparse_tensor);
+
} // namespace py
} // namespace arrow
diff --git a/cpp/src/arrow/python/pyarrow.cc b/cpp/src/arrow/python/pyarrow.cc
index 1cedc54..e037318 100644
--- a/cpp/src/arrow/python/pyarrow.cc
+++ b/cpp/src/arrow/python/pyarrow.cc
@@ -123,6 +123,44 @@ PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor) {
return ::pyarrow_wrap_tensor(tensor);
}
+bool is_sparse_tensor_csr(PyObject* sparse_tensor) {
+ return ::pyarrow_is_sparse_tensor_csr(sparse_tensor) != 0;
+}
+
+Status unwrap_sparse_tensor_csr(PyObject* sparse_tensor,
+ std::shared_ptr<SparseTensorCSR>* out) {
+ *out = ::pyarrow_unwrap_sparse_tensor_csr(sparse_tensor);
+ if (*out) {
+ return Status::OK();
+ } else {
+ return Status::Invalid(
+ "Could not unwrap SparseTensorCSR from the passed Python object.");
+ }
+}
+
+PyObject* wrap_sparse_tensor_csr(const std::shared_ptr<SparseTensorCSR>& sparse_tensor) {
+ return ::pyarrow_wrap_sparse_tensor_csr(sparse_tensor);
+}
+
+bool is_sparse_tensor_coo(PyObject* sparse_tensor) {
+ return ::pyarrow_is_sparse_tensor_coo(sparse_tensor) != 0;
+}
+
+Status unwrap_sparse_tensor_coo(PyObject* sparse_tensor,
+ std::shared_ptr<SparseTensorCOO>* out) {
+ *out = ::pyarrow_unwrap_sparse_tensor_coo(sparse_tensor);
+ if (*out) {
+ return Status::OK();
+ } else {
+ return Status::Invalid(
+ "Could not unwrap SparseTensorCOO from the passed Python object.");
+ }
+}
+
+PyObject* wrap_sparse_tensor_coo(const std::shared_ptr<SparseTensorCOO>& sparse_tensor) {
+ return ::pyarrow_wrap_sparse_tensor_coo(sparse_tensor);
+}
+
bool is_column(PyObject* column) { return ::pyarrow_is_column(column) != 0; }
Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out) {
diff --git a/cpp/src/arrow/python/pyarrow.h b/cpp/src/arrow/python/pyarrow.h
index ff5bf8f..b4834f7 100644
--- a/cpp/src/arrow/python/pyarrow.h
+++ b/cpp/src/arrow/python/pyarrow.h
@@ -24,6 +24,8 @@
#include "arrow/python/visibility.h"
+#include "arrow/sparse_tensor.h"
+
namespace arrow {
class Array;
@@ -67,6 +69,18 @@ ARROW_PYTHON_EXPORT bool is_tensor(PyObject* tensor);
 ARROW_PYTHON_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr<Tensor>* out);
 ARROW_PYTHON_EXPORT PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor);
+ARROW_PYTHON_EXPORT bool is_sparse_tensor_coo(PyObject* sparse_tensor);
+ARROW_PYTHON_EXPORT Status
+unwrap_sparse_tensor_coo(PyObject* sparse_tensor, std::shared_ptr<SparseTensorCOO>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_sparse_tensor_coo(
+    const std::shared_ptr<SparseTensorCOO>& sparse_tensor);
+
+ARROW_PYTHON_EXPORT bool is_sparse_tensor_csr(PyObject* sparse_tensor);
+ARROW_PYTHON_EXPORT Status
+unwrap_sparse_tensor_csr(PyObject* sparse_tensor, std::shared_ptr<SparseTensorCSR>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_sparse_tensor_csr(
+    const std::shared_ptr<SparseTensorCSR>& sparse_tensor);
+
ARROW_PYTHON_EXPORT bool is_column(PyObject* column);
 ARROW_PYTHON_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out);
 ARROW_PYTHON_EXPORT PyObject* wrap_column(const std::shared_ptr<Column>& column);
diff --git a/cpp/src/arrow/python/pyarrow_api.h b/cpp/src/arrow/python/pyarrow_api.h
index b76e961..2d8f71c 100644
--- a/cpp/src/arrow/python/pyarrow_api.h
+++ b/cpp/src/arrow/python/pyarrow_api.h
@@ -50,6 +50,10 @@ static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table)(std::shared_ptr
 #define pyarrow_wrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table
 static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor)(std::shared_ptr< arrow::Tensor> const &) = 0;
 #define pyarrow_wrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_csr)(std::shared_ptr< arrow::SparseTensorCSR> const &) = 0;
+#define pyarrow_wrap_sparse_tensor_csr __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_csr
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_coo)(std::shared_ptr< arrow::SparseTensorCOO> const &) = 0;
+#define pyarrow_wrap_sparse_tensor_coo __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_coo
 static std::shared_ptr< arrow::Array> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array)(PyObject *) = 0;
 #define pyarrow_unwrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array
 static std::shared_ptr< arrow::RecordBatch> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch)(PyObject *) = 0;
@@ -68,6 +72,10 @@ static std::shared_ptr< arrow::Table> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwra
 #define pyarrow_unwrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table
 static std::shared_ptr< arrow::Tensor> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor)(PyObject *) = 0;
 #define pyarrow_unwrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor
+static std::shared_ptr< arrow::SparseTensorCSR> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_tensor_csr __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr
+static std::shared_ptr< arrow::SparseTensorCOO> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_tensor_coo __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status)(arrow::Status const &) = 0;
 #define pyarrow_internal_check_status __pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer)(PyObject *) = 0;
@@ -84,6 +92,10 @@ static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar)(std::shared_pt
 #define pyarrow_wrap_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor)(PyObject *) = 0;
 #define pyarrow_is_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_csr)(PyObject *) = 0;
+#define pyarrow_is_sparse_tensor_csr __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_csr
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_coo)(PyObject *) = 0;
+#define pyarrow_is_sparse_tensor_coo __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_coo
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_column)(PyObject *) = 0;
 #define pyarrow_is_column __pyx_api_f_7pyarrow_3lib_pyarrow_is_column
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_table)(PyObject *) = 0;
@@ -167,6 +179,8 @@ static int import_pyarrow__lib(void) {
   if (__Pyx_ImportFunction(module, "pyarrow_wrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema, "PyObject *(std::shared_ptr< arrow::Schema> const &)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_wrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table, "PyObject *(std::shared_ptr< arrow::Table> const &)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_wrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor, "PyObject *(std::shared_ptr< arrow::Tensor> const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_wrap_sparse_tensor_csr", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_csr, "PyObject *(std::shared_ptr< arrow::SparseTensorCSR> const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_wrap_sparse_tensor_coo", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_coo, "PyObject *(std::shared_ptr< arrow::SparseTensorCOO> const &)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_unwrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array, "std::shared_ptr< arrow::Array> (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_unwrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch, "std::shared_ptr< arrow::RecordBatch> (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_unwrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer, "std::shared_ptr< arrow::Buffer> (PyObject *)") < 0) goto bad;
@@ -176,6 +190,8 @@ static int import_pyarrow__lib(void) {
   if (__Pyx_ImportFunction(module, "pyarrow_unwrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema, "std::shared_ptr< arrow::Schema> (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_unwrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table, "std::shared_ptr< arrow::Table> (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_unwrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor, "std::shared_ptr< arrow::Tensor> (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_unwrap_sparse_tensor_csr", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr, "std::shared_ptr< arrow::SparseTensorCSR> (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_unwrap_sparse_tensor_coo", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo, "std::shared_ptr< arrow::SparseTensorCOO> (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_internal_check_status", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status, "int (arrow::Status const &)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_is_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer, "int (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_is_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type, "int (PyObject *)") < 0) goto bad;
@@ -184,6 +200,8 @@ static int import_pyarrow__lib(void) {
   if (__Pyx_ImportFunction(module, "pyarrow_is_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_array, "int (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_wrap_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar, "PyObject *(std::shared_ptr< arrow::Scalar> const &)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_is_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_is_sparse_tensor_csr", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_csr, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_is_sparse_tensor_coo", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_coo, "int (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_is_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_column, "int (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_is_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_table, "int (PyObject *)") < 0) goto bad;
  if (__Pyx_ImportFunction(module, "pyarrow_is_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch, "int (PyObject *)") < 0) goto bad;
diff --git a/cpp/src/arrow/python/pyarrow_lib.h b/cpp/src/arrow/python/pyarrow_lib.h
index 5f5fc4c..a4bc103 100644
--- a/cpp/src/arrow/python/pyarrow_lib.h
+++ b/cpp/src/arrow/python/pyarrow_lib.h
@@ -48,6 +48,8 @@ __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer(std
 __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_schema(std::shared_ptr< arrow::Schema> const &);
 __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_table(std::shared_ptr< arrow::Table> const &);
 __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_tensor(std::shared_ptr< arrow::Tensor> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_coo(std::shared_ptr< arrow::SparseTensorCOO> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_csr(std::shared_ptr< arrow::SparseTensorCSR> const &);
 __PYX_EXTERN_C std::shared_ptr< arrow::Array> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_array(PyObject *);
 __PYX_EXTERN_C std::shared_ptr< arrow::RecordBatch> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_batch(PyObject *);
 __PYX_EXTERN_C std::shared_ptr< arrow::Buffer> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_buffer(PyObject *);
@@ -57,6 +59,8 @@ __PYX_EXTERN_C std::shared_ptr< arrow::Field> __pyx_f_7pyarrow_3lib_pyarrow_unw
 __PYX_EXTERN_C std::shared_ptr< arrow::Schema> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_schema(PyObject *);
 __PYX_EXTERN_C std::shared_ptr< arrow::Table> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_table(PyObject *);
 __PYX_EXTERN_C std::shared_ptr< arrow::Tensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_tensor(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::SparseTensorCOO> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::SparseTensorCSR> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr(PyObject *);
 #endif /* !__PYX_HAVE_API__pyarrow__lib */
diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc
index 8ff0e01..d93e395 100644
--- a/cpp/src/arrow/python/serialize.cc
+++ b/cpp/src/arrow/python/serialize.cc
@@ -515,7 +515,7 @@ Status AppendArray(PyObject* context, PyArrayObject* array, SequenceBuilder* bui
         builder->AppendNdarray(static_cast<int32_t>(blobs_out->ndarrays.size())));
     std::shared_ptr<Tensor> tensor;
     RETURN_NOT_OK(NdarrayToTensor(default_memory_pool(),
-                                  reinterpret_cast<PyObject*>(array), &tensor));
+                                  reinterpret_cast<PyObject*>(array), {}, &tensor));
     blobs_out->ndarrays.push_back(tensor);
   } break;
   default: {
diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc
index daff019..69ec4ca 100644
--- a/cpp/src/arrow/sparse_tensor-test.cc
+++ b/cpp/src/arrow/sparse_tensor-test.cc
@@ -182,6 +182,25 @@ TEST(TestSparseCOOTensor, CreationFromNonContiguousTensor) {
   AssertCOOIndex(sidx, 11, {1, 2, 3});
 }
+TEST(TestSparseCOOTensor, TensorEquality) {
+ std::vector<int64_t> shape = {2, 3, 4};
+ std::vector<int64_t> values1 = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0,
+ 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+ std::vector<int64_t> values2 = {0, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0,
+ 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+ std::shared_ptr<Buffer> buffer1 = Buffer::Wrap(values1);
+ std::shared_ptr<Buffer> buffer2 = Buffer::Wrap(values2);
+ NumericTensor<Int64Type> tensor1(buffer1, shape);
+ NumericTensor<Int64Type> tensor2(buffer1, shape);
+ NumericTensor<Int64Type> tensor3(buffer2, shape);
+ SparseTensorImpl<SparseCOOIndex> st1(tensor1);
+ SparseTensorImpl<SparseCOOIndex> st2(tensor2);
+ SparseTensorImpl<SparseCOOIndex> st3(tensor3);
+
+ ASSERT_TRUE(st1.Equals(st2));
+ ASSERT_TRUE(!st1.Equals(st3));
+}
+
TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) {
std::vector<int64_t> shape = {6, 4};
std::vector<int64_t> values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0,
@@ -269,4 +288,24 @@ TEST(TestSparseCSRMatrix, CreationFromNonContiguousTensor) {
ASSERT_EQ(std::vector<int64_t>({0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}),
indices_values);
}
+TEST(TestSparseCSRMatrix, TensorEquality) {
+ std::vector<int64_t> shape = {6, 4};
+ std::vector<int64_t> values1 = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0,
+ 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+ std::vector<int64_t> values2 = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+ std::shared_ptr<Buffer> buffer1 = Buffer::Wrap(values1);
+ std::shared_ptr<Buffer> buffer2 = Buffer::Wrap(values2);
+ NumericTensor<Int64Type> tensor1(buffer1, shape);
+ NumericTensor<Int64Type> tensor2(buffer1, shape);
+ NumericTensor<Int64Type> tensor3(buffer2, shape);
+ SparseTensorImpl<SparseCSRIndex> st1(tensor1);
+ SparseTensorImpl<SparseCSRIndex> st2(tensor2);
+ SparseTensorImpl<SparseCSRIndex> st3(tensor3);
+
+ ASSERT_TRUE(st1.Equals(st2));
+ ASSERT_TRUE(!st1.Equals(st3));
+}
+
} // namespace arrow
diff --git a/docs/source/python/extending.rst b/docs/source/python/extending.rst
index 6b5c9ce..f15b1be 100644
--- a/docs/source/python/extending.rst
+++ b/docs/source/python/extending.rst
@@ -116,6 +116,16 @@ C++ objects.
Return whether *obj* wraps an Arrow C++ :class:`Tensor` pointer;
in other words, whether *obj* is a :py:class:`pyarrow.Tensor` instance.
+.. function:: bool is_sparse_tensor_coo(PyObject* obj)
+
+   Return whether *obj* wraps an Arrow C++ :class:`SparseTensorCOO` pointer;
+   in other words, whether *obj* is a :py:class:`pyarrow.SparseTensorCOO` instance.
+
+.. function:: bool is_sparse_tensor_csr(PyObject* obj)
+
+   Return whether *obj* wraps an Arrow C++ :class:`SparseTensorCSR` pointer;
+   in other words, whether *obj* is a :py:class:`pyarrow.SparseTensorCSR` instance.
+
The following functions expect a pyarrow object, unwrap the underlying
Arrow C++ API pointer, and put it in the *out* parameter. The returned
:class:`Status` object must be inspected first to know whether any error
@@ -157,6 +167,14 @@ occurred. If successful, *out* is guaranteed to be non-NULL.
Unwrap the Arrow C++ :class:`Tensor` pointer from *obj* and put it in *out*.
+.. function:: Status unwrap_sparse_tensor_coo(PyObject* obj, std::shared_ptr<SparseTensorCOO>* out)
+
+   Unwrap the Arrow C++ :class:`SparseTensorCOO` pointer from *obj* and put it in *out*.
+
+.. function:: Status unwrap_sparse_tensor_csr(PyObject* obj, std::shared_ptr<SparseTensorCSR>* out)
+
+   Unwrap the Arrow C++ :class:`SparseTensorCSR` pointer from *obj* and put it in *out*.
+
The following functions take an Arrow C++ API pointer and wrap it in a
pyarray object of the corresponding type. A new reference is returned.
On error, NULL is returned and a Python exception is set.
@@ -197,6 +215,14 @@ On error, NULL is returned and a Python exception is set.
Wrap the Arrow C++ *tensor* in a :py:class:`pyarrow.Tensor` instance.
+.. function:: PyObject* wrap_sparse_tensor_coo(const std::shared_ptr<SparseTensorCOO>& sparse_tensor)
+
+   Wrap the Arrow C++ *COO sparse tensor* in a :py:class:`pyarrow.SparseTensorCOO` instance.
+
+.. function:: PyObject* wrap_sparse_tensor_csr(const std::shared_ptr<SparseTensorCSR>& sparse_tensor)
+
+   Wrap the Arrow C++ *CSR sparse tensor* in a :py:class:`pyarrow.SparseTensorCSR` instance.
+
Cython API
----------
@@ -257,6 +283,14 @@ an exception) if the input is not of the right type.
Unwrap the Arrow C++ :cpp:class:`Tensor` pointer from *obj*.
+.. function:: pyarrow_unwrap_sparse_tensor_coo(obj) -> shared_ptr[CSparseTensorCOO]
+
+   Unwrap the Arrow C++ :cpp:class:`SparseTensorCOO` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_sparse_tensor_csr(obj) -> shared_ptr[CSparseTensorCSR]
+
+   Unwrap the Arrow C++ :cpp:class:`SparseTensorCSR` pointer from *obj*.
+
The following functions take a Arrow C++ API pointer and wrap it in a
pyarray object of the corresponding type. An exception is raised on error.
@@ -300,6 +334,14 @@ pyarray object of the corresponding type. An exception is raised on error.
Wrap the Arrow C++ *tensor* in a Python :class:`pyarrow.Tensor` instance.
+.. function:: pyarrow_wrap_sparse_tensor_coo(sp_array: const shared_ptr[CSparseTensorCOO]& sparse_tensor) -> object
+
+   Wrap the Arrow C++ *COO sparse tensor* in a Python :class:`pyarrow.SparseTensorCOO` instance.
+
+.. function:: pyarrow_wrap_sparse_tensor_csr(sp_array: const shared_ptr[CSparseTensorCSR]& sparse_tensor) -> object
+
+   Wrap the Arrow C++ *CSR sparse tensor* in a Python :class:`pyarrow.SparseTensorCSR` instance.
+
Example
~~~~~~~
diff --git a/python/pyarrow/__init__.pxd b/python/pyarrow/__init__.pxd
index 95cea5c..4328805 100644
--- a/python/pyarrow/__init__.pxd
+++ b/python/pyarrow/__init__.pxd
@@ -20,8 +20,9 @@ from __future__ import absolute_import
from libcpp.memory cimport shared_ptr
from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CDataType,
CField, CRecordBatch, CSchema,
- CTable, CTensor)
-
+ CTable, CTensor,
+ CSparseTensorCSR, CSparseTensorCOO)
+from pyarrow.compat import frombytes
cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py":
cdef int import_pyarrow() except -1
@@ -31,6 +32,10 @@ cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py":
cdef object wrap_schema(const shared_ptr[CSchema]& schema)
cdef object wrap_array(const shared_ptr[CArray]& sp_array)
cdef object wrap_tensor(const shared_ptr[CTensor]& sp_tensor)
+ cdef object wrap_sparse_tensor_coo(
+ const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor)
+ cdef object wrap_sparse_tensor_csr(
+ const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor)
cdef object wrap_column(const shared_ptr[CColumn]& ccolumn)
cdef object wrap_table(const shared_ptr[CTable]& ctable)
cdef object wrap_batch(const shared_ptr[CRecordBatch]& cbatch)
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 487065c..bbbd91a 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -66,6 +66,7 @@ from pyarrow.lib import (null, bool_,
schema,
Array, Tensor,
array, chunked_array, column, table,
+ SparseTensorCSR, SparseTensorCOO,
infer_type, from_numpy_dtype,
NullArray,
NumericArray, IntegerArray, FloatingPointArray,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 5ae178d..15905a1 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -870,104 +870,6 @@ cdef class Array(_PandasConvertible):
return res
-cdef class Tensor:
- """
- A n-dimensional array a.k.a Tensor.
- """
-
- def __init__(self):
- raise TypeError("Do not call Tensor's constructor directly, use one "
- "of the `pyarrow.Tensor.from_*` functions instead.")
-
- cdef void init(self, const shared_ptr[CTensor]& sp_tensor):
- self.sp_tensor = sp_tensor
- self.tp = sp_tensor.get()
- self.type = pyarrow_wrap_data_type(self.tp.type())
-
- def __repr__(self):
- return """<pyarrow.Tensor>
-type: {0.type}
-shape: {0.shape}
-strides: {0.strides}""".format(self)
-
- @staticmethod
- def from_numpy(obj):
- cdef shared_ptr[CTensor] ctensor
- with nogil:
- check_status(NdarrayToTensor(c_default_memory_pool(), obj,
- &ctensor))
- return pyarrow_wrap_tensor(ctensor)
-
- def to_numpy(self):
- """
- Convert arrow::Tensor to numpy.ndarray with zero copy
- """
- cdef PyObject* out
-
- with nogil:
- check_status(TensorToNdarray(self.sp_tensor, self, &out))
- return PyObject_to_object(out)
-
- def equals(self, Tensor other):
- """
- Return true if the tensors contains exactly equal data
- """
- return self.tp.Equals(deref(other.tp))
-
- def __eq__(self, other):
- if isinstance(other, Tensor):
- return self.equals(other)
- else:
- return NotImplemented
-
- @property
- def is_mutable(self):
- return self.tp.is_mutable()
-
- @property
- def is_contiguous(self):
- return self.tp.is_contiguous()
-
- @property
- def ndim(self):
- return self.tp.ndim()
-
- @property
- def size(self):
- return self.tp.size()
-
- @property
- def shape(self):
- # Cython knows how to convert a vector[T] to a Python list
- return tuple(self.tp.shape())
-
- @property
- def strides(self):
- return tuple(self.tp.strides())
-
- def __getbuffer__(self, cp.Py_buffer* buffer, int flags):
- buffer.buf = <char *> self.tp.data().get().data()
- pep3118_format = self.type.pep3118_format
- if pep3118_format is None:
- raise NotImplementedError("type %s not supported for buffer "
- "protocol" % (self.type,))
- buffer.format = pep3118_format
- buffer.itemsize = self.type.bit_width // 8
- buffer.internal = NULL
- buffer.len = self.tp.size() * buffer.itemsize
- buffer.ndim = self.tp.ndim()
- buffer.obj = self
- if self.tp.is_mutable():
- buffer.readonly = 0
- else:
- buffer.readonly = 1
- # NOTE: This assumes Py_ssize_t == int64_t, and that the shape
- # and strides arrays lifetime is tied to the tensor's
- buffer.shape = <Py_ssize_t *> &self.tp.shape()[0]
- buffer.strides = <Py_ssize_t *> &self.tp.strides()[0]
- buffer.suboffsets = NULL
-
-
cdef wrap_array_output(PyObject* output):
cdef object obj = PyObject_to_object(output)
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8798834..93a7594 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -593,6 +593,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
int64_t size()
int ndim()
+ const vector[c_string]& dim_names()
const c_string& dim_name(int i)
c_bool is_mutable()
@@ -600,6 +601,38 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
Type type_id()
c_bool Equals(const CTensor& other)
+ cdef cppclass CSparseTensorCOO" arrow::SparseTensorCOO":
+ shared_ptr[CDataType] type()
+ shared_ptr[CBuffer] data()
+
+ const vector[int64_t]& shape()
+ int64_t size()
+ int64_t non_zero_length()
+
+ int ndim()
+ const vector[c_string]& dim_names()
+ const c_string& dim_name(int i)
+
+ c_bool is_mutable()
+ Type type_id()
+ c_bool Equals(const CSparseTensorCOO& other)
+
+ cdef cppclass CSparseTensorCSR" arrow::SparseTensorCSR":
+ shared_ptr[CDataType] type()
+ shared_ptr[CBuffer] data()
+
+ const vector[int64_t]& shape()
+ int64_t size()
+ int64_t non_zero_length()
+
+ int ndim()
+ const vector[c_string]& dim_names()
+ const c_string& dim_name(int i)
+
+ c_bool is_mutable()
+ Type type_id()
+ c_bool Equals(const CSparseTensorCSR& other)
+
cdef cppclass CScalar" arrow::Scalar":
shared_ptr[CDataType] type
@@ -1202,11 +1235,38 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
shared_ptr[CChunkedArray]* out)
CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
+ const vector[c_string]& dim_names,
shared_ptr[CTensor]* out)
CStatus TensorToNdarray(const shared_ptr[CTensor]& tensor, object base,
PyObject** out)
+ CStatus SparseTensorCOOToNdarray(
+ const shared_ptr[CSparseTensorCOO]& sparse_tensor, object base,
+ PyObject** out_data, PyObject** out_coords)
+
+ CStatus SparseTensorCSRToNdarray(
+ const shared_ptr[CSparseTensorCSR]& sparse_tensor, object base,
+ PyObject** out_data, PyObject** out_indptr, PyObject** out_indices)
+
+ CStatus NdarraysToSparseTensorCOO(CMemoryPool* pool, object data_ao,
+ object coords_ao,
+ const vector[int64_t]& shape,
+ const vector[c_string]& dim_names,
+ shared_ptr[CSparseTensorCOO]* out)
+
+ CStatus NdarraysToSparseTensorCSR(CMemoryPool* pool, object data_ao,
+ object indptr_ao, object indices_ao,
+ const vector[int64_t]& shape,
+ const vector[c_string]& dim_names,
+ shared_ptr[CSparseTensorCSR]* out)
+
+ CStatus TensorToSparseTensorCOO(shared_ptr[CTensor],
+ shared_ptr[CSparseTensorCOO]* out)
+
+ CStatus TensorToSparseTensorCSR(shared_ptr[CTensor],
+ shared_ptr[CSparseTensorCSR]* out)
+
CStatus ConvertArrayToPandas(const PandasOptions& options,
const shared_ptr[CArray]& arr,
object py_ref, PyObject** out)
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 79ab947..898c70a 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -231,6 +231,28 @@ cdef class Tensor:
cdef void init(self, const shared_ptr[CTensor]& sp_tensor)
+cdef class SparseTensorCSR:
+ cdef:
+ shared_ptr[CSparseTensorCSR] sp_sparse_tensor
+ CSparseTensorCSR* stp
+
+ cdef readonly:
+ DataType type
+
+ cdef void init(self, const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor)
+
+
+cdef class SparseTensorCOO:
+ cdef:
+ shared_ptr[CSparseTensorCOO] sp_sparse_tensor
+ CSparseTensorCOO* stp
+
+ cdef readonly:
+ DataType type
+
+ cdef void init(self, const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor)
+
+
cdef class NullArray(Array):
pass
@@ -452,6 +474,10 @@ cdef public object pyarrow_wrap_resizable_buffer(
cdef public object pyarrow_wrap_schema(const shared_ptr[CSchema]& type)
cdef public object pyarrow_wrap_table(const shared_ptr[CTable]& ctable)
cdef public object pyarrow_wrap_tensor(const shared_ptr[CTensor]& sp_tensor)
+cdef public object pyarrow_wrap_sparse_tensor_coo(
+ const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor)
+cdef public object pyarrow_wrap_sparse_tensor_csr(
+ const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor)
cdef public shared_ptr[CArray] pyarrow_unwrap_array(object array)
cdef public shared_ptr[CRecordBatch] pyarrow_unwrap_batch(object batch)
@@ -462,3 +488,7 @@ cdef public shared_ptr[CField] pyarrow_unwrap_field(object field)
cdef public shared_ptr[CSchema] pyarrow_unwrap_schema(object schema)
cdef public shared_ptr[CTable] pyarrow_unwrap_table(object table)
cdef public shared_ptr[CTensor] pyarrow_unwrap_tensor(object tensor)
+cdef public shared_ptr[CSparseTensorCOO] pyarrow_unwrap_sparse_tensor_coo(
+ object sparse_tensor)
+cdef public shared_ptr[CSparseTensorCSR] pyarrow_unwrap_sparse_tensor_csr(
+ object sparse_tensor)
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 783e2b2..2da5a83 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -121,6 +121,9 @@ include "builder.pxi"
# Column, Table, Record Batch
include "table.pxi"
+# Tensors
+include "tensor.pxi"
+
# File IO
include "io.pxi"
include "io-hdfs.pxi"
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 33bc803..05c0774 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -18,7 +18,8 @@
from libcpp.memory cimport shared_ptr
from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CField,
CRecordBatch, CSchema,
- CTable, CTensor)
+ CTable, CTensor,
+ CSparseTensorCSR, CSparseTensorCOO)
# You cannot assign something to a dereferenced pointer in Cython thus these
# methods don't use Status to indicate a successful operation.
@@ -225,6 +226,7 @@ cdef api object pyarrow_wrap_scalar(const shared_ptr[CScalar]& sp_scalar):
scalar.init(sp_scalar)
return scalar
+
cdef api bint pyarrow_is_tensor(object tensor):
return isinstance(tensor, Tensor)
@@ -248,6 +250,52 @@ cdef api object pyarrow_wrap_tensor(
return tensor
+cdef api bint pyarrow_is_sparse_tensor_coo(object sparse_tensor):
+ return isinstance(sparse_tensor, SparseTensorCOO)
+
+cdef api shared_ptr[CSparseTensorCOO] pyarrow_unwrap_sparse_tensor_coo(
+ object sparse_tensor):
+ cdef SparseTensorCOO sten
+ if pyarrow_is_sparse_tensor_coo(sparse_tensor):
+ sten = <SparseTensorCOO>(sparse_tensor)
+ return sten.sp_sparse_tensor
+
+ return shared_ptr[CSparseTensorCOO]()
+
+cdef api object pyarrow_wrap_sparse_tensor_coo(
+ const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor):
+ if sp_sparse_tensor.get() == NULL:
+ raise ValueError('SparseTensorCOO was NULL')
+
+ cdef SparseTensorCOO sparse_tensor = SparseTensorCOO.__new__(
+ SparseTensorCOO)
+ sparse_tensor.init(sp_sparse_tensor)
+ return sparse_tensor
+
+
+cdef api bint pyarrow_is_sparse_tensor_csr(object sparse_tensor):
+ return isinstance(sparse_tensor, SparseTensorCSR)
+
+cdef api shared_ptr[CSparseTensorCSR] pyarrow_unwrap_sparse_tensor_csr(
+ object sparse_tensor):
+ cdef SparseTensorCSR sten
+ if pyarrow_is_sparse_tensor_csr(sparse_tensor):
+ sten = <SparseTensorCSR>(sparse_tensor)
+ return sten.sp_sparse_tensor
+
+ return shared_ptr[CSparseTensorCSR]()
+
+cdef api object pyarrow_wrap_sparse_tensor_csr(
+ const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor):
+ if sp_sparse_tensor.get() == NULL:
+ raise ValueError('SparseTensorCSR was NULL')
+
+ cdef SparseTensorCSR sparse_tensor = SparseTensorCSR.__new__(
+ SparseTensorCSR)
+ sparse_tensor.init(sp_sparse_tensor)
+ return sparse_tensor
+
+
cdef api bint pyarrow_is_column(object column):
return isinstance(column, Column)
diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi
new file mode 100644
index 0000000..17554e6
--- /dev/null
+++ b/python/pyarrow/tensor.pxi
@@ -0,0 +1,367 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+cdef class Tensor:
+ """
+    An n-dimensional array, a.k.a. Tensor.
+ """
+
+ def __init__(self):
+ raise TypeError("Do not call Tensor's constructor directly, use one "
+ "of the `pyarrow.Tensor.from_*` functions instead.")
+
+ cdef void init(self, const shared_ptr[CTensor]& sp_tensor):
+ self.sp_tensor = sp_tensor
+ self.tp = sp_tensor.get()
+ self.type = pyarrow_wrap_data_type(self.tp.type())
+
+ def __repr__(self):
+ return """<pyarrow.Tensor>
+type: {0.type}
+shape: {0.shape}
+strides: {0.strides}""".format(self)
+
+ @staticmethod
+ def from_numpy(obj, dim_names=None):
+ cdef:
+ vector[c_string] c_dim_names
+ shared_ptr[CTensor] ctensor
+
+ if dim_names is not None:
+ for x in dim_names:
+ c_dim_names.push_back(tobytes(x))
+
+ check_status(NdarrayToTensor(c_default_memory_pool(), obj,
+ c_dim_names, &ctensor))
+ return pyarrow_wrap_tensor(ctensor)
+
+ def to_numpy(self):
+ """
+ Convert arrow::Tensor to numpy.ndarray with zero copy
+ """
+ cdef PyObject* out
+
+ check_status(TensorToNdarray(self.sp_tensor, self, &out))
+ return PyObject_to_object(out)
+
+ def equals(self, Tensor other):
+ """
+        Return true if the tensors contain exactly equal data
+ """
+ return self.tp.Equals(deref(other.tp))
+
+ def __eq__(self, other):
+ if isinstance(other, Tensor):
+ return self.equals(other)
+ else:
+ return NotImplemented
+
+ def dim_name(self, i):
+ return frombytes(self.tp.dim_name(i))
+
+ @property
+ def dim_names(self):
+ return [frombytes(x) for x in tuple(self.tp.dim_names())]
+
+ @property
+ def is_mutable(self):
+ return self.tp.is_mutable()
+
+ @property
+ def is_contiguous(self):
+ return self.tp.is_contiguous()
+
+ @property
+ def ndim(self):
+ return self.tp.ndim()
+
+ @property
+ def size(self):
+ return self.tp.size()
+
+ @property
+ def shape(self):
+ # Cython knows how to convert a vector[T] to a Python list
+ return tuple(self.tp.shape())
+
+ @property
+ def strides(self):
+ return tuple(self.tp.strides())
+
+ def __getbuffer__(self, cp.Py_buffer* buffer, int flags):
+ buffer.buf = <char *> self.tp.data().get().data()
+ pep3118_format = self.type.pep3118_format
+ if pep3118_format is None:
+ raise NotImplementedError("type %s not supported for buffer "
+ "protocol" % (self.type,))
+ buffer.format = pep3118_format
+ buffer.itemsize = self.type.bit_width // 8
+ buffer.internal = NULL
+ buffer.len = self.tp.size() * buffer.itemsize
+ buffer.ndim = self.tp.ndim()
+ buffer.obj = self
+ if self.tp.is_mutable():
+ buffer.readonly = 0
+ else:
+ buffer.readonly = 1
+ # NOTE: This assumes Py_ssize_t == int64_t, and that the shape
+ # and strides arrays lifetime is tied to the tensor's
+ buffer.shape = <Py_ssize_t *> &self.tp.shape()[0]
+ buffer.strides = <Py_ssize_t *> &self.tp.strides()[0]
+ buffer.suboffsets = NULL
+
+
+cdef class SparseTensorCOO:
+ """
+ A sparse COO tensor.
+ """
+
+ def __init__(self):
+ raise TypeError("Do not call SparseTensorCOO's constructor directly, "
+ "use one of the `pyarrow.SparseTensorCOO.from_*` "
+ "functions instead.")
+
+ cdef void init(self, const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor):
+ self.sp_sparse_tensor = sp_sparse_tensor
+ self.stp = sp_sparse_tensor.get()
+ self.type = pyarrow_wrap_data_type(self.stp.type())
+
+ def __repr__(self):
+ return """<pyarrow.SparseTensorCOO>
+type: {0.type}
+shape: {0.shape}""".format(self)
+
+ @classmethod
+ def from_dense_numpy(cls, obj, dim_names=None):
+ """
+ Convert numpy.ndarray to arrow::SparseTensorCOO
+ """
+ return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names))
+
+ @staticmethod
+ def from_numpy(data, coords, shape, dim_names=None):
+ """
+ Create arrow::SparseTensorCOO from numpy.ndarrays
+ """
+ cdef shared_ptr[CSparseTensorCOO] csparse_tensor
+ cdef vector[int64_t] c_shape
+ cdef vector[c_string] c_dim_names
+
+ for x in shape:
+ c_shape.push_back(x)
+ if dim_names is not None:
+ for x in dim_names:
+ c_dim_names.push_back(tobytes(x))
+
+ # Enforce precondition for SparseTensorCOO indices
+ coords = np.require(coords, dtype='i8', requirements='F')
+ if coords.ndim != 2:
+ raise ValueError("Expected 2-dimensional array for "
+ "SparseTensorCOO indices")
+
+ check_status(NdarraysToSparseTensorCOO(c_default_memory_pool(),
+ data, coords, c_shape, c_dim_names, &csparse_tensor))
+ return pyarrow_wrap_sparse_tensor_coo(csparse_tensor)
+
+ @staticmethod
+ def from_tensor(obj):
+ """
+ Convert arrow::Tensor to arrow::SparseTensorCOO
+ """
+ cdef shared_ptr[CSparseTensorCOO] csparse_tensor
+ cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj)
+
+ with nogil:
+ check_status(TensorToSparseTensorCOO(ctensor, &csparse_tensor))
+
+ return pyarrow_wrap_sparse_tensor_coo(csparse_tensor)
+
+ def to_numpy(self):
+ """
+ Convert arrow::SparseTensorCOO to numpy.ndarrays with zero copy
+ """
+ cdef PyObject* out_data
+ cdef PyObject* out_coords
+
+ check_status(SparseTensorCOOToNdarray(self.sp_sparse_tensor, self,
+ &out_data, &out_coords))
+ return PyObject_to_object(out_data), PyObject_to_object(out_coords)
+
+ def equals(self, SparseTensorCOO other):
+ """
+        Return true if the sparse tensors contain exactly equal data
+ """
+ return self.stp.Equals(deref(other.stp))
+
+ def __eq__(self, other):
+ if isinstance(other, SparseTensorCOO):
+ return self.equals(other)
+ else:
+ return NotImplemented
+
+ @property
+ def is_mutable(self):
+ return self.stp.is_mutable()
+
+ @property
+ def ndim(self):
+ return self.stp.ndim()
+
+ @property
+ def shape(self):
+ # Cython knows how to convert a vector[T] to a Python list
+ return tuple(self.stp.shape())
+
+ @property
+ def size(self):
+ return self.stp.size()
+
+ def dim_name(self, i):
+ return frombytes(self.stp.dim_name(i))
+
+ @property
+ def dim_names(self):
+ return [frombytes(x) for x in tuple(self.stp.dim_names())]
+
+ @property
+ def non_zero_length(self):
+ return self.stp.non_zero_length()
+
+
+cdef class SparseTensorCSR:
+ """
+ A sparse CSR tensor.
+ """
+
+ def __init__(self):
+ raise TypeError("Do not call SparseTensorCSR's constructor directly, "
+ "use one of the `pyarrow.SparseTensorCSR.from_*` "
+ "functions instead.")
+
+ cdef void init(self, const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor):
+ self.sp_sparse_tensor = sp_sparse_tensor
+ self.stp = sp_sparse_tensor.get()
+ self.type = pyarrow_wrap_data_type(self.stp.type())
+
+ def __repr__(self):
+ return """<pyarrow.SparseTensorCSR>
+type: {0.type}
+shape: {0.shape}""".format(self)
+
+ @classmethod
+ def from_dense_numpy(cls, obj, dim_names=None):
+ """
+ Convert numpy.ndarray to arrow::SparseTensorCSR
+ """
+ return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names))
+
+ @staticmethod
+ def from_numpy(data, indptr, indices, shape, dim_names=None):
+ """
+ Create arrow::SparseTensorCSR from numpy.ndarrays
+ """
+ cdef shared_ptr[CSparseTensorCSR] csparse_tensor
+ cdef vector[int64_t] c_shape
+ cdef vector[c_string] c_dim_names
+
+ for x in shape:
+ c_shape.push_back(x)
+ if dim_names is not None:
+ for x in dim_names:
+ c_dim_names.push_back(tobytes(x))
+
+ # Enforce precondition for SparseTensorCSR indices
+ indptr = np.require(indptr, dtype='i8')
+ indices = np.require(indices, dtype='i8')
+ if indptr.ndim != 1:
+ raise ValueError("Expected 1-dimensional array for "
+ "SparseTensorCSR indptr")
+ if indices.ndim != 1:
+ raise ValueError("Expected 1-dimensional array for "
+ "SparseTensorCSR indices")
+
+ check_status(NdarraysToSparseTensorCSR(c_default_memory_pool(),
+ data, indptr, indices, c_shape, c_dim_names,
+ &csparse_tensor))
+ return pyarrow_wrap_sparse_tensor_csr(csparse_tensor)
+
+ @staticmethod
+ def from_tensor(obj):
+ """
+ Convert arrow::Tensor to arrow::SparseTensorCSR
+ """
+ cdef shared_ptr[CSparseTensorCSR] csparse_tensor
+ cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj)
+
+ with nogil:
+ check_status(TensorToSparseTensorCSR(ctensor, &csparse_tensor))
+
+ return pyarrow_wrap_sparse_tensor_csr(csparse_tensor)
+
+ def to_numpy(self):
+ """
+ Convert arrow::SparseTensorCSR to numpy.ndarrays with zero copy
+ """
+ cdef PyObject* out_data
+ cdef PyObject* out_indptr
+ cdef PyObject* out_indices
+
+ check_status(SparseTensorCSRToNdarray(self.sp_sparse_tensor, self,
+ &out_data, &out_indptr, &out_indices))
+ return (PyObject_to_object(out_data), PyObject_to_object(out_indptr),
+ PyObject_to_object(out_indices))
+
+ def equals(self, SparseTensorCSR other):
+ """
+        Return true if the sparse tensors contain exactly equal data
+ """
+ return self.stp.Equals(deref(other.stp))
+
+ def __eq__(self, other):
+ if isinstance(other, SparseTensorCSR):
+ return self.equals(other)
+ else:
+ return NotImplemented
+
+ @property
+ def is_mutable(self):
+ return self.stp.is_mutable()
+
+ @property
+ def ndim(self):
+ return self.stp.ndim()
+
+ @property
+ def shape(self):
+ # Cython knows how to convert a vector[T] to a Python list
+ return tuple(self.stp.shape())
+
+ @property
+ def size(self):
+ return self.stp.size()
+
+ def dim_name(self, i):
+ return frombytes(self.stp.dim_name(i))
+
+ @property
+ def dim_names(self):
+ return [frombytes(x) for x in tuple(self.stp.dim_names())]
+
+ @property
+ def non_zero_length(self):
+ return self.stp.non_zero_length()
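A note on the construction path above: `from_numpy` runs the index arrays through
`np.require(..., dtype='i8')` before the C++ side (which checks for `Type::INT64`
indices) sees them, so callers may pass any integer dtype. A minimal sketch with
illustrative values:

```python
import numpy as np
import pyarrow as pa

data = np.array([[4], [9], [7], [5]], dtype='f8')
coords = np.array([[0, 0], [0, 2], [1, 1], [3, 3]], dtype='i4')  # any int dtype

sparse = pa.SparseTensorCOO.from_numpy(data, coords, shape=(4, 4),
                                       dim_names=['row', 'col'])
result_data, result_coords = sparse.to_numpy()

assert result_coords.dtype == np.int64   # coerced by np.require above
assert result_coords.flags.f_contiguous  # stored column-major, like the COO index
assert sparse.dim_names == ['row', 'col']
```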
diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py
new file mode 100644
index 0000000..68564da
--- /dev/null
+++ b/python/pyarrow/tests/test_sparse_tensor.py
@@ -0,0 +1,221 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+import sys
+
+import numpy as np
+import pyarrow as pa
+
+
+tensor_type_pairs = [
+ ('i1', pa.int8()),
+ ('i2', pa.int16()),
+ ('i4', pa.int32()),
+ ('i8', pa.int64()),
+ ('u1', pa.uint8()),
+ ('u2', pa.uint16()),
+ ('u4', pa.uint32()),
+ ('u8', pa.uint64()),
+ ('f2', pa.float16()),
+ ('f4', pa.float32()),
+ ('f8', pa.float64())
+]
+
+
+@pytest.mark.parametrize('sparse_tensor_type', [
+ pa.SparseTensorCSR,
+ pa.SparseTensorCOO,
+])
+def test_sparse_tensor_attrs(sparse_tensor_type):
+ data = np.array([
+ [0, 1, 0, 0, 1],
+ [0, 0, 0, 0, 0],
+ [0, 0, 0, 1, 0],
+ [0, 0, 0, 0, 0],
+ [0, 3, 0, 0, 0],
+ ])
+ dim_names = ['x', 'y']
+ sparse_tensor = sparse_tensor_type.from_dense_numpy(data, dim_names)
+
+ assert sparse_tensor.ndim == 2
+ assert sparse_tensor.size == 25
+ assert sparse_tensor.shape == data.shape
+ assert sparse_tensor.is_mutable
+ assert sparse_tensor.dim_name(0) == dim_names[0]
+ assert sparse_tensor.dim_names == dim_names
+ assert sparse_tensor.non_zero_length == 4
+
+
+def test_sparse_tensor_coo_base_object():
+ data = np.array([[4], [9], [7], [5]])
+ coords = np.array([[0, 0], [0, 2], [1, 1], [3, 3]])
+ array = np.array([[4, 0, 9, 0],
+ [0, 7, 0, 0],
+ [0, 0, 0, 0],
+ [0, 0, 0, 5]])
+ sparse_tensor = pa.SparseTensorCOO.from_dense_numpy(array)
+ n = sys.getrefcount(sparse_tensor)
+ result_data, result_coords = sparse_tensor.to_numpy()
+ assert sys.getrefcount(sparse_tensor) == n + 2
+
+ sparse_tensor = None
+ assert np.array_equal(data, result_data)
+ assert np.array_equal(coords, result_coords)
+ assert result_coords.flags.f_contiguous # column-major
+
+
+def test_sparse_tensor_csr_base_object():
+ data = np.array([[1], [2], [3], [4], [5], [6]])
+ indptr = np.array([0, 2, 3, 6])
+ indices = np.array([0, 2, 2, 0, 1, 2])
+ array = np.array([[1, 0, 2],
+ [0, 0, 3],
+ [4, 5, 6]])
+
+ sparse_tensor = pa.SparseTensorCSR.from_dense_numpy(array)
+ n = sys.getrefcount(sparse_tensor)
+ result_data, result_indptr, result_indices = sparse_tensor.to_numpy()
+ assert sys.getrefcount(sparse_tensor) == n + 3
+
+ sparse_tensor = None
+ assert np.array_equal(data, result_data)
+ assert np.array_equal(indptr, result_indptr)
+ assert np.array_equal(indices, result_indices)
+
+
+@pytest.mark.parametrize('sparse_tensor_type', [
+ pa.SparseTensorCSR,
+ pa.SparseTensorCOO,
+])
+def test_sparse_tensor_equals(sparse_tensor_type):
+ def eq(a, b):
+ assert a.equals(b)
+ assert a == b
+ assert not (a != b)
+
+ def ne(a, b):
+ assert not a.equals(b)
+ assert not (a == b)
+ assert a != b
+
+ data = np.random.randn(10, 6)[::, ::2]
+ sparse_tensor1 = sparse_tensor_type.from_dense_numpy(data)
+ sparse_tensor2 = sparse_tensor_type.from_dense_numpy(
+ np.ascontiguousarray(data))
+ eq(sparse_tensor1, sparse_tensor2)
+ data = data.copy()
+ data[9, 0] = 1.0
+ sparse_tensor2 = sparse_tensor_type.from_dense_numpy(
+ np.ascontiguousarray(data))
+ ne(sparse_tensor1, sparse_tensor2)
+
+
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
+def test_sparse_tensor_coo_from_dense(dtype_str, arrow_type):
+ dtype = np.dtype(dtype_str)
+ data = np.array([[4], [9], [7], [5]]).astype(dtype)
+ coords = np.array([[0, 0], [0, 2], [1, 1], [3, 3]])
+ array = np.array([[4, 0, 9, 0],
+ [0, 7, 0, 0],
+ [0, 0, 0, 0],
+ [0, 0, 0, 5]]).astype(dtype)
+ tensor = pa.Tensor.from_numpy(array)
+
+ # Test from numpy array
+ sparse_tensor = pa.SparseTensorCOO.from_dense_numpy(array)
+ repr(sparse_tensor)
+ assert sparse_tensor.type == arrow_type
+ result_data, result_coords = sparse_tensor.to_numpy()
+ assert np.array_equal(data, result_data)
+ assert np.array_equal(coords, result_coords)
+
+ # Test from Tensor
+ sparse_tensor = pa.SparseTensorCOO.from_tensor(tensor)
+ repr(sparse_tensor)
+ assert sparse_tensor.type == arrow_type
+ result_data, result_coords = sparse_tensor.to_numpy()
+ assert np.array_equal(data, result_data)
+ assert np.array_equal(coords, result_coords)
+
+
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
+def test_sparse_tensor_csr_from_dense(dtype_str, arrow_type):
+ dtype = np.dtype(dtype_str)
+ dense_data = np.array([[1, 0, 2],
+ [0, 0, 3],
+ [4, 5, 6]]).astype(dtype)
+
+ data = np.array([[1], [2], [3], [4], [5], [6]])
+ indptr = np.array([0, 2, 3, 6])
+ indices = np.array([0, 2, 2, 0, 1, 2])
+ tensor = pa.Tensor.from_numpy(dense_data)
+
+ # Test from numpy array
+ sparse_tensor = pa.SparseTensorCSR.from_dense_numpy(dense_data)
+ repr(sparse_tensor)
+ result_data, result_indptr, result_indices = sparse_tensor.to_numpy()
+ assert np.array_equal(data, result_data)
+ assert np.array_equal(indptr, result_indptr)
+ assert np.array_equal(indices, result_indices)
+
+ # Test from Tensor
+ sparse_tensor = pa.SparseTensorCSR.from_tensor(tensor)
+ repr(sparse_tensor)
+ assert sparse_tensor.type == arrow_type
+ result_data, result_indptr, result_indices = sparse_tensor.to_numpy()
+ assert np.array_equal(data, result_data)
+ assert np.array_equal(indptr, result_indptr)
+ assert np.array_equal(indices, result_indices)
+
+
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
+def test_sparse_tensor_coo_numpy_roundtrip(dtype_str, arrow_type):
+ dtype = np.dtype(dtype_str)
+ data = np.array([[4], [9], [7], [5]]).astype(dtype)
+ coords = np.array([[0, 0], [3, 3], [1, 1], [0, 2]])
+ shape = (4, 4)
+ dim_names = ["x", "y"]
+
+ sparse_tensor = pa.SparseTensorCOO.from_numpy(data, coords, shape,
+ dim_names)
+ repr(sparse_tensor)
+ assert sparse_tensor.type == arrow_type
+ result_data, result_coords = sparse_tensor.to_numpy()
+ assert np.array_equal(data, result_data)
+ assert np.array_equal(coords, result_coords)
+ assert sparse_tensor.dim_names == dim_names
+
+
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
+def test_sparse_tensor_csr_numpy_roundtrip(dtype_str, arrow_type):
+ dtype = np.dtype(dtype_str)
+ data = np.array([[1], [2], [3], [4], [5], [6]]).astype(dtype)
+ indptr = np.array([0, 2, 3, 6])
+ indices = np.array([0, 2, 2, 0, 1, 2])
+ shape = (3, 3)
+ dim_names = ["x", "y"]
+
+ sparse_tensor = pa.SparseTensorCSR.from_numpy(data, indptr, indices,
+ shape, dim_names)
+ repr(sparse_tensor)
+ assert sparse_tensor.type == arrow_type
+ result_data, result_indptr, result_indices = sparse_tensor.to_numpy()
+ assert np.array_equal(data, result_data)
+ assert np.array_equal(indptr, result_indptr)
+ assert np.array_equal(indices, result_indices)
+ assert sparse_tensor.dim_names == dim_names
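For completeness, a small sketch (values illustrative) of the equality semantics the
tests above rely on: `from_dense_numpy` is implemented as
`from_tensor(Tensor.from_numpy(...))`, so both construction routes compare equal, and
`Equals` ultimately memcmps the data buffers (the `== 0` fix in compare.cc above):

```python
import numpy as np
import pyarrow as pa

dense = np.array([[0, 1],
                  [2, 0]])

a = pa.SparseTensorCOO.from_dense_numpy(dense)
b = pa.SparseTensorCOO.from_tensor(pa.Tensor.from_numpy(dense))

assert a.equals(b)  # compares the sparse index and data buffers
assert a == b       # __eq__ delegates to equals() for matching types
```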
diff --git a/python/pyarrow/tests/test_tensor.py b/python/pyarrow/tests/test_tensor.py
index 188a4a5..13f05d2 100644
--- a/python/pyarrow/tests/test_tensor.py
+++ b/python/pyarrow/tests/test_tensor.py
@@ -23,12 +23,28 @@ import numpy as np
import pyarrow as pa
+tensor_type_pairs = [
+ ('i1', pa.int8()),
+ ('i2', pa.int16()),
+ ('i4', pa.int32()),
+ ('i8', pa.int64()),
+ ('u1', pa.uint8()),
+ ('u2', pa.uint16()),
+ ('u4', pa.uint32()),
+ ('u8', pa.uint64()),
+ ('f2', pa.float16()),
+ ('f4', pa.float32()),
+ ('f8', pa.float64())
+]
+
+
def test_tensor_attrs():
data = np.random.randn(10, 4)
tensor = pa.Tensor.from_numpy(data)
assert tensor.ndim == 2
+ assert tensor.dim_names == []
assert tensor.size == 40
assert tensor.shape == data.shape
assert tensor.strides == data.strides
@@ -42,6 +58,13 @@ def test_tensor_attrs():
tensor = pa.Tensor.from_numpy(data2)
assert not tensor.is_mutable
+ # With dim_names
+ tensor = pa.Tensor.from_numpy(data, dim_names=('x', 'y'))
+ assert tensor.ndim == 2
+ assert tensor.dim_names == ['x', 'y']
+ assert tensor.dim_name(0) == 'x'
+ assert tensor.dim_name(1) == 'y'
+
def test_tensor_base_object():
tensor = pa.Tensor.from_numpy(np.random.randn(10, 4))
@@ -50,19 +73,7 @@ def test_tensor_base_object():
assert sys.getrefcount(tensor) == n + 1
-@pytest.mark.parametrize('dtype_str,arrow_type', [
- ('i1', pa.int8()),
- ('i2', pa.int16()),
- ('i4', pa.int32()),
- ('i8', pa.int64()),
- ('u1', pa.uint8()),
- ('u2', pa.uint16()),
- ('u4', pa.uint32()),
- ('u8', pa.uint64()),
- ('f2', pa.float16()),
- ('f4', pa.float32()),
- ('f8', pa.float64())
-])
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_tensor_numpy_roundtrip(dtype_str, arrow_type):
dtype = np.dtype(dtype_str)
data = (100 * np.random.randn(10, 4)).astype(dtype)
@@ -76,15 +87,6 @@ def test_tensor_numpy_roundtrip(dtype_str, arrow_type):
assert (data == result).all()
-def _try_delete(path):
- import gc
- gc.collect()
- try:
- os.remove(path)
- except os.error:
- pass
-
-
def test_tensor_ipc_roundtrip(tmpdir):
data = np.random.randn(10, 4)
tensor = pa.Tensor.from_numpy(data)