This is an automated email from the ASF dual-hosted git repository. jxie pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push: new ec7cd6e CSRNDArray from/to scipy csr_matrix; fix rand_shape_nd (#7638) ec7cd6e is described below commit ec7cd6eeb5f95f86b9d73250ba1f61616dd42800 Author: Haibin Lin <linhaibin.e...@gmail.com> AuthorDate: Tue Aug 29 22:12:05 2017 -0700 CSRNDArray from/to scipy csr_matrix; fix rand_shape_nd (#7638) * support creation from sp.csr * enhance doc * edit repr for sparse ndarray * update doc for nd.empty * preprocess noncanonical csr * add asscipy to csr * minor changes * return tuple for rand_shape_nd * fix lint * throw exception on setters * remove asscipy * global import scipy in sparse.py * update rand_shape_nd; * add missing line * better err msg. fix scipy import in utils.py * fix lint --- python/mxnet/ndarray/sparse.py | 95 +++++++++++++++++++++++++--- python/mxnet/ndarray/utils.py | 24 ++++--- python/mxnet/test_utils.py | 8 +-- tests/python/unittest/test_io.py | 1 - tests/python/unittest/test_module.py | 7 +- tests/python/unittest/test_sparse_ndarray.py | 40 ++++++++++-- 6 files changed, 142 insertions(+), 33 deletions(-) diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index 806398e..fa2761d 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -51,7 +51,6 @@ from .ndarray import zeros as _zeros_ndarray from .ndarray import array as _array from . import op -# Use different verison of SymbolBase # When possible, use cython to speedup part of computation. # pylint: disable=unused-import try: @@ -67,6 +66,10 @@ except ImportError: from .._ctypes.ndarray import _set_ndarray_class # pylint: enable=unused-import +try: + import scipy.sparse as spsp +except ImportError: + spsp = None _STORAGE_AUX_TYPES = { 'row_sparse': [np.int64], @@ -112,6 +115,13 @@ class BaseSparseNDArray(NDArray): See CSRNDArray and RowSparseNDArray for more details. """ + def __repr__(self): + """Returns a string representation of the sparse array.""" + shape_info = 'x'.join(['%d' % x for x in self.shape]) + # The data content is not displayed since the array usually has big shape + return '\n<%s %s @%s>' % (self.__class__.__name__, + shape_info, self.context) + def __iadd__(self, other): raise NotImplementedError() @@ -417,6 +427,19 @@ class CSRNDArray(BaseSparseNDArray): """ return self._data() + @indices.setter + def indices(self, indices): + raise NotImplementedError() + + @indptr.setter + def indptr(self, indptr): + raise NotImplementedError() + + @data.setter + def data(self, data): + raise NotImplementedError() + + def tostype(self, stype): """Return a copy of the array with chosen storage type. @@ -461,7 +484,6 @@ class CSRNDArray(BaseSparseNDArray): else: raise TypeError('copyto does not support type ' + str(type(other))) - # pylint: disable=abstract-method class RowSparseNDArray(BaseSparseNDArray): """A sparse representation of a set of NDArray row slices at given indices. @@ -630,6 +652,14 @@ class RowSparseNDArray(BaseSparseNDArray): """ return self._data() + @indices.setter + def indices(self, indices): + raise NotImplementedError() + + @data.setter + def data(self, data): + raise NotImplementedError() + def tostype(self, stype): """Return a copy of the array with chosen storage type. @@ -908,16 +938,61 @@ def empty(stype, shape, ctx=None, dtype=None, aux_types=None): def array(source_array, ctx=None, dtype=None, aux_types=None): """Creates a sparse array from any object exposing the array interface. + + Parameters + ---------- + source_array : RowSparseNDArray, CSRNDArray or scipy.sparse.csr.csr_matrix + The source sparse array + ctx : Context, optional + Device context (default is the current default context). + dtype : str or numpy.dtype, optional + The data type of the output array. The default dtype is ``source_array.dtype`` + if `source_array` is an `NDArray`, `float32` otherwise. + aux_types: list of numpy.dtype, optional + An optional list of types of the aux data for RowSparseNDArray or CSRNDArray. + The default value for CSRNDArray is [`int64`, `int64`] for `indptr` and `indices`. + The default value for RowSparseNDArray is [`int64`] for `indices`. + + Returns + ------- + RowSparseNDArray or CSRNDArray + An array with the same contents as the `source_array`. + + Examples + -------- + >>> import scipy.sparse as sp + >>> csr = sp.csr_matrix((2, 100)) + >>> mx.nd.sparse.array(csr) + <CSRNDArray 2x100 @cpu(0)> + >>> mx.nd.sparse.array(mx.nd.zeros((3, 2), stype='csr')) + <CSRNDArray 3x2 @cpu(0)> + >>> mx.nd.sparse.array(mx.nd.zeros((3, 2), stype='row_sparse')) + <RowSparseNDArray 3x2 @cpu(0)> """ if isinstance(source_array, NDArray): - assert(source_array.stype != 'default'),\ - "Please use `cast_storage` to create BaseSparseNDArray from an NDArray" + assert(source_array.stype != 'default'), \ + "Please use `cast_storage` to create RowSparseNDArray or CSRNDArray from an NDArray" dtype = source_array.dtype if dtype is None else dtype aux_types = source_array._aux_types if aux_types is None else aux_types + arr = empty(source_array.stype, source_array.shape, ctx, dtype, aux_types) + arr[:] = source_array + return arr + if spsp is not None and isinstance(source_array, spsp.csr.csr_matrix): + # TODO(haibin) implement `_sync_copy_from` with scipy csr object to reduce a copy + indptr_type = None + indices_type = None + if aux_types is not None: + assert(len(aux_types) == 2), "Expected types for both indices and indptr" + indptr_type = aux_types[0] + indices_type = aux_types[1] + # preprocess scipy csr to canonical form + csr = source_array.sorted_indices() + csr.sum_duplicates() + arr = csr_matrix(csr.data, csr.indptr, csr.indices, csr.shape, dtype=dtype, + indptr_type=indptr_type, indices_type=indices_type) + return arr + elif isinstance(source_array, (np.ndarray, np.generic)): + raise ValueError("Please use mx.nd.array to create an NDArray with source_array of type ", + type(source_array)) else: - # TODO(haibin/anisub) support creation from scipy object when `_sync_copy_from` is ready - raise NotImplementedError('creating BaseSparseNDArray from ' - ' a non-NDArray object is not implemented.') - arr = empty(source_array.stype, source_array.shape, ctx, dtype, aux_types) - arr[:] = source_array - return arr + raise ValueError("Unexpected source_array type: ", type(source_array)) diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py index a0dd836..231f984 100644 --- a/python/mxnet/ndarray/utils.py +++ b/python/mxnet/ndarray/utils.py @@ -28,6 +28,10 @@ from .sparse import zeros as _zeros_sparse_ndarray from .sparse import empty as _empty_sparse_ndarray from .sparse import array as _sparse_array from .sparse import _ndarray_cls +try: + import scipy.sparse as spsp +except ImportError: + spsp = None def zeros(shape, ctx=None, dtype=None, stype=None, aux_types=None, **kwargs): @@ -44,8 +48,9 @@ def zeros(shape, ctx=None, dtype=None, stype=None, aux_types=None, **kwargs): stype: string, optional The storage type of the empty array, such as 'row_sparse', 'csr', etc. aux_types: list of numpy.dtype, optional - An optional list of types of the aux data for RowSparseNDArray or CSRNDArray - (default values depend on the storage type) + An optional list of types of the aux data for RowSparseNDArray or CSRNDArray. + The default value for CSRNDArray is [`int64`, `int64`] for `indptr` and `indices`. + The default value for RowSparseNDArray is [`int64`] for `indices`. Returns ------- @@ -79,8 +84,9 @@ def empty(shape, ctx=None, dtype=None, stype=None, aux_types=None): stype : str, optional An optional storage type (default is `default`). aux_types: list of numpy.dtype, optional - An optional list of types of the aux data for RowSparseNDArray or CSRNDArray - (default values depend on the storage type) + An optional list of types of the aux data for RowSparseNDArray or CSRNDArray. + The default value for CSRNDArray is [`int64`, `int64`] for `indptr` and `indices`. + The default value for RowSparseNDArray is [`int64`] for `indices`. Returns ------- @@ -118,8 +124,9 @@ def array(source_array, ctx=None, dtype=None, aux_types=None): The data type of the output array. The default dtype is ``source_array.dtype`` if `source_array` is an `NDArray`, `float32` otherwise. aux_types: list of numpy.dtype, optional - An optional list of types of the aux data for RowSparseNDArray or CSRNDArray - (default values depend on the storage type) + An optional list of types of the aux data for RowSparseNDArray or CSRNDArray. + The default value for CSRNDArray is [`int64`, `int64`] for `indptr` and `indices`. + The default value for RowSparseNDArray is [`int64`] for `indices`. Returns ------- @@ -140,8 +147,9 @@ def array(source_array, ctx=None, dtype=None, aux_types=None): >>> mx.nd.array(mx.nd.zeros((3, 2), stype='row_sparse')) <RowSparseNDArray 3x2 @cpu(0)> """ - # TODO(haibin/anisub) Check if input is scipy.sparse object with `scipy.sparse.issparse` - if isinstance(source_array, NDArray) and source_array.stype != 'default': + if spsp is not None and isinstance(source_array, spsp.csr.csr_matrix): + return _sparse_array(source_array, ctx=ctx, dtype=dtype, aux_types=aux_types) + elif isinstance(source_array, NDArray) and source_array.stype != 'default': return _sparse_array(source_array, ctx=ctx, dtype=dtype, aux_types=aux_types) else: return _array(source_array, ctx=ctx, dtype=dtype) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index e1210fb..4394172 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -124,8 +124,8 @@ def _get_uniform_dataset_csr(num_rows, num_cols, density=0.1, dtype=None): """ _validate_csr_generation_inputs(num_rows, num_cols, density, distribution="uniform") - from scipy import sparse as sp - csr = sp.rand(num_rows, num_cols, density, dtype=dtype, format="csr") + from scipy import sparse as spsp + csr = spsp.rand(num_rows, num_cols, density, dtype=dtype, format="csr") result = mx.nd.sparse.csr_matrix(csr.data, csr.indptr, csr.indices, (num_rows, num_cols), dtype=dtype) return result @@ -261,8 +261,8 @@ def rand_shape_3d(dim0=10, dim1=10, dim2=10): return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1), rnd.randint(1, dim2 + 1) -def rand_shape_nd(n, dim=10): - return rnd.randint(1, dim+1, size=n) +def rand_shape_nd(num_dim, dim=10): + return tuple(rnd.randint(1, dim+1, size=num_dim)) def np_reduce(dat, axis, keepdims, numpy_reduce_func): diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py index fb8aa2a..6ec462e 100644 --- a/tests/python/unittest/test_io.py +++ b/tests/python/unittest/test_io.py @@ -154,7 +154,6 @@ def test_NDArrayIter_h5py(): assert(labelcount[i] == 100) def test_NDArrayIter_csr(): - import scipy.sparse as sp # creating toy data num_rows = rnd.randint(5, 15) num_cols = rnd.randint(1, 20) diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index 8a5fd90..da02e8b 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -502,11 +502,8 @@ def test_factorization_machine_module(): num_batches = 5 batch_size = 64 num_samples = batch_size * num_batches - import scipy.sparse as sp - # generate some random scipy csr data - csr_sp = sp.rand(num_samples, feature_dim, density=0.1, format='csr') - csr_nd = mx.nd.sparse.csr_matrix(csr_sp.data, csr_sp.indptr, csr_sp.indices, - (num_samples, feature_dim)) + # generate some random csr data + csr_nd = rand_ndarray((num_samples, feature_dim), 'csr', 0.1) label = mx.nd.ones((num_samples,1)) # the alternative is to use LibSVMIter train_iter = mx.io.NDArrayIter(data=csr_nd, diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py index 35d9713..f96c94c 100644 --- a/tests/python/unittest/test_sparse_ndarray.py +++ b/tests/python/unittest/test_sparse_ndarray.py @@ -444,11 +444,7 @@ def test_sparse_nd_unsupported(): pass def test_create_csr(): - dim0 = 50 - dim1 = 50 - densities = [0, 0.01, 0.1, 0.2, 0.5] - for density in densities: - shape = rand_shape_2d(dim0, dim1) + def check_create_csr_from_nd(shape, density): matrix = rand_ndarray(shape, 'csr', density) data = matrix.data indptr = matrix.indptr @@ -462,6 +458,40 @@ def test_create_csr(): csr_copy = mx.nd.array(csr_created) assert(same(csr_copy.asnumpy(), csr_created.asnumpy())) + def check_create_csr_from_scipy(shape, density, f): + def assert_csr_almost_equal(nd, sp): + assert_almost_equal(nd.data.asnumpy(), sp.data) + assert_almost_equal(nd.indptr.asnumpy(), sp.indptr) + assert_almost_equal(nd.indices.asnumpy(), sp.indices) + + try: + import scipy.sparse as sp + # random canonical csr + csr_sp = sp.rand(shape[0], shape[1], density, format="csr") + csr_nd = f(csr_sp) + assert_csr_almost_equal(csr_nd, csr_sp) + # non-canonical csr which contains duplicates and unsorted indices + indptr = np.array([0, 2, 3, 7]) + indices = np.array([0, 2, 2, 0, 1, 2, 1]) + data = np.array([1, 2, 3, 4, 5, 6, 1]) + non_canonical_csr = sp.csr_matrix((data, indices, indptr), shape=(3, 3)) + canonical_csr_nd = f(non_canonical_csr) + canonical_csr_sp = non_canonical_csr.copy() + canonical_csr_sp.sum_duplicates() + canonical_csr_sp.sort_indices() + assert_csr_almost_equal(canonical_csr_nd, canonical_csr_sp) + except ImportError: + print("Could not import scipy.sparse. Skipping unit tests for scipy csr creation") + + dim0 = 50 + dim1 = 50 + densities = [0, 0.01, 0.1, 0.2, 0.5] + for density in densities: + shape = rand_shape_2d(dim0, dim1) + check_create_csr_from_nd(shape, density) + check_create_csr_from_scipy(shape, density, mx.nd.sparse.array) + check_create_csr_from_scipy(shape, density, mx.nd.array) + def test_create_row_sparse(): dim0 = 50 -- To stop receiving notification emails like this one, please contact ['"comm...@mxnet.apache.org" <comm...@mxnet.apache.org>'].