[incubator-mxnet] branch master updated: CSRNDArray from/to scipy csr_matrix; fix rand_shape_nd (#7638)

jxie Tue, 29 Aug 2017 22:12:46 -0700

This is an automated email from the ASF dual-hosted git repository.

jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git



The following commit(s) were added to refs/heads/master by this push:
     new ec7cd6e  CSRNDArray from/to scipy csr_matrix; fix rand_shape_nd (#7638)
ec7cd6e is described below

commit ec7cd6eeb5f95f86b9d73250ba1f61616dd42800
Author: Haibin Lin <linhaibin.e...@gmail.com>
AuthorDate: Tue Aug 29 22:12:05 2017 -0700

    CSRNDArray from/to scipy csr_matrix; fix rand_shape_nd (#7638)
    
    * support creation from sp.csr
    
    * enhance doc
    
    * edit repr for sparse ndarray
    
    * update doc for nd.empty
    
    * preprocess noncanonical csr
    
    * add asscipy to csr
    
    * minor changes
    
    * return tuple for rand_shape_nd
    
    * fix lint
    
    * throw exception on setters
    
    * remove asscipy
    
    * global import scipy in sparse.py
    
    * update rand_shape_nd;
    
    * add missing line
    
    * better err msg. fix scipy import in utils.py
    
    * fix lint
---
 python/mxnet/ndarray/sparse.py               | 95 +++++++++++++++++++++++++---
 python/mxnet/ndarray/utils.py                | 24 ++++---
 python/mxnet/test_utils.py                   |  8 +--
 tests/python/unittest/test_io.py             |  1 -
 tests/python/unittest/test_module.py         |  7 +-
 tests/python/unittest/test_sparse_ndarray.py | 40 ++++++++++--
 6 files changed, 142 insertions(+), 33 deletions(-)

diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py
index 806398e..fa2761d 100644
--- a/python/mxnet/ndarray/sparse.py
+++ b/python/mxnet/ndarray/sparse.py
@@ -51,7 +51,6 @@ from .ndarray import zeros as _zeros_ndarray
 from .ndarray import array as _array
 from . import op
 
-# Use different verison of SymbolBase
 # When possible, use cython to speedup part of computation.
 # pylint: disable=unused-import
 try:
@@ -67,6 +66,10 @@ except ImportError:
     from .._ctypes.ndarray import _set_ndarray_class
 # pylint: enable=unused-import
 
+try:
+    import scipy.sparse as spsp
+except ImportError:
+    spsp = None
 
 _STORAGE_AUX_TYPES = {
     'row_sparse': [np.int64],
@@ -112,6 +115,13 @@ class BaseSparseNDArray(NDArray):
     See CSRNDArray and RowSparseNDArray for more details.
     """
 
+    def __repr__(self):
+        """Returns a string representation of the sparse array."""
+        shape_info = 'x'.join(['%d' % x for x in self.shape])
+        # The data content is not displayed since the array usually has big shape
+        return '\n<%s %s @%s>' % (self.__class__.__name__,
+                                  shape_info, self.context)
+
     def __iadd__(self, other):
         raise NotImplementedError()
 
@@ -417,6 +427,19 @@ class CSRNDArray(BaseSparseNDArray):
         """
         return self._data()
 
+    @indices.setter
+    def indices(self, indices):
+        raise NotImplementedError()
+
+    @indptr.setter
+    def indptr(self, indptr):
+        raise NotImplementedError()
+
+    @data.setter
+    def data(self, data):
+        raise NotImplementedError()
+
+
     def tostype(self, stype):
         """Return a copy of the array with chosen storage type.
 
@@ -461,7 +484,6 @@ class CSRNDArray(BaseSparseNDArray):
         else:
             raise TypeError('copyto does not support type ' + str(type(other)))
 
-
 # pylint: disable=abstract-method
 class RowSparseNDArray(BaseSparseNDArray):
     """A sparse representation of a set of NDArray row slices at given indices.
@@ -630,6 +652,14 @@ class RowSparseNDArray(BaseSparseNDArray):
         """
         return self._data()
 
+    @indices.setter
+    def indices(self, indices):
+        raise NotImplementedError()
+
+    @data.setter
+    def data(self, data):
+        raise NotImplementedError()
+
     def tostype(self, stype):
         """Return a copy of the array with chosen storage type.
 
@@ -908,16 +938,61 @@ def empty(stype, shape, ctx=None, dtype=None, aux_types=None):
 
 def array(source_array, ctx=None, dtype=None, aux_types=None):
     """Creates a sparse array from any object exposing the array interface.
+
+    Parameters
+    ----------
+    source_array : RowSparseNDArray, CSRNDArray or scipy.sparse.csr.csr_matrix
+        The source sparse array
+    ctx : Context, optional
+        Device context (default is the current default context).
+    dtype : str or numpy.dtype, optional
+        The data type of the output array. The default dtype is ``source_array.dtype``
+        if `source_array` is an `NDArray`, `float32` otherwise.
+    aux_types: list of numpy.dtype, optional
+        An optional list of types of the aux data for RowSparseNDArray or CSRNDArray.
+        The default value for CSRNDArray is [`int64`, `int64`] for `indptr` and `indices`.
+        The default value for RowSparseNDArray is [`int64`] for `indices`.
+
+    Returns
+    -------
+    RowSparseNDArray or CSRNDArray
+        An array with the same contents as the `source_array`.
+
+    Examples
+    --------
+    >>> import scipy.sparse as sp
+    >>> csr = sp.csr_matrix((2, 100))
+    >>> mx.nd.sparse.array(csr)
+    <CSRNDArray 2x100 @cpu(0)>
+    >>> mx.nd.sparse.array(mx.nd.zeros((3, 2), stype='csr'))
+    <CSRNDArray 3x2 @cpu(0)>
+    >>> mx.nd.sparse.array(mx.nd.zeros((3, 2), stype='row_sparse'))
+    <RowSparseNDArray 3x2 @cpu(0)>
     """
     if isinstance(source_array, NDArray):
-        assert(source_array.stype != 'default'),\
-            "Please use `cast_storage` to create BaseSparseNDArray from an NDArray"
+        assert(source_array.stype != 'default'), \
+               "Please use `cast_storage` to create RowSparseNDArray or CSRNDArray from an NDArray"
         dtype = source_array.dtype if dtype is None else dtype
         aux_types = source_array._aux_types if aux_types is None else aux_types
+        arr = empty(source_array.stype, source_array.shape, ctx, dtype, aux_types)
+        arr[:] = source_array
+        return arr
+    if spsp is not None and isinstance(source_array, spsp.csr.csr_matrix):
+        # TODO(haibin) implement `_sync_copy_from` with scipy csr object to reduce a copy
+        indptr_type = None
+        indices_type = None
+        if aux_types is not None:
+            assert(len(aux_types) == 2), "Expected types for both indices and indptr"
+            indptr_type = aux_types[0]
+            indices_type = aux_types[1]
+        # preprocess scipy csr to canonical form
+        csr = source_array.sorted_indices()
+        csr.sum_duplicates()
+        arr = csr_matrix(csr.data, csr.indptr, csr.indices, csr.shape, dtype=dtype,
+                         indptr_type=indptr_type, indices_type=indices_type)
+        return arr
+    elif isinstance(source_array, (np.ndarray, np.generic)):
+        raise ValueError("Please use mx.nd.array to create an NDArray with source_array of type ",
+                         type(source_array))
     else:
-        # TODO(haibin/anisub) support creation from scipy object when `_sync_copy_from` is ready
-        raise NotImplementedError('creating BaseSparseNDArray from '
-                                  ' a non-NDArray object is not implemented.')
-    arr = empty(source_array.stype, source_array.shape, ctx, dtype, aux_types)
-    arr[:] = source_array
-    return arr
+        raise ValueError("Unexpected source_array type: ", type(source_array))
diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py
index a0dd836..231f984 100644
--- a/python/mxnet/ndarray/utils.py
+++ b/python/mxnet/ndarray/utils.py
@@ -28,6 +28,10 @@ from .sparse import zeros as _zeros_sparse_ndarray
 from .sparse import empty as _empty_sparse_ndarray
 from .sparse import array as _sparse_array
 from .sparse import _ndarray_cls
+try:
+    import scipy.sparse as spsp
+except ImportError:
+    spsp = None
 
 
 def zeros(shape, ctx=None, dtype=None, stype=None, aux_types=None, **kwargs):
@@ -44,8 +48,9 @@ def zeros(shape, ctx=None, dtype=None, stype=None, aux_types=None, **kwargs):
     stype: string, optional
         The storage type of the empty array, such as 'row_sparse', 'csr', etc.
     aux_types: list of numpy.dtype, optional
-        An optional list of types of the aux data for RowSparseNDArray or CSRNDArray
-        (default values depend on the storage type)
+        An optional list of types of the aux data for RowSparseNDArray or CSRNDArray.
+        The default value for CSRNDArray is [`int64`, `int64`] for `indptr` and `indices`.
+        The default value for RowSparseNDArray is [`int64`] for `indices`.
 
     Returns
     -------
@@ -79,8 +84,9 @@ def empty(shape, ctx=None, dtype=None, stype=None, aux_types=None):
     stype : str, optional
         An optional storage type (default is `default`).
     aux_types: list of numpy.dtype, optional
-        An optional list of types of the aux data for RowSparseNDArray or CSRNDArray
-        (default values depend on the storage type)
+        An optional list of types of the aux data for RowSparseNDArray or CSRNDArray.
+        The default value for CSRNDArray is [`int64`, `int64`] for `indptr` and `indices`.
+        The default value for RowSparseNDArray is [`int64`] for `indices`.
 
     Returns
     -------
@@ -118,8 +124,9 @@ def array(source_array, ctx=None, dtype=None, aux_types=None):
         The data type of the output array. The default dtype is ``source_array.dtype``
         if `source_array` is an `NDArray`, `float32` otherwise.
     aux_types: list of numpy.dtype, optional
-        An optional list of types of the aux data for RowSparseNDArray or CSRNDArray
-        (default values depend on the storage type)
+        An optional list of types of the aux data for RowSparseNDArray or CSRNDArray.
+        The default value for CSRNDArray is [`int64`, `int64`] for `indptr` and `indices`.
+        The default value for RowSparseNDArray is [`int64`] for `indices`.
 
     Returns
     -------
@@ -140,8 +147,9 @@ def array(source_array, ctx=None, dtype=None, aux_types=None):
     >>> mx.nd.array(mx.nd.zeros((3, 2), stype='row_sparse'))
     <RowSparseNDArray 3x2 @cpu(0)>
     """
-    # TODO(haibin/anisub) Check if input is scipy.sparse object with `scipy.sparse.issparse`
-    if isinstance(source_array, NDArray) and source_array.stype != 'default':
+    if spsp is not None and isinstance(source_array, spsp.csr.csr_matrix):
+        return _sparse_array(source_array, ctx=ctx, dtype=dtype, aux_types=aux_types)
+    elif isinstance(source_array, NDArray) and source_array.stype != 'default':
         return _sparse_array(source_array, ctx=ctx, dtype=dtype, aux_types=aux_types)
     else:
         return _array(source_array, ctx=ctx, dtype=dtype)
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index e1210fb..4394172 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -124,8 +124,8 @@ def _get_uniform_dataset_csr(num_rows, num_cols, density=0.1, dtype=None):
     """
     _validate_csr_generation_inputs(num_rows, num_cols, density,
                                     distribution="uniform")
-    from scipy import sparse as sp
-    csr = sp.rand(num_rows, num_cols, density, dtype=dtype, format="csr")
+    from scipy import sparse as spsp
+    csr = spsp.rand(num_rows, num_cols, density, dtype=dtype, format="csr")
     result = mx.nd.sparse.csr_matrix(csr.data, csr.indptr, csr.indices,
                                      (num_rows, num_cols), dtype=dtype)
     return result
@@ -261,8 +261,8 @@ def rand_shape_3d(dim0=10, dim1=10, dim2=10):
     return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1), rnd.randint(1, dim2 + 1)
 
 
-def rand_shape_nd(n, dim=10):
-    return rnd.randint(1, dim+1, size=n)
+def rand_shape_nd(num_dim, dim=10):
+    return tuple(rnd.randint(1, dim+1, size=num_dim))
 
 
 def np_reduce(dat, axis, keepdims, numpy_reduce_func):
diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py
index fb8aa2a..6ec462e 100644
--- a/tests/python/unittest/test_io.py
+++ b/tests/python/unittest/test_io.py
@@ -154,7 +154,6 @@ def test_NDArrayIter_h5py():
             assert(labelcount[i] == 100)
 
 def test_NDArrayIter_csr():
-    import scipy.sparse as sp
     # creating toy data
     num_rows = rnd.randint(5, 15)
     num_cols = rnd.randint(1, 20)
diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py
index 8a5fd90..da02e8b 100644
--- a/tests/python/unittest/test_module.py
+++ b/tests/python/unittest/test_module.py
@@ -502,11 +502,8 @@ def test_factorization_machine_module():
     num_batches = 5
     batch_size = 64
     num_samples = batch_size * num_batches
-    import scipy.sparse as sp
-    # generate some random scipy csr data
-    csr_sp = sp.rand(num_samples, feature_dim, density=0.1, format='csr')
-    csr_nd = mx.nd.sparse.csr_matrix(csr_sp.data, csr_sp.indptr, csr_sp.indices,
-                              (num_samples, feature_dim))
+    # generate some random csr data
+    csr_nd = rand_ndarray((num_samples, feature_dim), 'csr', 0.1)
     label = mx.nd.ones((num_samples,1))
     # the alternative is to use LibSVMIter
     train_iter = mx.io.NDArrayIter(data=csr_nd,
diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py
index 35d9713..f96c94c 100644
--- a/tests/python/unittest/test_sparse_ndarray.py
+++ b/tests/python/unittest/test_sparse_ndarray.py
@@ -444,11 +444,7 @@ def test_sparse_nd_unsupported():
             pass
 
 def test_create_csr():
-    dim0 = 50
-    dim1 = 50
-    densities = [0, 0.01, 0.1, 0.2, 0.5]
-    for density in densities:
-        shape = rand_shape_2d(dim0, dim1)
+    def check_create_csr_from_nd(shape, density):
         matrix = rand_ndarray(shape, 'csr', density)
         data = matrix.data
         indptr = matrix.indptr
@@ -462,6 +458,40 @@ def test_create_csr():
         csr_copy = mx.nd.array(csr_created)
         assert(same(csr_copy.asnumpy(), csr_created.asnumpy()))
 
+    def check_create_csr_from_scipy(shape, density, f):
+        def assert_csr_almost_equal(nd, sp):
+            assert_almost_equal(nd.data.asnumpy(), sp.data)
+            assert_almost_equal(nd.indptr.asnumpy(), sp.indptr)
+            assert_almost_equal(nd.indices.asnumpy(), sp.indices)
+
+        try:
+            import scipy.sparse as sp
+            # random canonical csr
+            csr_sp = sp.rand(shape[0], shape[1], density, format="csr")
+            csr_nd = f(csr_sp)
+            assert_csr_almost_equal(csr_nd, csr_sp)
+            # non-canonical csr which contains duplicates and unsorted indices
+            indptr = np.array([0, 2, 3, 7])
+            indices = np.array([0, 2, 2, 0, 1, 2, 1])
+            data = np.array([1, 2, 3, 4, 5, 6, 1])
+            non_canonical_csr = sp.csr_matrix((data, indices, indptr), shape=(3, 3))
+            canonical_csr_nd = f(non_canonical_csr)
+            canonical_csr_sp = non_canonical_csr.copy()
+            canonical_csr_sp.sum_duplicates()
+            canonical_csr_sp.sort_indices()
+            assert_csr_almost_equal(canonical_csr_nd, canonical_csr_sp)
+        except ImportError:
+            print("Could not import scipy.sparse. Skipping unit tests for scipy csr creation")
+
+    dim0 = 50
+    dim1 = 50
+    densities = [0, 0.01, 0.1, 0.2, 0.5]
+    for density in densities:
+        shape = rand_shape_2d(dim0, dim1)
+        check_create_csr_from_nd(shape, density)
+        check_create_csr_from_scipy(shape, density, mx.nd.sparse.array)
+        check_create_csr_from_scipy(shape, density, mx.nd.array)
+
 
 def test_create_row_sparse():
     dim0 = 50

-- 
To stop receiving notification emails like this one, please contact
['"comm...@mxnet.apache.org" <comm...@mxnet.apache.org>'].

[incubator-mxnet] branch master updated: CSRNDArray from/to scipy csr_matrix; fix rand_shape_nd (#7638)

Reply via email to