Jaroslav Hajek wrote:
I think I looked into those sources briefly (because I also noticed that it implements basic ops), but indeed there seems to be nothing magical (read: no assembler). Some operations we need (such as transpose-multiply or diagonal scaling) were not even there, I think. That doesn't mean it's a bad idea to defer some operations to CXSparse (as you said, it is a dependency already), but I don't expect big gains unless there are some better algorithms.I didn't mean to imply that the current built-in sparse implementation is somehow slow (although perhaps there is room for optimizations). Block sparse storage is simply bound to beat the plain sparse one when the blocks get large enough. It's just that I didn't expect 10x10 blocks to be enough, given the m-code implementation.
Well its not hard to reimplement the cs_multiply mex file from CXSPARSE as octave code and I attach a cs_multiply oct-file to test... Running the example
n = [1e2, 5e2, 1e3, 1e3, 1e4, 1e4]; d = [0.01, 0.01, 0.001, 0.01, 0.001, 0.005]; for i = 1: length (n) fprintf ("N = %i, d = %f\n", n(i), d(i)); A = sprand (n(i), n(i), d(i)); B = sprand (n(i), n(i), d(i)); tic; C0 = A * B; toc tic; C1 = mymultiply (A, B); toc endfor I see the results N = 100, d = 0.010000 Elapsed time is 0.000342958 seconds. Elapsed time is 0.00414899 seconds. N = 500, d = 0.010000 Elapsed time is 0.00301409 seconds. Elapsed time is 0.00264595 seconds. N = 1000, d = 0.001000 Elapsed time is 0.000446094 seconds. Elapsed time is 0.000560952 seconds. N = 1000, d = 0.010000 Elapsed time is 0.0157239 seconds. Elapsed time is 0.0209789 seconds. N = 10000, d = 0.001000 Elapsed time is 0.317249 seconds. Elapsed time is 0.374798 seconds. N = 10000, d = 0.005000 Elapsed time is 5.38696 seconds. Elapsed time is 9.66393 seconds. so it doesn't appear that using cs_multiply is an advantage D.
#include <octave/config.h> #include <octave/oct.h> #include <octave/oct-sparse.h> #ifdef IDX_TYPE_LONG #define CXSPARSE_DNAME(name) cs_dl ## name #define CXSPARSE_ZNAME(name) cs_cl ## name #else #define CXSPARSE_DNAME(name) cs_di ## name #define CXSPARSE_ZNAME(name) cs_ci ## name #endif #if defined(CS_VER) && (((CS_VER == 2) && (CS_SUBVER < 2)) || (CS_VER < 2)) typedef double _Complex cs_complex_t; #endif // The code below is a copy of the Sparse<T>::transpose() // method so that we can transpose directly from the CXSPARSE // matrix D into an Octave SparseMatrix template <class A, class B, class T> B transpose_cs (A D) { octave_idx_type nr = D->m; octave_idx_type nc = D->n; octave_idx_type nz = D->nzmax; B c (nc, nr, nz); for (octave_idx_type i = 0; i < nz; i++) c.xcidx (D->i [i] + 1)++; nz = 0; for (octave_idx_type i = 1; i <= nr; i++) { const octave_idx_type tmp = c.xcidx (i); c.xcidx (i) = nz; nz += tmp; } for (octave_idx_type j = 0; j < nc; j++) for (octave_idx_type k = D->p[j]; k < D->p[j+1]; k++) { octave_idx_type q = c.xcidx (D->i[k] + 1)++; c.xridx (q) = j; c.xdata (q) = D->x[k]; //c.xdata (q) = reinterpret_cast<T> (D->x[k]); } return c; } DEFUN_DLD (mymultiply, args, , "") { octave_value retval; int nargin = args.length (); if (nargin != 2) error ("incorrect number of arguments to the binary operator '*'"); else { if (args(0).is_complex_type () || args(1).is_complex_type ()) { SparseComplexMatrix a = args(0).sparse_complex_matrix_value (). transpose (); SparseComplexMatrix b = args(1).sparse_complex_matrix_value (). transpose (); if (!error_state) { CXSPARSE_ZNAME () A, B, *D; A.nzmax = a.nnz (); A.m = a.rows (); A.n = a.cols (); B.nzmax = b.nnz (); B.m = b.rows (); B.n = b.cols (); // Cast away const on A and B, with full knowledge that CSparse // won't touch it. Prevents the methods below making a copy // of the data. A.p = const_cast<octave_idx_type *>(a.cidx ()); A.i = const_cast<octave_idx_type *>(a.ridx ()); A.x = const_cast<cs_complex_t *> (reinterpret_cast<const cs_complex_t *> (a.data ())); A.nz = -1; B.p = const_cast<octave_idx_type *>(b.cidx ()); B.i = const_cast<octave_idx_type *>(b.ridx ()); B.x = const_cast<cs_complex_t *> (reinterpret_cast<const cs_complex_t *> (b.data ())); B.nz = -1; BEGIN_INTERRUPT_IMMEDIATELY_IN_FOREIGN_CODE; // D = B' * A' D = CXSPARSE_ZNAME (_multiply) (&B, &A); // drop zeros from D CXSPARSE_ZNAME (_dropzeros) (D); END_INTERRUPT_IMMEDIATELY_IN_FOREIGN_CODE; retval = transpose_cs <CXSPARSE_ZNAME() *, SparseComplexMatrix, Complex> (D);; CXSPARSE_ZNAME (_spfree) (D) ; } } else { SparseMatrix a = args(0).sparse_matrix_value (). transpose (); SparseMatrix b = args(1).sparse_matrix_value (). transpose (); if (!error_state) { CXSPARSE_DNAME () A, B, *D; A.nzmax = a.nnz (); A.m = a.rows (); A.n = a.cols (); B.nzmax = b.nnz (); B.m = b.rows (); B.n = b.cols (); // Cast away const on A and B, with full knowledge that CSparse // won't touch it. Prevents the methods below making a copy // of the data. A.p = const_cast<octave_idx_type *>(a.cidx ()); A.i = const_cast<octave_idx_type *>(a.ridx ()); A.x = const_cast<double *>(a.data ()); A.nz = -1; B.p = const_cast<octave_idx_type *>(b.cidx ()); B.i = const_cast<octave_idx_type *>(b.ridx ()); B.x = const_cast<double *>(b.data ()); B.nz = -1; BEGIN_INTERRUPT_IMMEDIATELY_IN_FOREIGN_CODE; // D = B' * A' D = CXSPARSE_DNAME (_multiply) (&B, &A); // drop zeros from D CXSPARSE_DNAME (_dropzeros) (D); END_INTERRUPT_IMMEDIATELY_IN_FOREIGN_CODE; retval = transpose_cs <CXSPARSE_DNAME() *, SparseMatrix, double> (D);; CXSPARSE_DNAME (_spfree) (D) ; } } } return retval; }
------------------------------------------------------------------------------ Download Intel® Parallel Studio Eval Try the new software tools for yourself. Speed compiling, find bugs proactively, and fine-tune applications for parallel performance. See why Intel Parallel Studio got high marks during beta. http://p.sf.net/sfu/intel-sw-dev
_______________________________________________ Octave-dev mailing list Octave-dev@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/octave-dev