Received from Evgeny Lazutkin on Tue, Feb 25, 2014 at 03:18:18AM EST:
> Dear Lev, dear all,
> 
> the problem with DataTypeError I have solved. I did not mention that
> I pass float64 instead of float32.
> 
> In attach you will find the code, it works......but....GPU brings
> wrong solution. I print the results in program. They dont match.
> Could you explain why?

This is because arrays in numpy are row-major by default while CULA assumes the
data is column-major [1]. If you transpose the two input matrices, you should
obtain the correct result (transposed). Corrected code attached.

[1] http://www.culatools.com/cula_dense_programmers_guide/#column-major-ordering
-- 
Lev Givon
Bionet Group
http://www.columbia.edu/~lev/
http://lebedov.github.io/
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
import numpy
import pycuda.gpuarray as gpuarray
from scikits.cuda.cula import *
from scikits import *
from scipy import *



A = array([[ 5.18649167 , 0.         , 1.03279556 , 0.   ,      -0.14549722, 0.      ,    0. ,  0.   ,      0.     ,     0.  ,        0.    ,      0.        ],
 [-5.0819889  , 0.,          2.52459667,  0.,          0.64549722 , 0. ,         0.  , 0.,          0.      ,    0.     ,     0.      ,    0.        ],
 [ 9.01848057,  0. ,        -8.13118224,  0. ,         5.18649167,  0.  ,        0.  , 0.,          0. ,         0.     ,     0.   ,       0.        ],
 [-0.5      ,   4.43649167,  0. ,         1.03279556,  0.     ,    -0.14549722,   0. ,         0. ,         0.      ,    0.     ,     0.     ,    0.        ],
 [ 0.      ,   -5.0819889  ,-0.5,         1.77459667 , 0.    ,      0.64549722,   0. ,         0.    ,      0.       ,   0.     ,     0.      ,    0.        ],
 [ 0.         , 9.01848057  ,0.,         -8.13118224, -0.5  ,       4.43649167,   0. ,         0.    ,      0.       ,   0.    ,      0.      ,    0.        ],
 [ 0.        ,  0.          ,0.,          0.        ,  0.  ,        0. ,  5.18649167 , 0.      ,    1.03279556 , 0.       ,  -0.14549722 , 0.        ],
 [ 0.       ,   0.          ,0.,          0.        ,  0. ,         0. , -5.0819889  , 0.      ,    2.52459667 , 0.       ,   0.64549722 , 0.        ],
 [ 0.      ,    0.          ,0.,          0.        ,  0.,          0. ,  9.01848057 , 0.      ,   -8.13118224 , 0.       ,   5.18649167 , 0.        ],
 [ 0.     ,     0.       ,   0.      ,    0.     ,     0.        ,  0.      ,  -0.5  , 4.43649167,  0.       ,   1.03279556 , 0.      ,   -0.14549722] ,
[ 0.     ,     0.       ,   0.     ,     0.    ,      0.       ,   0.       ,   0. , -5.0819889 , -0.5     ,    1.77459667 , 0.      ,    0.64549722] ,
[ 0.    ,      0.     ,     0.    ,      0.  ,        0.      ,    0.      ,    0. ,  9.01848057 , 0.     ,    -8.13118224, -0.5    ,     4.43649167]], dtype = numpy.float32)


B = array([[-5.32379001 , 0.       ,   0.,          0.        ],
 [ 2.661895 ,   0.        ,  0.,          0.        ],
 [-5.32379001,  0.        ,  0. ,         0.        ],
 [ 0.        , -5.32379001,  0.  ,        0.        ],
 [ 0.        ,  2.661895  ,  0.   ,       0.        ],
 [ 0.        , -5.32379001,  0.    ,      0.        ],
 [ 0.        ,  0. ,        -5.32379001,  0.        ],
 [ 0.        ,  0. ,         2.661895  ,  0.        ],
 [ 0.        ,  0.,         -5.32379001,  0.        ],
 [ 0.        ,  0.,          0.        , -5.32379001],
 [ 0.        ,  0.,          0.        ,  2.661895  ],
 [ 0.        ,  0.,          0.        , -5.32379001]], dtype = numpy.float32)


def cpu_solve(k_,y_):

	start=drv.Event()
	end=drv.Event()
	start.record()

	result = np.linalg.solve(k_, y_)

	end.record()
	end.synchronize()
	print "numpy array time: %fs" %(start.time_till(end)*1e-3)
	 
	return result

def gpu_solve(k_,y_):
	k_gpu=gpuarray.to_gpu(k_.T.copy())
	y_gpu=gpuarray.to_gpu(y_.T.copy())
	(m,n)=k_.shape
	lda=m
	nrhs=shape(y_)[1]

	ipiv=np.empty(n,dtype=np.int32)
	ipiv_gpu=gpuarray.to_gpu(ipiv)
	ldb=y_.shape[0]

	start=drv.Event()
	end=drv.Event()
	start.record()

	culaDeviceSgesv(n, nrhs, k_gpu.ptr, lda, ipiv_gpu.ptr, y_gpu.ptr, ldb)

	end.record()
	end.synchronize()
	print "GPU array time: %fs" %(start.time_till(end)*1e-3)
	return  y_gpu.get().T.copy()

culaInitialize()
x_c = cpu_solve(A,B) # check: correct? ---> YES. 
x_g = gpu_solve(A,B) # check: corretc? ---> NO!
print x_c
print x_g
culaShutdown()
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to