Hello, I have a PyCUDA program with two kernels. Both kernels run fine separately, but when I put them together I get a memory error: "LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered". In the second kernel, "DotKernel", I also cannot change the values of any shared or global array. Could you please have a look at the code? Thank you very much!
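To illustrate the symptom: as far as I understand, the cuMemcpyDtoH line in the traceback is usually just the first synchronous call after a kernel that has already crashed, so the copy itself is probably not the culprit. Below is a small self-contained toy example of mine (not taken from the real code below, using only the standard PyCUDA API) that reproduces the same behaviour with a deliberate out-of-bounds write:

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as drv
    from pycuda import gpuarray, compiler

    # Toy kernel with a deliberate out-of-bounds global write.
    mod = compiler.SourceModule("""
    __global__ void oob(float *x)
    {
        // x only has 16 elements, so this write is far past the end
        x[threadIdx.x + 1000000] = 1.0f;
    }
    """)
    oob = mod.get_function("oob")

    x_gpu = gpuarray.zeros(16, np.float32)
    oob(x_gpu, block=(16, 1, 1), grid=(1, 1))

    # With this synchronize() the illegal access is reported at the launch
    # site; without it, the same LogicError only shows up at the get(),
    # i.e. at cuMemcpyDtoH. (Out-of-bounds writes are undefined behaviour,
    # so the exact failure point can vary.)
    drv.Context.synchronize()
    x = x_gpu.get()

Putting a drv.Context.synchronize() after each launch in the real script (or running it under cuda-memcheck) should therefore show which of the two kernels actually triggers the fault.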
Best regards, Morvan
import pycuda.autoinit
from pycuda import driver, compiler, gpuarray, tools
from pycuda.compiler import SourceModule
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import time
from scipy import signal
from numba import jit
import numba as nb

kernel_code_template = """
#include <cuComplex.h>

__global__ void OuterKernel(cuFloatComplex *A, cuFloatComplex *B,
                            cuFloatComplex *BETA2, cuFloatComplex *C)
{
    const uint wB = %(MATRIX_SIZE_O)d;
    const uint bx = blockIdx.x;
    const uint by = blockIdx.y;
    const uint tx = threadIdx.x;
    const uint ty = threadIdx.y;

    __shared__ cuFloatComplex A_y;
    __shared__ cuFloatComplex B_y;
    __shared__ cuFloatComplex beta2;
    __shared__ cuFloatComplex C_add;

    const uint c = wB * by + bx;

    for (int index_r = 0; index_r < %(rSize)d; index_r++)
    {
        for (int wA = 1; wA < %(tSize)d + 1; wA++)
        {
            const uint aBegin = %(tSize)d * %(BLOCK_SIZE)d * by;
            const uint aEnd   = aBegin + wA - 1;
            const uint aStep  = %(BLOCK_SIZE)d;
            const int  bBegin = %(tSize)d * %(BLOCK_SIZE)d * bx;
            const uint bStep  = %(BLOCK_SIZE)d;

            for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
            {
                A_y = make_cuFloatComplex(cuCrealf(A[a + wB*ty + tx]),
                                          cuCimagf(A[a + wB*ty + tx]));
                B_y = make_cuFloatComplex(cuCrealf(B[b + wB*ty + tx]),
                                          cuCimagf(B[b + wB*ty + tx]));
                C_add = cuCmulf(A_y, B_y);
            }

            beta2 = make_cuFloatComplex(cuCrealf(BETA2[%(tSize)d * index_r + wA - 1]),
                                        cuCimagf(BETA2[%(tSize)d * index_r + wA - 1]));
            C_add = cuCmulf(C_add, beta2);
            C[c]  = cuCaddf(C[c], C_add);
        }
    }
}

__global__ void DotKernel(cuFloatComplex *A, cuFloatComplex *B,
                          cuFloatComplex *BETA, cuFloatComplex *C,
                          cuFloatComplex *FFT, cuFloatComplex *CC)
{
    const uint wA = %(MATRIX_SIZE_I)d;
    const uint wB = %(MATRIX_SIZE_O)d;
    const uint bx = blockIdx.x;
    const uint by = blockIdx.y;

    const uint aBegin = wA * %(BLOCK_SIZE)d * by;
    const uint aEnd   = aBegin + wA - 1;
    const uint aStep  = 1;
    const int  bBegin = %(BLOCK_SIZE)d * bx;
    const uint bStep  = wB;

    cuFloatComplex Csub   = make_cuFloatComplex(0, 0);
    cuFloatComplex Csub_1 = make_cuFloatComplex(0, 0);
    cuFloatComplex Csub_2 = make_cuFloatComplex(0, 0);

    __shared__ cuFloatComplex A_global[%(MATRIX_SIZE_I)d];
    __shared__ cuFloatComplex C1[%(MATRIX_SIZE_I)d];

    for (int index_r = 0; index_r < %(rSize)d; index_r++)
    {
        for (int index_t = 0; index_t < %(tSize)d; index_t++)
        {
            for (int i = 0; i < %(MATRIX_SIZE_I)d; i++)
            {
                A_global[i] = make_cuFloatComplex(cuCrealf(B[i*%(tSize)d + index_t]),
                                                  cuCimagf(B[i*%(tSize)d + index_t]));
            }
            __syncthreads();

            Csub = make_cuFloatComplex(0, 0);
            for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
            {
                Csub = cuCaddf(Csub, cuCmulf(A_global[a], C[b]));
                __syncthreads();
            }

            const uint c = wB * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx;
            C1[c] = make_cuFloatComplex(cuCrealf(Csub), cuCimagf(Csub));
            __syncthreads();

            for (int i = 0; i < %(MATRIX_SIZE_I)d; i++)
            {
                A_global[i] = make_cuFloatComplex(0, 0);
                A_global[i] = make_cuFloatComplex(cuCrealf(FFT[i*%(rSize)d + index_r]),
                                                  cuCimagf(FFT[i*%(rSize)d + index_r]));
            }
            CC[c] = make_cuFloatComplex(0, 0);
            __syncthreads();
            CC[c]  = cuCmulf(C1[c], A_global[c]);
            Csub_1 = cuCmulf(Csub, A_global[c]);
            __syncthreads();

            BETA[index_r * %(tSize)d + index_t] = make_cuFloatComplex(0, 0);
            Csub_1 = make_cuFloatComplex(0, 0);
            for (int i = 0; i < %(MATRIX_SIZE_I)d; i += 1)
            {
                Csub_1 = cuCaddf(Csub_1, CC[i]);
                //BETA[index_r * %(tSize)d + index_t] = cuCaddf(BETA[index_r * %(tSize)d + index_t], CC[i]);
            }

            for (int i = 0; i < %(MATRIX_SIZE_I)d; i++)
            {
                A_global[i] = make_cuFloatComplex(0, 0);
                A_global[i] = make_cuFloatComplex(cuCrealf(A[i*%(tSize)d + index_t]),
                                                  cuCimagf(A[i*%(tSize)d + index_t]));
            }
            CC[c] = make_cuFloatComplex(0, 0);
            __syncthreads();
            //CC[c] = make_cuFloatComplex(cuCrealf(A_global[c]), cuCimagf(A_global[c]));
            CC[c] = cuCmulf(C1[c], A_global[c]);
            __syncthreads();

            BETA[index_r * %(tSize)d + index_t] = make_cuFloatComplex(0, 0);
            Csub_2 = make_cuFloatComplex(0, 0);
            for (int i = 0; i < %(MATRIX_SIZE_I)d; i += 1)
            {
                Csub_2 = cuCaddf(Csub_2, CC[i]);
                //BETA[index_r * %(tSize)d + index_t] = cuCaddf(BETA[index_r * %(tSize)d + index_t], CC[i]);
            }
            //BETA[index_r * %(tSize)d + index_t] = make_cuFloatComplex(cuCrealf(Csub_2), cuCimagf(Csub_2));
            BETA[index_r * %(tSize)d + index_t] = cuCdivf(Csub_1, Csub_2);
        }
    }
}
"""

tic = time.time()

iaa_iter = 4
c = 3e8
fc = 3.15e9
lam = c/fc
Mt = 7
Mr = 5
rmax = 300
theta = (np.arange(-80, 80, 0.5))/360*2*np.pi
r = np.arange(0, rmax, rmax/100)
tsize = theta.size
rsize = r.size

fmcw_fft = np.arange(0, Mt*Mr*theta.size) + 1j*1
fmcw_fft = fmcw_fft.reshape(Mt*Mr, theta.size).astype(np.complex64)

VX = np.zeros(Mt*Mr)
for i in range(0, Mt*Mr+1, 1):
    VX[i-1] = i*lam/2

B = np.zeros((Mt*Mr, theta.size), dtype=complex)
for index_theta in range(0, theta.size, 1):
    alpha = np.exp(1j*2*np.pi*fc*np.sin(theta[index_theta])/c*VX)
    if iaa_iter == 0:
        B[:, index_theta] = np.multiply(alpha, signal.nuttall(Mr*Mt))
    else:
        B[:, index_theta] = alpha

beta = np.zeros((r.size, theta.size), dtype=complex)
for index_r in range(0, r.size):
    for index_theta in range(0, theta.size):
        beta[index_r, index_theta] = (np.dot(np.conj(B[:, index_theta]), fmcw_fft[:, index_r])
                                      / np.dot(np.conj(B[:, index_theta]), B[:, index_theta]))

RZero = np.zeros((Mr*Mt, Mr*Mt), dtype=complex)
beta = beta.astype(np.complex64)
BETA2 = (np.square(np.abs(beta)) + 1j*0).astype(np.complex64)
Bcon = np.conj(B)

MATRIX_SIZE_O = 35
MATRIX_SIZE_I = 35
BLOCK_SIZE = 1
GRID_X = MATRIX_SIZE_O
GRID_Y = MATRIX_SIZE_O

B = np.real(B).astype(np.float16) + 1j*np.imag(B).astype(np.float16)
a_cpu = B
Bcon = np.real(Bcon).astype(np.float16) + 1j*np.imag(Bcon).astype(np.float16)
b_cpu = Bcon
FFT_cpu = np.real(fmcw_fft).astype(np.float16) + 1j*np.imag(fmcw_fft).astype(np.float16)

a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
BETA2_gpu = gpuarray.to_gpu(BETA2)
c_gpu = gpuarray.empty((MATRIX_SIZE_O, MATRIX_SIZE_O), np.complex64)
FFT_gpu = gpuarray.to_gpu(FFT_cpu)

kernel_code = kernel_code_template % {
    'MATRIX_SIZE_I': MATRIX_SIZE_I,
    'MATRIX_SIZE_O': MATRIX_SIZE_O,
    'BLOCK_SIZE': BLOCK_SIZE,
    'tSize': tsize,
    'rSize': rsize,
}

mod = compiler.SourceModule(kernel_code)
outer = mod.get_function("OuterKernel")

t1 = time.time()
outer(
    a_gpu, b_gpu, BETA2_gpu, c_gpu,
    grid=(GRID_X, GRID_Y),
    block=(BLOCK_SIZE, BLOCK_SIZE, 1),
)
t2 = time.time()
t_gpu = t2 - t1
print('INFORMATION:', pycuda.driver.mem_get_info())
print('t_gpu1 = ', t_gpu)

aa_gpu = gpuarray.to_gpu(a_cpu)
bb_gpu = gpuarray.to_gpu(b_cpu)
BETA2_gpu = gpuarray.empty((rsize, tsize), np.complex64)
rinv_gpu = gpuarray.empty((MATRIX_SIZE_O, MATRIX_SIZE_O), np.complex64)
rinv_gpu = gpuarray.to_gpu(c_gpu.get())
cc_gpu = gpuarray.empty((1, MATRIX_SIZE_O), np.complex64)

mod = compiler.SourceModule(kernel_code)
dot = mod.get_function("DotKernel")

t1 = time.time()
dot(
    aa_gpu, bb_gpu, BETA2_gpu, rinv_gpu, FFT_gpu, cc_gpu,
    grid=(GRID_X, GRID_Y),
    block=(BLOCK_SIZE, BLOCK_SIZE, 1),
)
t2 = time.time()
t_gpu = t2 - t1
print('t_gpu2 = ', t_gpu)

beta = BETA2_gpu.get()
toc = time.time()
print(toc - tic)

beta[1:12, :] = 0
plt.contourf(np.outer(r, np.sin(theta)), np.outer(r, np.cos(theta)),
             20*np.log10(np.abs(beta)))
plt.show()