Hi,
I'm attempting to write code which will allocate block and grid sizes based
upon the machines capability for a large number of computations, large enough
that I want to know the device's limitations:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
a = numpy.random.randn(131072,2).astype(numpy.float32)
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)
mod = SourceModule("""
__global__ void doublify(float *a)
{int thread =threadIdx.x+threadIdx.y*blockDim.x;
int block=blockIdx.x*blockDim.x*blockDim.y;
int grid=blockIdx.y*gridDim.x*blockDim.x*blockDim.y;
int idx = thread+block+grid;
a[idx] *= 2;
}
""")
func = mod.get_function("doublify")
func(a_gpu, block=(2,cuda.device_attribute.MAX_THREADS_PER_BLOCK,1),
grid=(cuda.device_attribute.MAX_GRID_DIM_X,cuda.device_attribute.MAX_GRID_DIM_Y))
a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print a
print a_doubled
returns:
[[-0.51754272 -0.4062421 ]
[-0.72260356 -0.98106903]
[-0.10904041 0.82718426]
...,
[ 0.25420722 -0.59294581]
[-1.18791282 -0.49158984]
[ 0.45278689 0.69320816]]
[[-1.03508544 -0.8124842 ]
[-1.44520712 -1.96213806]
[-0.21808082 1.65436852]
...,
[ 0.25420722 -0.59294581]
[-1.18791282 -0.49158984]
[ 0.45278689 0.69320816]]
It's only done the computation for the first thirty lines. However the
following code works:
import pycuda.driver as cudaimport pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
a = numpy.random.randn(131072,2).astype(numpy.float32)
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)
mod = SourceModule("""
__global__ void doublify(float *a)
{int thread =threadIdx.x+threadIdx.y*blockDim.x;
int block=blockIdx.x*blockDim.x*blockDim.y;
int grid=blockIdx.y*gridDim.x*blockDim.x*blockDim.y;
int idx = thread+block+grid;
a[idx] *= 2;
}
""")
func = mod.get_function("doublify")
func(a_gpu, block=(2,512,1), grid=(16,16))
a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print a
print a_doubled
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
a = numpy.random.randn(131072,2).astype(numpy.float32)
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)
mod = SourceModule("""
__global__ void doublify(float *a)
{int thread =threadIdx.x+threadIdx.y*blockDim.x;
int block=blockIdx.x*blockDim.x*blockDim.y;
int grid=blockIdx.y*gridDim.x*blockDim.x*blockDim.y;
int idx = thread+block+grid;
a[idx] *= 2;
}
""")
func = mod.get_function("doublify")
func(a_gpu, block=(2,512,1), grid=(16,16))
a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print a
print a_doubled
Also, print cuda.device_attribute.MAX_THREADS_PER_BLOCK returns the string
MAX_THREADS_PER_BLOCK.
I'm using pyCUDA 0.94.2
and CUDA 3.2.0
Any suggestions would be greatly appreciated,
-drp
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda