On Sat, Jun 13, 2009 at 6:20 PM, Nicolas Pinto <[email protected]> wrote:
> Andrew,
>
> The following patch should make it work. PyCuda kernel functions take
> numpy.int32() whereas the grid should be int().
Thanks a lot, Nicolas! That got the kernel at least running. I'm
still getting garbage output, and I think it may be because my filter
kernel (filterx) is not making it into constant memory (under the
identifier d_Kernel_rows).
>> Also, pycuda.Driver.Module.get_global seems to return a length 2
>> tuple, while pycuda.Driver.memcpy_htod expects the reference to be an
>> integer. I got past this error by pulling out the first entry of the
>> tuple, which seems like the address, but I'm not sure if this is
>> correct. This is for transferring the convolution kernel (the filter
>> parameters, not the cuda kernel) into constant memory.
The declaration of the constant array is in the kernel source at line
29 of convolution.py:
__device__ __constant__ float d_Kernel_rows[KERNEL_W];
I get the address for the symbol d_Kernel_rows at line 231:
d_Kernel_rows = module.get_global('d_Kernel_rows')
I try to upload data to the array on line 327:
cuda.memcpy_htod(d_Kernel_rows, filterx) # The kernel goes into
constant memory via a symbol defined in the kernel
I get the following error:
The debugged program raised the exception ArgumentError
"Python argument types in pycuda._driver.memcpy_htod(tuple,
numpy.ndarray) did not match C++ signature: memcpy_htod(unsigned int
dest, boost::python::api::object src, boost::python::api::object
stream=None)"
Here are some of the relevant variables from the debugger...
>>> d_Kernel_rows
(16778496, 68)
>>> type(d_Kernel_rows[0])
<type 'int'>
>>> type(d_Kernel_rows[1])
<type 'int'>
>>> filterx
array([ 0.01396019, 0.02230832, 0.03348875, 0.04722672, 0.06256524,
0.07786369, 0.09103188, 0.09997895, 0.10315263, 0.09997895,
0.09103188, 0.07786369, 0.06256524, 0.04722672, 0.03348875,
0.02230832, 0.01396019], dtype=float32)
>>> filterx.shape
(17,)
>>> KERNEL_W
17
Again, I have attached a stand-alone version of the code.
Thanks!
import numpy
#from helper_functions import *
#from plotting import *
import pycuda.autoinit
import pycuda.driver as cuda
import time
import string
# from database import imread, imsave, imshow
# Pull out a bunch of stuff that was hard coded as pre-processor directives used by both the kernel and calling code.
KERNEL_RADIUS = 8                   # filter taps on each side of the center tap
UNROLL_INNER_LOOP = False           # if True, splice a Python-generated unrolled loop into the CUDA source
KERNEL_W = 2 * KERNEL_RADIUS + 1    # total filter width (17 taps)
ROW_TILE_W = 128                    # image columns handled per block in the row pass (see convolutionRowGPU)
KERNEL_RADIUS_ALIGNED = 16          # apron padding aligned to half-warp size for coalesced loads
COLUMN_TILE_W = 16                  # tile width (columns) for the column pass
COLUMN_TILE_H = 48                  # tile height (rows) for the column pass
# The CUDA source is assembled as one big string and later run through
# string.Template, so the $NAME placeholders below get substituted with
# the Python-level constants above before compilation.
template = '''
//24-bit multiplication is faster on G80,
//but we must be sure to multiply integers
//only within [-8M, 8M - 1] range
#define IMUL(a, b) __mul24(a, b)
////////////////////////////////////////////////////////////////////////////////
// Kernel configuration
////////////////////////////////////////////////////////////////////////////////
#define KERNEL_RADIUS $KERNEL_RADIUS
#define KERNEL_W $KERNEL_W
__device__ __constant__ float d_Kernel_rows[KERNEL_W];
__device__ __constant__ float d_Kernel_columns[KERNEL_W];
// Assuming ROW_TILE_W, KERNEL_RADIUS_ALIGNED and dataW
// are multiples of coalescing granularity size,
// all global memory operations are coalesced in convolutionRowGPU()
#define ROW_TILE_W $ROW_TILE_W
#define KERNEL_RADIUS_ALIGNED $KERNEL_RADIUS_ALIGNED
// Assuming COLUMN_TILE_W and dataW are multiples
// of coalescing granularity size, all global memory operations
// are coalesced in convolutionColumnGPU()
#define COLUMN_TILE_W $COLUMN_TILE_W
#define COLUMN_TILE_H $COLUMN_TILE_H'''
# Ignore the ugly templated unrolling code...
# NOTE(review): the triple-quoted string below is a bare expression
# statement -- it is never assigned or appended, so the C++
# template-based loop unrolling is deliberately kept OUT of the
# compiled CUDA source.
'''
////////////////////////////////////////////////////////////////////////////////
// Loop unrolling templates, needed for best performance
////////////////////////////////////////////////////////////////////////////////
template<int i> __device__ float convolutionRow(float *data){
return
data[KERNEL_RADIUS - i] * d_Kernel[i]
+ convolutionRow<i - 1>(data);
}
template<> __device__ float convolutionRow<-1>(float *data){
return 0;
}
template<int i> __device__ float convolutionColumn(float *data){
return
data[(KERNEL_RADIUS - i) * COLUMN_TILE_W] * d_Kernel[i]
+ convolutionColumn<i - 1>(data);
}
template<> __device__ float convolutionColumn<-1>(float *data){
return 0;
}'''
# Row-convolution kernel: loads a tile plus its apron into shared memory,
# synchronizes, then accumulates the filter taps from d_Kernel_rows.
template += '''
////////////////////////////////////////////////////////////////////////////////
// Row convolution filter
////////////////////////////////////////////////////////////////////////////////
__global__ void convolutionRowGPU(
float *d_Result,
float *d_Data,
int dataW,
int dataH
){
//Data cache
__shared__ float data[KERNEL_RADIUS + ROW_TILE_W + KERNEL_RADIUS];
//Current tile and apron limits, relative to row start
const int tileStart = IMUL(blockIdx.x, ROW_TILE_W);
const int tileEnd = tileStart + ROW_TILE_W - 1;
const int apronStart = tileStart - KERNEL_RADIUS;
const int apronEnd = tileEnd + KERNEL_RADIUS;
//Clamp tile and apron limits by image borders
const int tileEndClamped = min(tileEnd, dataW - 1);
const int apronStartClamped = max(apronStart, 0);
const int apronEndClamped = min(apronEnd, dataW - 1);
//Row start index in d_Data[]
const int rowStart = IMUL(blockIdx.y, dataW);
//Aligned apron start. Assuming dataW and ROW_TILE_W are multiples
//of half-warp size, rowStart + apronStartAligned is also a
//multiple of half-warp size, thus having proper alignment
//for coalesced d_Data[] read.
const int apronStartAligned = tileStart - KERNEL_RADIUS_ALIGNED;
const int loadPos = apronStartAligned + threadIdx.x;
//Set the entire data cache contents
//Load global memory values, if indices are within the image borders,
//or initialize with zeroes otherwise
if(loadPos >= apronStart){
const int smemPos = loadPos - apronStart;
data[smemPos] =
((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ?
d_Data[rowStart + loadPos] : 0;
}
//Ensure the completness of the loading stage
//because results, emitted by each thread depend on the data,
//loaded by another threads
__syncthreads();
const int writePos = tileStart + threadIdx.x;
//Assuming dataW and ROW_TILE_W are multiples of half-warp size,
//rowStart + tileStart is also a multiple of half-warp size,
//thus having proper alignment for coalesced d_Result[] write.
if(writePos <= tileEndClamped){
const int smemPos = writePos - apronStart;
float sum = 0;
'''
# Ignore ugly, broken loop unrolling
# NOTE(review): another bare-string expression statement -- the
# #ifdef UNROLL_INNER variant is intentionally never appended.
'''
#ifdef UNROLL_INNER
sum = convolutionRow<2 * KERNEL_RADIUS>(data + smemPos);
#else
'''
# Plain C loop over the filter taps for the row pass.
originalLoop = '''
for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++)
sum += data[smemPos + k] * d_Kernel_rows[KERNEL_RADIUS - k];
'''
# Python-side unrolling: emit one accumulation statement per tap.
# Negative $k values expand to e.g. "KERNEL_RADIUS - -8", which is valid C.
unrolledLoop = ''
for k in range(-KERNEL_RADIUS, KERNEL_RADIUS+1):
    loopTemplate = string.Template('sum += data[smemPos + $k] * d_Kernel_rows[KERNEL_RADIUS - $k];\n')
    unrolledLoop += loopTemplate.substitute(k=k)
#print unrolledLoop
# Choose the inner-loop flavor at template-build time.
template += unrolledLoop if UNROLL_INNER_LOOP else originalLoop
# Tail of the row kernel, then the column-convolution kernel, which
# walks a COLUMN_TILE_W x COLUMN_TILE_H tile in strides of blockDim.y.
template += '''
d_Result[rowStart + writePos] = sum;
}
}
////////////////////////////////////////////////////////////////////////////////
// Column convolution filter
////////////////////////////////////////////////////////////////////////////////
__global__ void convolutionColumnGPU(
float *d_Result,
float *d_Data,
int dataW,
int dataH,
int smemStride,
int gmemStride
){
//Data cache
__shared__ float data[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];
//Current tile and apron limits, in rows
const int tileStart = IMUL(blockIdx.y, COLUMN_TILE_H);
const int tileEnd = tileStart + COLUMN_TILE_H - 1;
const int apronStart = tileStart - KERNEL_RADIUS;
const int apronEnd = tileEnd + KERNEL_RADIUS;
//Clamp tile and apron limits by image borders
const int tileEndClamped = min(tileEnd, dataH - 1);
const int apronStartClamped = max(apronStart, 0);
const int apronEndClamped = min(apronEnd, dataH - 1);
//Current column index
const int columnStart = IMUL(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;
//Shared and global memory indices for current column
int smemPos = IMUL(threadIdx.y, COLUMN_TILE_W) + threadIdx.x;
int gmemPos = IMUL(apronStart + threadIdx.y, dataW) + columnStart;
//Cycle through the entire data cache
//Load global memory values, if indices are within the image borders,
//or initialize with zero otherwise
for(int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y){
data[smemPos] =
((y >= apronStartClamped) && (y <= apronEndClamped)) ?
d_Data[gmemPos] : 0;
smemPos += smemStride;
gmemPos += gmemStride;
}
//Ensure the completness of the loading stage
//because results, emitted by each thread depend on the data,
//loaded by another threads
__syncthreads();
//Shared and global memory indices for current column
smemPos = IMUL(threadIdx.y + KERNEL_RADIUS, COLUMN_TILE_W) + threadIdx.x;
gmemPos = IMUL(tileStart + threadIdx.y , dataW) + columnStart;
//Cycle through the tile body, clamped by image borders
//Calculate and output the results
for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y){
float sum = 0;
'''
# NOTE(review): discarded bare-string expression again -- the template
# unrolled variant for the column pass is never appended.
'''
#ifdef UNROLL_INNER
sum = convolutionColumn<2 * KERNEL_RADIUS>(data + smemPos);
#else
'''
# Plain C loop over the filter taps for the column pass.
originalLoop = '''
for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++)
sum += data[smemPos + IMUL(k, COLUMN_TILE_W)] * d_Kernel_columns[KERNEL_RADIUS - k];
'''
# Python-side unrolling, one accumulation statement per tap.
unrolledLoop = ''
for k in range(-KERNEL_RADIUS, KERNEL_RADIUS+1):
    loopTemplate = string.Template('sum += data[smemPos + IMUL($k, COLUMN_TILE_W)] * d_Kernel_columns[KERNEL_RADIUS - $k];\n')
    unrolledLoop += loopTemplate.substitute(k=k)
#print unrolledLoop
template += unrolledLoop if UNROLL_INNER_LOOP else originalLoop
# Tail of the column kernel.
template += '''
d_Result[gmemPos] = sum;
smemPos += smemStride;
gmemPos += gmemStride;
}
}
'''
# Substitute the Python-side constants into the CUDA source and compile it.
nvidia_separable_convolution_kernel_template = string.Template(template)
nvidia_separable_convolution_kernel_code = nvidia_separable_convolution_kernel_template.substitute(KERNEL_RADIUS = KERNEL_RADIUS, KERNEL_W = KERNEL_W, COLUMN_TILE_H=COLUMN_TILE_H, COLUMN_TILE_W=COLUMN_TILE_W, ROW_TILE_W=ROW_TILE_W, KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED)
module = cuda.SourceModule(nvidia_separable_convolution_kernel_code)
convolutionRowGPU = module.get_function('convolutionRowGPU')
convolutionColumnGPU = module.get_function('convolutionColumnGPU')
# get_global() returns a (device_address, size_in_bytes) tuple for the
# named __constant__ symbol -- e.g. (16778496, 68) in the debugger dump
# above (68 bytes = 17 float32 taps).  memcpy_htod needs only the
# address, i.e. element [0] of these tuples.
d_Kernel_rows = module.get_global('d_Kernel_rows')
d_Kernel_columns = module.get_global('d_Kernel_columns')
# HACK: The docs for module say it should return an int, but it's returning a tuple instead... The first entry looks correct.
#if type(d_Kernel_rows) == type((1, )):
# print 'Applying nasty hack to fix constant memory reference'
# d_Kernel_rows = d_Kernel_rows[0]
# d_Kernel_columns = d_Kernel_columns[0]
# Helper functions for computing alignment...
def iDivUp(a, b):
    """Round a / b up to the nearest integer (ceiling division).

    BUG FIX: the original used `/`, which on numpy.int32 operands is
    true division under Python 3 and yields a float; `//` keeps the
    result integral on both Python 2 and 3.
    """
    a = numpy.int32(a)
    b = numpy.int32(b)
    return (a // b + 1) if (a % b != 0) else (a // b)
def iDivDown(a, b):
    """Round a / b down to the nearest integer (floor division).

    BUG FIX: uses `//` instead of `/` so the result stays an integer
    under Python 3's true division for numpy.int32 operands.
    """
    a = numpy.int32(a)
    b = numpy.int32(b)
    return a // b
def iAlignUp(a, b):
    """Align a to the nearest multiple of b that is >= a."""
    a = numpy.int32(a)
    b = numpy.int32(b)
    remainder = a % b
    if remainder != 0:
        # Round up to the next multiple.
        return a - remainder + b
    return a
def iAlignDown(a, b):
    """Align a to the nearest multiple of b that is <= a."""
    a = numpy.int32(a)
    b = numpy.int32(b)
    remainder = a % b
    return a - remainder
def gaussian_kernel(width = KERNEL_W, sigma = 4.0):
    """Return a normalized 1-D Gaussian filter of `width` float32 taps."""
    assert width == numpy.floor(width), 'argument width should be an integer!'
    half = (width - 1) / 2.0
    # Sample points symmetric about zero.
    samples = numpy.float32(numpy.linspace(-half, half, width))
    sigma = numpy.float32(sigma)
    # Unnormalized Gaussian: exp(-x^2 / (2 sigma^2)).
    weights = numpy.exp(-(samples * samples / (2 * sigma * sigma)))
    total = weights.sum()
    assert total > 0, 'something very wrong if gaussian kernel sums to zero!'
    # Normalize so the taps sum to one.
    return weights / total
def derivative_of_gaussian_kernel(width = KERNEL_W, sigma = 4):
    """Return a derivative-of-Gaussian filter of `width` float32 taps.

    Rescaled so that convolving it with the ramp x reports a slope of 1,
    then negated for use as a convolution kernel (starts positive, then
    goes negative).
    """
    assert width == numpy.floor(width), 'argument width should be an integer!'
    half = (width - 1) / 2.0
    samples = numpy.float32(numpy.linspace(-half, half, width))
    # The derivative of a Gaussian is just the Gaussian times x, up to scale.
    weights = gaussian_kernel(width, sigma)
    weights *= samples
    # Rescale so the filter returns a derivative of 1 when applied to x.
    weights /= (samples * weights).sum()
    # Flip the sign for convolution orientation.
    weights *= -1.0
    return weights
def test_derivative_of_gaussian_kernel():
    """The filter must measure a slope of -2 on the ramp x = 2*i,
    for both an even (20) and an odd (19) width."""
    for width, sigma in ((20, 10.0), (19, 10.0)):
        dog = derivative_of_gaussian_kernel(width, sigma)
        ramp = numpy.float32(2 * numpy.arange(0, width))
        response = (dog * ramp).sum()
        assert abs(response - (-2.0)) < .0001, 'derivative of gaussian failed scale test!'
def convolution_cuda(sourceImage, filterx, filtery):
    """Perform separable convolution on sourceImage using CUDA.

    Operates on float32 images with row-major storage.  filterx and
    filtery are the 1-D taps for the row and column passes; each must
    have exactly KERNEL_W elements, since the CUDA source is compiled
    for that fixed width.  Returns the filtered image as a new array.
    """
    destImage = sourceImage.copy()
    assert sourceImage.dtype == 'float32', 'source image must be float32'
    (imageHeight, imageWidth) = sourceImage.shape
    assert filterx.shape == filtery.shape == (KERNEL_W, ), 'Kernel is compiled for a different kernel size! Try changing KERNEL_W'
    filterx = numpy.float32(filterx)
    filtery = numpy.float32(filtery)
    # Pad the working width up to a multiple of 16 (half-warp) as the
    # kernel's coalescing assumptions require.
    DATA_W = iAlignUp(imageWidth, 16)
    DATA_H = imageHeight
    # Prepare device arrays.
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    # BUG FIX: module.get_global() returns a (device_address, size_in_bytes)
    # tuple, while memcpy_htod expects just the address -- pass element [0].
    # The filter taps go into constant memory via the symbols declared in
    # the kernel source (d_Kernel_rows / d_Kernel_columns).
    cuda.memcpy_htod(d_Kernel_rows[0], filterx)
    cuda.memcpy_htod(d_Kernel_columns[0], filtery)
    # Grid dimensions must be plain Python ints; scalar kernel arguments
    # must be numpy.int32 (per the patch discussed in the thread above).
    blockGridRows = (int(iDivUp(DATA_W, ROW_TILE_W)), int(DATA_H))
    blockGridColumns = (int(iDivUp(DATA_W, COLUMN_TILE_W)), int(iDivUp(DATA_H, COLUMN_TILE_H)))
    threadBlockRows = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1, 1)
    threadBlockColumns = (COLUMN_TILE_W, 8, 1)
    DATA_H = numpy.int32(DATA_H)
    DATA_W = numpy.int32(DATA_W)
    # Row pass: sourceImage -> intermediateImage.
    convolutionRowGPU(intermediateImage_gpu, sourceImage_gpu, DATA_W, DATA_H, grid=blockGridRows, block=threadBlockRows)
    # BUG FIX: the column pass was commented out, so destImage_gpu was
    # never written and the readback below returned uninitialized device
    # memory (the reported "garbage output").
    # Column pass: intermediateImage -> destImage.
    convolutionColumnGPU(destImage_gpu, intermediateImage_gpu, DATA_W, DATA_H, numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]), numpy.int32(DATA_W * threadBlockColumns[1]), grid=blockGridColumns, block=threadBlockColumns)
    # Pull the result back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    return destImage
# Sample image path for manual testing; currently unused -- the
# imread() call in test_convolution_cuda() is commented out.
testImageName = '/Volumes/Data/svn_images/complete_training/Zhenhui_Li/001_2008_11_12_19_06_05_front_001.bmp'
def test_convolution_cuda():
    """Smoke-test the CUDA separable convolution on a random image."""
    # Random 768x1024 float32 image in [0, 255).
    #original = imread(testImageName)
    source = numpy.random.rand(768, 1024) * 255
    #print 'Showing image before filtering...'
    #imshow(source)
    source = numpy.float32(source)
    gauss = gaussian_kernel()
    #gauss = numpy.zeros(KERNEL_W, dtype='float32')
    # Pre-fill the destination with NaNs so any stale data is obvious.
    result = source.copy()
    result[:] = numpy.nan
    result = convolution_cuda(source, gauss, gauss)
    #result = numpy.uint8(result)
    #print 'Showing image after filtering...'
    #imshow(result)
if __name__ == '__main__':
    # Run the CUDA convolution smoke test when executed as a script.
    test_convolution_cuda()
    #test_derivative_of_gaussian_kernel()
    #boo = raw_input('Pausing so you can look at results... <Enter> to finish...')
_______________________________________________
PyCUDA mailing list
[email protected]
http://tiker.net/mailman/listinfo/pycuda_tiker.net