Re: [PyCUDA] Porting nvidia's separable convolution example to pycuda: C++ templates, loop unrolling

Andrew Wagner Sat, 13 Jun 2009 15:46:30 -0700

OK, I'm completely stuck on my port of the convolution example.

It seems no matter how I call the kernel it complains that my call
doesn't match the interface.  For most of the calling variations I
tried, the types it claimed I passed in were not the types I actually
passed in, which certainly isn't helping debugging.


to be precise, on line 337 of the attached convolution.py, I'm getting:

The debugged program raised the exception TypeError
"invalid type on parameter #2 (0-based)"
File: 
/usr/lib/python2.5/site-packages/pycuda-0.92-py2.5-linux-x86_64.egg/pycuda/driver.py,
Line: 78

The code shouldn't have any dependencies outside of pycuda and numpy,
so it should be easy to run.

Also, pycuda.Driver.Module.get_global seems to return a length 2
tuple, while pycuda.Driver.memcpy_htod expects the reference to be an
integer.  I got past this error by pulling out the first entry of the
tuple, which seems like the address, but I'm not sure if this is
correct.  This is for transferring the convolution kernel (the filter
parameters, not the cuda kernel) into constant memory.

I'm using pycuda 0.92 with CUDA 2.1 on Debian 5.0.1 with a kernel that
a friend had to help me recompile to be compatible with NVIDIA's
drivers.

Any ideas?

Thanks!
Drew

import numpy
#from helper_functions import *
#from plotting import *
import pycuda.autoinit
import pycuda.driver as cuda
import time
import string
#from database import imread,  imsave,  imshow

# Pull out a bunch of stuff that was hard coded as pre-processor directives used by both the kernel and calling code.
KERNEL_RADIUS = 8
UNROLL_INNER_LOOP = False
KERNEL_W = 2 * KERNEL_RADIUS + 1
ROW_TILE_W = 128
KERNEL_RADIUS_ALIGNED = 16
COLUMN_TILE_W = 16
COLUMN_TILE_H = 48
template = '''
//24-bit multiplication is faster on G80,
//but we must be sure to multiply integers
//only within [-8M, 8M - 1] range
#define IMUL(a, b) __mul24(a, b)

////////////////////////////////////////////////////////////////////////////////
// Kernel configuration
////////////////////////////////////////////////////////////////////////////////
#define KERNEL_RADIUS $KERNEL_RADIUS
#define KERNEL_W $KERNEL_W
__device__ __constant__ float d_Kernel_rows[KERNEL_W];
__device__ __constant__ float d_Kernel_columns[KERNEL_W];

// Assuming ROW_TILE_W, KERNEL_RADIUS_ALIGNED and dataW 
// are multiples of coalescing granularity size,
// all global memory operations are coalesced in convolutionRowGPU()
#define            ROW_TILE_W  $ROW_TILE_W
#define KERNEL_RADIUS_ALIGNED  $KERNEL_RADIUS_ALIGNED

// Assuming COLUMN_TILE_W and dataW are multiples
// of coalescing granularity size, all global memory operations 
// are coalesced in convolutionColumnGPU()
#define COLUMN_TILE_W $COLUMN_TILE_W
#define COLUMN_TILE_H $COLUMN_TILE_H'''

# Ignore the ugly templated unrolling code...
'''
////////////////////////////////////////////////////////////////////////////////
// Loop unrolling templates, needed for best performance
////////////////////////////////////////////////////////////////////////////////
template<int i> __device__ float convolutionRow(float *data){
    return
        data[KERNEL_RADIUS - i] * d_Kernel[i]
        + convolutionRow<i - 1>(data);
}

template<> __device__ float convolutionRow<-1>(float *data){
    return 0;
}

template<int i> __device__ float convolutionColumn(float *data){
    return 
        data[(KERNEL_RADIUS - i) * COLUMN_TILE_W] * d_Kernel[i]
        + convolutionColumn<i - 1>(data);
}

template<> __device__ float convolutionColumn<-1>(float *data){
    return 0;
}'''

template += '''
////////////////////////////////////////////////////////////////////////////////
// Row convolution filter
////////////////////////////////////////////////////////////////////////////////
__global__ void convolutionRowGPU(
    float *d_Result,
    float *d_Data,
    int dataW,
    int dataH
){
    //Data cache
    __shared__ float data[KERNEL_RADIUS + ROW_TILE_W + KERNEL_RADIUS];

    //Current tile and apron limits, relative to row start
    const int         tileStart = IMUL(blockIdx.x, ROW_TILE_W);
    const int           tileEnd = tileStart + ROW_TILE_W - 1;
    const int        apronStart = tileStart - KERNEL_RADIUS;
    const int          apronEnd = tileEnd   + KERNEL_RADIUS;

    //Clamp tile and apron limits by image borders
    const int    tileEndClamped = min(tileEnd, dataW - 1);
    const int apronStartClamped = max(apronStart, 0);
    const int   apronEndClamped = min(apronEnd, dataW - 1);

    //Row start index in d_Data[]
    const int          rowStart = IMUL(blockIdx.y, dataW);

    //Aligned apron start. Assuming dataW and ROW_TILE_W are multiples 
    //of half-warp size, rowStart + apronStartAligned is also a 
    //multiple of half-warp size, thus having proper alignment 
    //for coalesced d_Data[] read.
    const int apronStartAligned = tileStart - KERNEL_RADIUS_ALIGNED;

    const int loadPos = apronStartAligned + threadIdx.x;
    //Set the entire data cache contents
    //Load global memory values, if indices are within the image borders,
    //or initialize with zeroes otherwise
    if(loadPos >= apronStart){
        const int smemPos = loadPos - apronStart;

        data[smemPos] = 
            ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ?
            d_Data[rowStart + loadPos] : 0;
    }

    //Ensure the completness of the loading stage
    //because results, emitted by each thread depend on the data,
    //loaded by another threads
    __syncthreads();
    const int writePos = tileStart + threadIdx.x;
    //Assuming dataW and ROW_TILE_W are multiples of half-warp size,
    //rowStart + tileStart is also a multiple of half-warp size,
    //thus having proper alignment for coalesced d_Result[] write.
    if(writePos <= tileEndClamped){
        const int smemPos = writePos - apronStart;
        float sum = 0;
'''
# Ignore ugly, broken loop unrolling
'''
#ifdef UNROLL_INNER
        sum = convolutionRow<2 * KERNEL_RADIUS>(data + smemPos);
#else
'''
originalLoop = '''
        for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++)
            sum += data[smemPos + k] * d_Kernel_rows[KERNEL_RADIUS - k];
'''
unrolledLoop = ''
for k in range(-KERNEL_RADIUS,  KERNEL_RADIUS+1):
    loopTemplate = string.Template('sum += data[smemPos + $k] * d_Kernel_rows[KERNEL_RADIUS - $k];\n')
    unrolledLoop += loopTemplate.substitute(k=k)    

#print unrolledLoop
template += unrolledLoop if UNROLL_INNER_LOOP else originalLoop
template += '''
        d_Result[rowStart + writePos] = sum;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Column convolution filter
////////////////////////////////////////////////////////////////////////////////
__global__ void convolutionColumnGPU(
    float *d_Result,
    float *d_Data,
    int dataW,
    int dataH,
    int smemStride,
    int gmemStride
){
    //Data cache
    __shared__ float data[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];

    //Current tile and apron limits, in rows
    const int         tileStart = IMUL(blockIdx.y, COLUMN_TILE_H);
    const int           tileEnd = tileStart + COLUMN_TILE_H - 1;
    const int        apronStart = tileStart - KERNEL_RADIUS;
    const int          apronEnd = tileEnd   + KERNEL_RADIUS;

    //Clamp tile and apron limits by image borders
    const int    tileEndClamped = min(tileEnd, dataH - 1);
    const int apronStartClamped = max(apronStart, 0);
    const int   apronEndClamped = min(apronEnd, dataH - 1);

    //Current column index
    const int       columnStart = IMUL(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;

    //Shared and global memory indices for current column
    int smemPos = IMUL(threadIdx.y, COLUMN_TILE_W) + threadIdx.x;
    int gmemPos = IMUL(apronStart + threadIdx.y, dataW) + columnStart;
    //Cycle through the entire data cache
    //Load global memory values, if indices are within the image borders,
    //or initialize with zero otherwise
    for(int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y){
        data[smemPos] = 
        ((y >= apronStartClamped) && (y <= apronEndClamped)) ? 
        d_Data[gmemPos] : 0;
        smemPos += smemStride;
        gmemPos += gmemStride;
    }

    //Ensure the completness of the loading stage
    //because results, emitted by each thread depend on the data, 
    //loaded by another threads
    __syncthreads();
    //Shared and global memory indices for current column
    smemPos = IMUL(threadIdx.y + KERNEL_RADIUS, COLUMN_TILE_W) + threadIdx.x;
    gmemPos = IMUL(tileStart + threadIdx.y , dataW) + columnStart;
    //Cycle through the tile body, clamped by image borders
    //Calculate and output the results
    for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y){
        float sum = 0;
'''
'''
#ifdef UNROLL_INNER
        sum = convolutionColumn<2 * KERNEL_RADIUS>(data + smemPos);
#else
'''
originalLoop = '''
        for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++)
            sum += data[smemPos + IMUL(k, COLUMN_TILE_W)] * d_Kernel_columns[KERNEL_RADIUS - k];
'''
unrolledLoop = ''
for k in range(-KERNEL_RADIUS,  KERNEL_RADIUS+1):
    loopTemplate = string.Template('sum += data[smemPos + IMUL($k, COLUMN_TILE_W)] * d_Kernel_columns[KERNEL_RADIUS - $k];\n')
    unrolledLoop += loopTemplate.substitute(k=k)    

#print unrolledLoop
template += unrolledLoop if UNROLL_INNER_LOOP else originalLoop
template += '''
        d_Result[gmemPos] = sum;
        smemPos += smemStride;
        gmemPos += gmemStride;
    }
}
'''
nvidia_separable_convolution_kernel_template = string.Template(template)
nvidia_separable_convolution_kernel_code = nvidia_separable_convolution_kernel_template.substitute(KERNEL_RADIUS = KERNEL_RADIUS,  KERNEL_W = KERNEL_W,  COLUMN_TILE_H=COLUMN_TILE_H,  COLUMN_TILE_W=COLUMN_TILE_W,  ROW_TILE_W=ROW_TILE_W,  KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED)

module = cuda.SourceModule(nvidia_separable_convolution_kernel_code)
convolutionRowGPU = module.get_function('convolutionRowGPU')
convolutionColumnGPU = module.get_function('convolutionColumnGPU')
d_Kernel_rows = module.get_global('d_Kernel_rows')
d_Kernel_columns = module.get_global('d_Kernel_columns')
# HACK: The docs for module say it should return an int, but it's returning a tuple instead...  The first entry looks correct.
if type(d_Kernel_rows) == type((1, )):
    print 'Applying nasty hack to fix constant memory reference'
    d_Kernel_rows = d_Kernel_rows[0]
    d_Kernel_columns = d_Kernel_columns[0]

# Helper functions for computing alignment...
def iDivUp(a, b):
    # Round a / b to nearest higher integer value
    a = numpy.int32(a)
    b = numpy.int32(b)
    return (a / b + 1) if (a % b != 0) else (a / b)

def iDivDown(a, b):
    # Round a / b to nearest lower integer value
    a = numpy.int32(a)
    b = numpy.int32(b)
    return a / b;

def iAlignUp(a, b):
    # Align a to nearest higher multiple of b
    a = numpy.int32(a)
    b = numpy.int32(b)
    return (a - a % b + b) if (a % b != 0) else a

def iAlignDown(a, b):
    # Align a to nearest lower multiple of b
    a = numpy.int32(a)
    b = numpy.int32(b)
    return a - a % b

def gaussian_kernel(width = KERNEL_W, sigma = 4.0):
    assert width == numpy.floor(width),  'argument width should be an integer!'
    radius = (width - 1)/2.0
    x = numpy.linspace(-radius,  radius,  width)
    x = numpy.float32(x)
    sigma = numpy.float32(sigma)
    filter = x*x / (2 * sigma * sigma)
    filter = numpy.exp(-1 * filter)
    assert filter.sum()>0,  'something very wrong if gaussian kernel sums to zero!'
    filter /= filter.sum()
    return filter

def derivative_of_gaussian_kernel(width = KERNEL_W, sigma = 4):
    assert width == numpy.floor(width),  'argument width should be an integer!'
    radius = (width - 1)/2.0
    x = numpy.linspace(-radius,  radius,  width)
    x = numpy.float32(x)
    # The derivative of a gaussian is really just a gaussian times x, up to scale.
    filter = gaussian_kernel(width,  sigma)
    filter *= x
    # Rescale so that filter returns derivative of 1 when applied to x:
    scale = (x * filter).sum()
    filter /= scale
    # Careful with sign; this will be uses as a ~convolution kernel, so should start positive, then go negative.
    filter *= -1.0
    return filter

def test_derivative_of_gaussian_kernel():
    width = 20
    sigma = 10.0
    filter = derivative_of_gaussian_kernel(width,  sigma)
    x = 2 * numpy.arange(0, width)
    x = numpy.float32(x)
    response = (filter * x).sum()
    assert abs(response - (-2.0)) < .0001, 'derivative of gaussian failed scale test!'
    width = 19
    sigma = 10.0
    filter = derivative_of_gaussian_kernel(width,  sigma)
    x = 2 * numpy.arange(0, width)
    x = numpy.float32(x)
    response = (filter * x).sum()
    assert abs(response - (-2.0)) < .0001, 'derivative of gaussian failed scale test!'

def convolution_cuda(sourceImage,  filterx,  filtery):
    # Perform separable convolution on sourceImage using CUDA.
    # Operates on floating point images with row-major storage.
    destImage = sourceImage.copy()
    assert sourceImage.dtype == 'float32',  'source image must be float32'
    (imageHeight,  imageWidth) = sourceImage.shape
    assert filterx.shape == filtery.shape == (KERNEL_W, ) ,  'Kernel is compiled for a different kernel size! Try changing KERNEL_W'
    filterx = numpy.float32(filterx)
    filtery = numpy.float32(filtery)
    DATA_W = iAlignUp(imageWidth, 16);
    DATA_H = imageHeight;
    BYTES_PER_WORD = 4;  # 4 for float32
    DATA_SIZE = DATA_W * DATA_H * BYTES_PER_WORD; 
    KERNEL_SIZE = KERNEL_W * BYTES_PER_WORD;
    # Prepare device arrays
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    cuda.memcpy_htod(d_Kernel_rows,  filterx) # The kernel goes into constant memory via a symbol defined in the kernel
    cuda.memcpy_htod(d_Kernel_columns,  filtery)
    # Call the kernels for convolution in each direction.
    blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H)
    blockGridColumns = (iDivUp(DATA_W, COLUMN_TILE_W), iDivUp(DATA_H, COLUMN_TILE_H))
    threadBlockRows = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1, 1)
    threadBlockColumns = (COLUMN_TILE_W, 8, 1)
    DATA_H = int(DATA_H)
    DATA_W = int(DATA_W)
#    DATA_H = numpy.int32(DATA_H)
#    DATA_W = numpy.int32(DATA_W)
    convolutionRowGPU(intermediateImage_gpu,  sourceImage_gpu,  DATA_W,  DATA_H,  grid=blockGridRows,  block=threadBlockRows)
    convolutionColumnGPU(destImage_gpu,  intermediateImage_gpu,  DATA_W,  DATA_H,  COLUMN_TILE_W * threadBlockColumns[1],  DATA_W * threadBlockColumns[1],  grid=blockGridColumns,  block=threadBlockColumns)
    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage,  destImage_gpu)
    return destImage

testImageName = '/Volumes/Data/svn_images/complete_training/Zhenhui_Li/001_2008_11_12_19_06_05_front_001.bmp'
def test_convolution_cuda():
    # Test the convolution kernel.
    #original = imread(testImageName)
    original = numpy.random.rand(768,  1024) * 255
    #imshow(original)
    original = numpy.float32(original)
    filter = gaussian_kernel()
    destImage = original.copy()
    destImage[:] = numpy.nan
    convolution_cuda(original,  filter,  filter)
    destImage = numpy.uint8(destImage)
    #imshow(destImage)

if __name__ == '__main__':
    test_convolution_cuda()
    #test_derivative_of_gaussian_kernel()
    boo = raw_input('Pausing so you can look at results... <Enter> to finish...')

_______________________________________________
PyCUDA mailing list
[email protected]
http://tiker.net/mailman/listinfo/pycuda_tiker.net

Re: [PyCUDA] Porting nvidia's separable convolution example to pycuda: C++ templates, loop unrolling

Reply via email to