Hello everybody, As suggested I worked on matrixMul in order to have my custom gpudot. I've attached the test file. By trial and error I figured that I could not use block sizes bigger than 22. Is this imposed by graphic card type/version or am I missing something important in the example's code? (as you may have noticed I did not need to change much to obtain the same result an in numpy) Rui |
import pycuda.autoinit import pycuda.driver as drv from pycuda import gpuarray from pycuda.compiler import SourceModule import numpy as np import random from functools import partial
def getkernel(blocksize = 16,wb = 17):
mod = SourceModule(
"""
/*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation and
* any modifications thereto. Any use, reproduction, disclosure, or distribution
* of this software and related documentation without an express license
* agreement from NVIDIA Corporation is strictly prohibited.
*
*/
/* Matrix multiplication: C = A * B.
* Device code.
*/
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
// Thread block size
#define BLOCK_SIZE %i
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA BLOCK_SIZE //(3 * BLOCK_SIZE) // Matrix A width
#define HA 1 //(5 * BLOCK_SIZE) // Matrix A height
#define WB %i //(8 * BLOCK_SIZE) // Matrix B width
#define HB WA // Matrix B height
#define WC WB // Matrix C width
#define HC HA // Matrix C height
#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
#define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
#else
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
#endif
////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
__global__ void
matrixMul( float* C, float* A, float* B, int wA, int wB)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = aBegin, b = bBegin;
a <= aEnd;
a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
AS(ty, tx) = A[a + wA * ty + tx];
BS(ty, tx) = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k)
Csub += AS(ty, k) * BS(k, tx);
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_
"""%(blocksize, wb))
BLOCK_SIZE = (blocksize,blocksize,1)
GRID_SIZE = (wb,1)
return partial(mod.get_function("matrixMul"),
block = BLOCK_SIZE,
grid = GRID_SIZE)
if __name__ == '__main__':
SIZEA = 22 #maximum by trial and error
SIZEB = 256
ITERS = 2000
a = np.array([random.random() for i in range(SIZEA)], dtype = np.float32)
b = np.array([[random.random() for i in range(SIZEB)]
for j in range(SIZEA)], dtype = np.float32)
dest = np.zeros_like(b[0])
gpu_a = gpuarray.to_gpu(a)
gpu_b = gpuarray.to_gpu(b)
gpudest = gpuarray.to_gpu(dest)
fun = getkernel( SIZEA, SIZEB )
fun( gpudest, gpu_a, gpu_b, np.int32(SIZEA), np.int32(SIZEB))
print (gpudest.get() - np.dot(a,b))[::7]_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
