Hello everybody,

As suggested I worked on matrixMul in order to have my custom gpudot. I've attached the test file.
By trial and error I found that I could not use block sizes bigger than 22. Is this imposed by the graphics card type/version, or am I missing something important in the example's code? (As you may have noticed, I did not need to change much to obtain the same result as in numpy.)

Best regards,
Rui

import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
import random
from functools import partial

def getkernel(blocksize = 16,wb = 17):
    """Compile the NVIDIA SDK matrixMul kernel and return a ready-made launcher.

    The CUDA source below is the stock SDK example; only BLOCK_SIZE and the
    width of B are substituted through the two ``%i`` placeholders.  The
    returned callable is ``mod.get_function("matrixMul")`` with ``block`` and
    ``grid`` already bound via ``functools.partial``, so the caller passes only
    the kernel arguments (C, A, B, wA, wB).

    Parameters:
        blocksize -- side length of the square thread block; the launch uses
            blocksize * blocksize threads per block and WA is #defined to this
            value, so A is treated as a (1 x blocksize) row vector.
        wb -- width of matrix B (number of output columns), substituted for WB
            and also used directly as the grid's x dimension.

    NOTE(review): blocksize**2 threads per block means blocksize is capped by
    the device's MAX_THREADS_PER_BLOCK.  On GPUs with a 512-thread limit that
    gives blocksize <= 22 (22*22 = 484 <= 512 < 529 = 23*23), which presumably
    explains the empirical "22" ceiling -- confirm against
    pycuda.driver.Device.get_attribute(MAX_THREADS_PER_BLOCK).

    NOTE(review): GRID_SIZE = (wb, 1) launches wb blocks, yet each block
    already covers BLOCK_SIZE output columns; the SDK code expects
    wb / blocksize blocks, so blocks with bx >= ceil(wb / blocksize) appear to
    index past the ends of B and C -- verify this is intended.

    NOTE(review): HA is #defined to 1, so C has a single row, but threads with
    ty > 0 still execute the final store; they look like out-of-bounds writes
    past C's only row -- verify.
    """
    mod = SourceModule(
    """
/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation and
 * any modifications thereto.  Any use, reproduction, disclosure, or distribution
 * of this software and related documentation without an express license
 * agreement from NVIDIA Corporation is strictly prohibited.
 *
 */

/* Matrix multiplication: C = A * B.
 * Device code.
 */

#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_

#include <stdio.h>
        // Thread block size
#define BLOCK_SIZE %i

// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA BLOCK_SIZE //(3 * BLOCK_SIZE) // Matrix A width
#define HA 1 //(5 * BLOCK_SIZE) // Matrix A height
#define WB %i //(8 * BLOCK_SIZE) // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height


#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
#define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
#else
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
#endif

////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
__global__ void
matrixMul( float* C, float* A, float* B, int wA, int wB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {

        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

#endif // #ifndef _MATRIXMUL_KERNEL_H_
        """%(blocksize, wb))
    # (threads_x, threads_y, threads_z): a square blocksize x blocksize block.
    BLOCK_SIZE = (blocksize,blocksize,1)
    # (blocks_x, blocks_y): one block per column of B -- see NOTE in docstring.
    GRID_SIZE = (wb,1)
    # Bind the launch configuration now; callers supply only kernel arguments.
    return partial(mod.get_function("matrixMul"),
                   block = BLOCK_SIZE,
                   grid = GRID_SIZE)


if __name__ == '__main__':
    # Problem dimensions: a is a length-22 row vector, b is a 22 x 256 matrix.
    ROWS = 22    # maximum by trial and error
    COLS = 256
    ITERS = 2000  # kept from the original script; not used below

    # Host-side inputs: uniform [0, 1) samples, single precision for the GPU.
    a = np.array([random.random() for _ in range(ROWS)], dtype=np.float32)
    row_data = []
    for _ in range(ROWS):
        row_data.append([random.random() for _ in range(COLS)])
    b = np.array(row_data, dtype=np.float32)
    dest = np.zeros_like(b[0])

    # Copy both operands and the zeroed output buffer onto the device.
    gpu_a = gpuarray.to_gpu(a)
    gpu_b = gpuarray.to_gpu(b)
    gpudest = gpuarray.to_gpu(dest)

    # Build the configured launcher and run one vector-matrix product.
    gpudot = getkernel(ROWS, COLS)
    gpudot(gpudest, gpu_a, gpu_b, np.int32(ROWS), np.int32(COLS))

    # Every 7th element of (GPU result - numpy reference); should be ~0.
    print (gpudest.get() - np.dot(a, b))[::7]



_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to