This problem was solved by the following commands:

sudo ln -s /usr/lib/nvidia-325/libcuda.so /usr/lib/libcuda.so
sudo ln -s /usr/lib/nvidia-325/libcuda.so.1 /usr/lib/libcuda.so.1

But I now get a message saying

TypeError: No registered converter was available to produce a C++ rvalue of
type unsigned int from this Python object of type float

when I run the matrix multiplication example code in PyCUDA's documentation.

This is an error I would see only on ArchLinux, but my Ubuntu laptop with
PyCUDA hasn't ever returned such an error. I have seen the mailing list and
others have had similar woes, but I couldn't find a working solution to
this problem. My code is reproduced below:

-----------------------------------------------------------------------------
# Output on laptop (Ubuntu 13.04 64-bit + python-pycuda from Ubuntu repos)

$ python matmult3.py
Enter matrix size: 4
Enter tile size: 2
('MATRIX SIZE = ', 4)
('TILE_WIDTH  = ', 2)
('CPU Time ', 1.7881393432617188e-05)
('GPU Time ', 0.00027298927307128906)
('L2 Norm: ', 0.0)
$
-----------------------------------------------------------------------------
# Output on desktop (Ubuntu 13.04 64-bit + nvidia-325 + CUDA from NVIDIA +
PyCUDA 2013 installed from the .tar.gz file)

$ python3 matmult3.py
Enter matrix size: 4
Enter tile size: 2
Traceback (most recent call last):
  File "matmult3.py", line 128, in <module>
    block = (TILE_SIZE, TILE_SIZE, 1),
  File
"/usr/local/lib/python3.3/dist-packages/pycuda-2013.1.1-py3.3-linux-x86_64.egg/pycuda/driver.py",
line 374, in function_call
    func._launch_kernel(grid, block, arg_buf, shared, None)
TypeError: No registered converter was able to produce a C++ rvalue of type
unsigned int from this Python object of type float
$
------------------------------

-----------------------------
CODE
-----------------------------------------------------------
# attempt to do matrix multiplication for complex numbers

import pycuda.autoinit
from pycuda import driver, compiler, gpuarray, tools
import numpy as np
from numpy import linalg
from time import *

kernel_code_template = """
                #include <cuComplex.h>
__global__ void MatrixMulKernel(cuFloatComplex *A, cuFloatComplex *B,
cuFloatComplex *C)
 {
      const uint wA = %(MATRIX_SIZE)s;
      const uint wB = %(MATRIX_SIZE)s;

      // Block index
      const uint bx = blockIdx.x;
      const uint by = blockIdx.y;

      // Thread index
      const uint tx = threadIdx.x;
      const uint ty = threadIdx.y;

      // Index of the first sub-matrix of A processed by the block
      const uint aBegin = wA * %(BLOCK_SIZE)s * by;
      // Index of the last sub-matrix of A processed by the block
      const uint aEnd   = aBegin + wA - 1;
      // Step size used to iterate through the sub-matrices of A
      const uint aStep = %(BLOCK_SIZE)s;

      // Index of the first sub-matrix of B processed by the block
      const int bBegin = %(BLOCK_SIZE)s * bx;
      // Step size used to iterate through the sub-matrcies of B
      const uint bStep = %(BLOCK_SIZE)s * wB;

      // The element of the block sub-matrix that is computed by the thread
      cuFloatComplex Csub = make_cuFloatComplex(0,0);
      // Loop over all the sub-matrices of A and B required to compute the
block sub-matrix
      for (int a = aBegin, b = bBegin;
           a <= aEnd;
   a += aStep, b += bStep)
      {
           // Shared memory for the sub-matrix of A
   __shared__ cuFloatComplex As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
   // Shared memory for the sub-matrix of B
   __shared__ cuFloatComplex Bs[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];

   // Load the matrices from global memory to shared memory;
   // each thread loads one element of each matrix
   As[ty][tx] = make_cuFloatComplex(cuCrealf(A[a + wA*ty +
tx]),cuCimagf(A[a + wA*ty + tx]));
   Bs[ty][tx] = make_cuFloatComplex(cuCrealf(B[b + wB*ty +
tx]),cuCimagf(B[b + wA*ty + tx]));

   // Synchronize to make sure the matrices are loaded
   __syncthreads();

   // Multiply the two matrcies together
   // each thread computes one element of the block sub-matrix
   for(int k = 0; k < %(BLOCK_SIZE)s; ++k)
   {
        Csub = cuCaddf(Csub,cuCmulf(As[ty][k],Bs[k][tx]));
   }

   // Synchronize to make sure that the preceding computation
   // is done before loading two new sub-matrices of A and B in the next
iteration
   __syncthreads();
     }

     // Write the block sub-matrix to global memory
     // each thread writes one element
     const uint c = wB * %(BLOCK_SIZE)s * by + %(BLOCK_SIZE)s * bx;
     C[c + wB*ty + tx] = make_cuFloatComplex(cuCrealf(Csub),
cuCimagf(Csub));
}
"""

MATRIX_SIZE = int(input("Enter matrix size: "))
TILE_SIZE   = int(input("Enter tile size: "))

#MATRIX_SIZE = 16
#TILE_SIZE  = 4
BLOCK_SIZE = TILE_SIZE

## create two random square matrices
a_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.complex64)
b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.complex64)

#a_cpu = np.zeros(shape=(MATRIX_SIZE,MATRIX_SIZE)).astype(np.complex64)
#b_cpu = np.zeros(shape=(MATRIX_SIZE,MATRIX_SIZE)).astype(np.complex64)

#a_cpu[:,:] = 1 + 1j*0
#b_cpu[:,:] = 1 + 1j*0

# compute reference on the CPU to verify GPU computation
t1 = time()
c_cpu = np.dot(a_cpu, b_cpu)
t2 = time()
t_cpu = t2-t1

# transfer host (CPU) memory to device (GPU) memory
a_gpu = gpuarray.to_gpu(a_cpu.astype(np.complex64))
b_gpu = gpuarray.to_gpu(b_cpu.astype(np.complex64))

# create empty gpuarry for the result (C = A * B)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.complex64)

# get the kernel code from the template
# by specifying the constant MATRIX_SIZE
kernel_code = kernel_code_template % {
'MATRIX_SIZE': MATRIX_SIZE,
'BLOCK_SIZE': BLOCK_SIZE,
 }

# compile the kernel code
mod = compiler.SourceModule(kernel_code)

# get the kernel function from the compiled module
matrixmul = mod.get_function("MatrixMulKernel")

# call the kernel on the card
t1 = time()
matrixmul(
# inputs
 a_gpu, b_gpu,
# output
c_gpu,
# grid of multiple blocks
 grid = (MATRIX_SIZE/TILE_SIZE, MATRIX_SIZE/TILE_SIZE),
# block of multiple threads
block = (TILE_SIZE, TILE_SIZE, 1),
 )
t2 = time()
t_gpu = t2-t1

# print the results
#print("-" * 80)
#print("Matrix A (GPU): ")
#print(a_gpu.get())

#print("-" * 80)
#print("Matrix B (GPU): ")
#print(b_gpu.get())

#print("-" * 80)
#print("Matrix C (GPU): ")
#print(c_gpu.get())

#print("-" * 80)
#print("Matrix C (CPU): ")
#print(c_cpu)

#print("-" * 80)
#print("CPU-GPU Difference: ")
#print(c_cpu-c_gpu.get())

print("MATRIX SIZE = ", MATRIX_SIZE)
print("TILE_WIDTH  = ", TILE_SIZE)
print("CPU Time ", t_cpu)
print("GPU Time ", t_gpu)

print("L2 Norm: ", linalg.norm(c_cpu-c_gpu.get()))

np.allclose(c_cpu, c_gpu.get() )
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to