Thanks. Actually, I thought this problem was somehow related to the one I
have, so I posted that simple demo.
The actual problem I have is passing a complex scalar (of a type defined in
cuComplex.h) to a kernel: it sometimes gives an incorrect result (I guess the
input argument struct is misaligned). I have attached a script that reproduces
this error.
Yiyin
#!/usr/bin/env python
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
import numpy as np
import atexit
# The driver must be initialised before a device can be selected.
cuda.init()
# Open a context on device index 1 and register its pop() to run at
# interpreter exit, so the context is released when the script ends.
gpu = cuda.Device(1)
ctx = gpu.make_context()
atexit.register(ctx.pop)
# --- Experiment 1: complex scalar passed after (int, int, pointer, int) ---
# Fill a 1000 x 1000 x 3 complex64 array on the device with the scalar 2+3j.
# With this parameter order the reporter sees wrong values in the output.
d_A = gpuarray.zeros((1000,1000,3),np.complex64)
mod = SourceModule("""
#include <cuComplex.h>
__global__ void kernel (const int M, const int N, cuFloatComplex* dst, const int K, cuFloatComplex value)
{
//M is the number of rows, N is the number of columns
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int total = gridDim.x * blockDim.x;
for(int i = tid; i < M*N*K; i+=total)
{
dst[i] = value;
}
}
""")
func = mod.get_function("kernel")
# The dtype list mirrors the kernel signature order: M, N, dst, K, value.
# NOTE(review): this is the prepare(arg_types, block) / prepared_call(grid, ...)
# calling convention -- confirm it matches the installed PyCUDA version.
func.prepare([np.int32, np.int32, np.intp, np.int32, np.complex64], (256,1,1))
func.prepared_call((84,1), d_A.shape[0], d_A.shape[1], d_A.gpudata, d_A.shape[2], 2+3j)
# print(...) with a single argument behaves the same on Python 2 and is
# valid syntax on Python 3 (the original used Python-2-only print statements).
print("this will give you enties with 3+xxj, while the true input is 2+3j")
print(d_A.get()[:,:,1])
# Reporter's observation: the real part of d_A comes back as 3 (the imaginary
# part of the input) and the intended imaginary part is missing.
# Reporter's size estimate for this argument order: calcsize('iiliF')=28.
# NOTE(review): assuming 8-byte pointers, the complex value would be packed at
# byte offset 20 here; if the kernel expects cuFloatComplex on an 8-byte
# boundary (offset 24), that would explain the shifted value -- confirm.
# --- Experiment 2: same kernel body, parameters reordered ---
# Reset the device array so results of the first experiment cannot leak in.
d_A.fill(0)
# Swap the order of `const int K` and the cuFloatComplex array pointer, so the
# signature becomes (int, int, int, pointer, complex).
mod1 = SourceModule("""
#include <cuComplex.h>
__global__ void kernel (const int M, const int N, const int K, cuFloatComplex* dst, cuFloatComplex value)
{
//M is the number of rows, N is the number of columns
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int total = gridDim.x * blockDim.x;
for(int i = tid; i < M*N*K; i+=total)
{
dst[i] = value;
}
}
""")
func1 = mod1.get_function("kernel")
# The dtype list mirrors the reordered signature: M, N, K, dst, value.
func1.prepare([np.int32, np.int32, np.int32, np.intp, np.complex64], (256,1,1))
func1.prepared_call((84,1), d_A.shape[0], d_A.shape[1], d_A.shape[2], d_A.gpudata, 2+3j)
# print(...) with a single argument behaves the same on Python 2 and is
# valid syntax on Python 3 (the original used Python-2-only print statements).
print("this should give you enties with 2+3j")
print(d_A.get()[:,:,1])
# Reporter's observation: d_A is correct with this argument order.
# Reporter's size estimate for this argument order: calcsize('iiilF')=32.
# The same problem also happens with double-precision complex numbers.
# Open question from the reporter: could the error occur when estimating the
# size of the kernel's input arguments?
# Whenever calcsize(permutation of 'iiilF')=28 the result is incorrect, but
# when calcsize=32 it is correct.
On Sep 20, 2010, at 5:04 PM, Andreas Kloeckner wrote:
> On Fri, 17 Sep 2010 14:54:11 -0400, Yiyin Zhou <[email protected]> wrote:
>> Hi,
>> I was trying to pass some complex valued numbers to a kernel, but somehow it
>> messed up. Here is an example with GPUArray that can be reproduced on
>> several of our linux servers:
>>
>> initialize...
>>
>> import pycuda.gpuarray as gpuarray
>> import numpy as np
>> d_A = gpuarray.empty((1,128), np.complex64)
>> d_A.fill(1+2j)
>> d_A
>> The result is correct
>>
>> d_A.fill(np.complex64(1+2j))
>> d_A
>> the imaginary parts of the resulting array are all zeros
>>
>> It's not necessarily a problem with complex64, in some kernels complex64 is
>> correct, but complex128 is not.
>> What could be the cause for that?
>
> Not my fault:
> http://projects.scipy.org/numpy/ticket/1617
> (just reported)
>
> Andreas
>
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda