Dnia 2010-09-28, wto o godzinie 23:56 +0200, Tomasz Rybak pisze: > Dnia 2010-09-28, wto o godzinie 00:29 -0700, jmcarval pisze: > > Thanks for your reply. > > I've read the first thread you mention, that ends without a solution > > http://pycuda.2962900.n2.nabble.com/PyCUDA-pycuda-test-failures-tp5320194p5320194.html > > > > Maybe I'm doing a huge mistake but it does not seem to be a precision > > detail. > > The following code (a simplification of test_gpuarray), returns 30 from the > > CPU and 14 from the GTX480, either with integer, float32 or float64. > > I don't get it. Can anybody explain me what I'm doing wrong please? > > Thanks > > > > import pycuda.autoinit > > import numpy > > import pycuda.gpuarray as gpuarray > > from pycuda.curandom import rand as curand > > > > a = numpy.array([1,2,3,4])#.astype(numpy.float32) > > a_gpu = gpuarray.to_gpu(a) > > b = a > > b_gpu = gpuarray.to_gpu(b) > > > > dot_ab = numpy.dot(a, b) > > > > dot_ab_gpu = gpuarray.dot(a_gpu, b_gpu).get() > > > > print "CPU dot product:", dot_ab > > print "GPU dot product:", dot_ab_gpu > > > > > > I have idea for (maybe) checking whether problem is with PyCUDA, > CUDA toolkit, or driver. > Can you force PyCUDA to generate not sm_20 code, but 1x? > I have found that it is determined in line 190 of file > pycuda/compiler.py: > arch = "sm_%d%d" % Context.get_device().compute_capability() > Try to change it to > arch = "sm_10" > and so on, and check whether you get incorrect 14 in such > a case. > > If there is simpler way of changing architecture to which > PyCUDA generates code, feel free to use it and share this > information.
I started analysing assembly (PTX) generated by nvcc. Short analysis follows - no solution so far, but I found difference (probably caused by optimisation) in generated PTX. 64-bit Debian, CUDA 3.1, drivers 256.53. I have not tried CUDA 3.2 - it requires drivers 260.24, and I do not want mess to much with system and installing unpackaged drivers. ReductionKernel stage1 generated by GPUArray.dot is in file stage1.cu. It is not so interesting function that just reduces array in tree structure with some loops and conditions. Loop's exit condition is guarded by s and seq_count. Code generated for sm_11 is in stage1-sm11.cu Not so interesting. Code generated for sm_20 is in stage1-sm20.cu Also not so interesting, but there are differences between those PTX generated for sm_11 and sm_20. One is that tid is 32-bit on sm_20. (lines 64 and 65) Another (might be cause of this error) is that for sm_11 values of s and seq_count (held in registers %r9 and %r1 respectively) are copied to registers during each loop. In file stage1-sm1.cu those are in lines 72 and 80. There is not copying of those values in sm_20 - diff on those two files shows that it is the main difference! Could this be reason? I tried to disable optimisation (-O0 given to nvcc in compiler.py) but did not succeeded. Hope it gives someone a clue. -- Tomasz Rybak <[email protected]> GPG/PGP key ID: 2AD5 9860 Fingerprint A481 824E 7DD3 9C0E C40A 488E C654 FB33 2AD5 9860 http://member.acm.org/~tomaszrybak
extern "C" {
#define BLOCK_SIZE 512
#define READ_AND_MAP(i) (a[i]*b[i])
#define REDUCE(a, b) (a+b)
typedef long out_type;
__global__ void reduce_kernel_stage1(out_type *out, const long *a,
const long *b,
unsigned int seq_count, unsigned int n)
{
__shared__ out_type sdata[BLOCK_SIZE];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*BLOCK_SIZE*seq_count + tid;
out_type acc = 0;
for (unsigned s = 0; s < seq_count; ++s)
{
if (i >= n)
break;
acc = REDUCE(acc, READ_AND_MAP(i));
i += BLOCK_SIZE;
}
sdata[tid] = acc;
__syncthreads();
#if (BLOCK_SIZE >= 512)
if (tid < 256) { sdata[tid] = REDUCE(sdata[tid], sdata[tid + 256]);
}
__syncthreads();
#endif
#if (BLOCK_SIZE >= 256)
if (tid < 128) { sdata[tid] = REDUCE(sdata[tid], sdata[tid + 128]);
}
__syncthreads();
#endif
#if (BLOCK_SIZE >= 128)
if (tid < 64) { sdata[tid] = REDUCE(sdata[tid], sdata[tid + 64]); }
__syncthreads();
#endif
if (tid < 32)
{
if (BLOCK_SIZE >= 64) sdata[tid] = REDUCE(sdata[tid], sdata[tid +
32]);
if (BLOCK_SIZE >= 32) sdata[tid] = REDUCE(sdata[tid], sdata[tid +
16]);
if (BLOCK_SIZE >= 16) sdata[tid] = REDUCE(sdata[tid], sdata[tid +
8]);
if (BLOCK_SIZE >= 8) sdata[tid] = REDUCE(sdata[tid], sdata[tid +
4]);
if (BLOCK_SIZE >= 4) sdata[tid] = REDUCE(sdata[tid], sdata[tid +
2]);
if (BLOCK_SIZE >= 2) sdata[tid] = REDUCE(sdata[tid], sdata[tid +
1]);
}
if (tid == 0) out[blockIdx.x] = sdata[0];
}
}
.version 1.4
.target sm_11, map_f64_to_f32
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 3.1 built on 2010-06-07
//-----------------------------------------------------------
// Compiling kernel.cpp3.i (/tmp/ccBI#.7MMQwv)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_11, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-linux-gnu/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/bin/../include/host_defines.h"
.file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
.file 7 "/usr/local/cuda/bin/../include/device_types.h"
.file 8 "/usr/local/cuda/bin/../include/driver_types.h"
.file 9 "/usr/local/cuda/bin/../include/surface_types.h"
.file 10 "/usr/local/cuda/bin/../include/texture_types.h"
.file 11 "/usr/local/cuda/bin/../include/vector_types.h"
.file 12
"/usr/local/cuda/bin/../include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/bin/../include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "kernel.cu"
.file 17
"/usr/local/cuda/bin/../include/texture_fetch_functions.h"
.file 18 "/usr/local/cuda/bin/../include/common_functions.h"
.file 19 "/usr/local/cuda/bin/../include/math_functions.h"
.file 20 "/usr/local/cuda/bin/../include/math_constants.h"
.file 21 "/usr/local/cuda/bin/../include/device_functions.h"
.file 22
"/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
.file 23
"/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
.file 24
"/usr/local/cuda/bin/../include/sm_13_double_functions.h"
.file 25
"/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
.file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
.file 27 "/usr/local/cuda/bin/../include/surface_functions.h"
.file 28
"/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
.entry reduce_kernel_stage1 (
.param .u64 __cudaparm_reduce_kernel_stage1_out,
.param .u64 __cudaparm_reduce_kernel_stage1_a,
.param .u64 __cudaparm_reduce_kernel_stage1_b,
.param .u32 __cudaparm_reduce_kernel_stage1_seq_count,
.param .u32 __cudaparm_reduce_kernel_stage1_n)
{
.reg .u32 %r<18>;
.reg .u64 %rd<43>;
.reg .pred %p<10>;
.shared .align 8 .b8 __cuda___cuda_local_var_20967_44_sdata32[4096];
.loc 16 12 0
$LDWbegin_reduce_kernel_stage1:
.loc 16 18 0
ld.param.u32 %r1, [__cudaparm_reduce_kernel_stage1_seq_count];
cvt.u32.u16 %r2, %ctaid.x;
cvt.u32.u16 %r3, %tid.x;
mul.lo.u32 %r4, %r1, %r2;
mul.lo.u32 %r5, %r4, 512;
add.u32 %r6, %r3, %r5;
mov.u32 %r7, 0;
setp.eq.u32 %p1, %r1, %r7;
@%p1 bra $Lt_0_13826;
ld.param.u32 %r1, [__cudaparm_reduce_kernel_stage1_seq_count];
mov.s32 %r8, %r1;
ld.param.u32 %r9, [__cudaparm_reduce_kernel_stage1_n];
mov.u32 %r10, 0;
mov.s64 %rd1, 0;
mov.s32 %r11, %r8;
$Lt_0_10754:
//<loop> Loop body line 18, nesting depth: 1, estimated iterations: unknown
ld.param.u32 %r9, [__cudaparm_reduce_kernel_stage1_n];
.loc 16 24 0
setp.ge.u32 %p2, %r6, %r9;
@%p2 bra $Lt_0_10242;
//<loop> Part of loop body line 18, head labeled $Lt_0_10754
.loc 16 25 0
cvt.u64.u32 %rd2, %r6;
mul.wide.u32 %rd3, %r6, 8;
ld.param.u64 %rd4, [__cudaparm_reduce_kernel_stage1_a];
add.u64 %rd5, %rd4, %rd3;
ld.global.s64 %rd6, [%rd5+0];
ld.param.u64 %rd7, [__cudaparm_reduce_kernel_stage1_b];
add.u64 %rd8, %rd7, %rd3;
ld.global.s64 %rd9, [%rd8+0];
mul.lo.s64 %rd10, %rd6, %rd9;
add.s64 %rd1, %rd1, %rd10;
.loc 16 27 0
add.u32 %r6, %r6, 512;
.loc 16 21 0
add.u32 %r10, %r10, 1;
.loc 16 18 0
ld.param.u32 %r1, [__cudaparm_reduce_kernel_stage1_seq_count];
.loc 16 21 0
setp.ne.u32 %p3, %r10, %r1;
@%p3 bra $Lt_0_10754;
bra.uni $Lt_0_10242;
$Lt_0_13826:
mov.s64 %rd1, 0;
$Lt_0_10242:
$Lt_0_258:
.loc 16 30 0
mov.u64 %rd11, __cuda___cuda_local_var_20967_44_sdata32;
cvt.u64.u32 %rd12, %r3;
mul.wide.u32 %rd13, %r3, 8;
add.u64 %rd14, %rd11, %rd13;
st.shared.s64 [%rd14+0], %rd1;
.loc 16 32 0
bar.sync 0;
mov.u32 %r12, 255;
setp.gt.u32 %p4, %r3, %r12;
@%p4 bra $Lt_0_11266;
.loc 16 35 0
ld.shared.s64 %rd15, [%rd14+0];
ld.shared.s64 %rd16, [%rd14+2048];
add.s64 %rd17, %rd15, %rd16;
st.shared.s64 [%rd14+0], %rd17;
$Lt_0_11266:
.loc 16 36 0
bar.sync 0;
mov.u32 %r13, 127;
setp.gt.u32 %p5, %r3, %r13;
@%p5 bra $Lt_0_11778;
.loc 16 40 0
ld.shared.s64 %rd18, [%rd14+0];
ld.shared.s64 %rd19, [%rd14+1024];
add.s64 %rd20, %rd18, %rd19;
st.shared.s64 [%rd14+0], %rd20;
$Lt_0_11778:
.loc 16 41 0
bar.sync 0;
mov.u32 %r14, 63;
setp.gt.u32 %p6, %r3, %r14;
@%p6 bra $Lt_0_12290;
.loc 16 45 0
ld.shared.s64 %rd21, [%rd14+0];
ld.shared.s64 %rd22, [%rd14+512];
add.s64 %rd23, %rd21, %rd22;
st.shared.s64 [%rd14+0], %rd23;
$Lt_0_12290:
.loc 16 46 0
bar.sync 0;
mov.u32 %r15, 31;
setp.gt.u32 %p7, %r3, %r15;
@%p7 bra $Lt_0_12802;
.loc 16 51 0
ld.shared.s64 %rd24, [%rd14+0];
ld.shared.s64 %rd25, [%rd14+256];
add.s64 %rd26, %rd24, %rd25;
st.shared.s64 [%rd14+0], %rd26;
.loc 16 52 0
ld.shared.s64 %rd27, [%rd14+128];
add.s64 %rd28, %rd27, %rd26;
st.shared.s64 [%rd14+0], %rd28;
.loc 16 53 0
ld.shared.s64 %rd29, [%rd14+64];
add.s64 %rd30, %rd29, %rd28;
st.shared.s64 [%rd14+0], %rd30;
.loc 16 54 0
ld.shared.s64 %rd31, [%rd14+32];
add.s64 %rd32, %rd31, %rd30;
st.shared.s64 [%rd14+0], %rd32;
.loc 16 55 0
ld.shared.s64 %rd33, [%rd14+16];
add.s64 %rd34, %rd33, %rd32;
st.shared.s64 [%rd14+0], %rd34;
.loc 16 56 0
ld.shared.s64 %rd35, [%rd14+8];
add.s64 %rd36, %rd35, %rd34;
st.shared.s64 [%rd14+0], %rd36;
$Lt_0_12802:
mov.u32 %r16, 0;
setp.ne.u32 %p8, %r3, %r16;
@%p8 bra $Lt_0_13314;
.loc 16 59 0
ld.shared.s64 %rd37, [__cuda___cuda_local_var_20967_44_sdata32+0];
ld.param.u64 %rd38, [__cudaparm_reduce_kernel_stage1_out];
cvt.u64.u32 %rd39, %r2;
mul.wide.u32 %rd40, %r2, 8;
add.u64 %rd41, %rd38, %rd40;
st.global.s64 [%rd41+0], %rd37;
$Lt_0_13314:
.loc 16 60 0
exit;
$LDWend_reduce_kernel_stage1:
} // reduce_kernel_stage1
.version 2.1
.target sm_20
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 3.1 built on 2010-06-07
//-----------------------------------------------------------
// Compiling kernel.cpp3.i (/tmp/ccBI#.g8WbAf)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-linux-gnu/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/bin/../include/host_defines.h"
.file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
.file 7 "/usr/local/cuda/bin/../include/device_types.h"
.file 8 "/usr/local/cuda/bin/../include/driver_types.h"
.file 9 "/usr/local/cuda/bin/../include/surface_types.h"
.file 10 "/usr/local/cuda/bin/../include/texture_types.h"
.file 11 "/usr/local/cuda/bin/../include/vector_types.h"
.file 12
"/usr/local/cuda/bin/../include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/bin/../include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "kernel.cu"
.file 17
"/usr/local/cuda/bin/../include/texture_fetch_functions.h"
.file 18 "/usr/local/cuda/bin/../include/common_functions.h"
.file 19 "/usr/local/cuda/bin/../include/math_functions.h"
.file 20 "/usr/local/cuda/bin/../include/math_constants.h"
.file 21 "/usr/local/cuda/bin/../include/device_functions.h"
.file 22
"/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
.file 23
"/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
.file 24
"/usr/local/cuda/bin/../include/sm_13_double_functions.h"
.file 25
"/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
.file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
.file 27 "/usr/local/cuda/bin/../include/surface_functions.h"
.file 28
"/usr/local/cuda/bin/../include/math_functions_dbl_ptx3.h"
.entry reduce_kernel_stage1 (
.param .u64 __cudaparm_reduce_kernel_stage1_out,
.param .u64 __cudaparm_reduce_kernel_stage1_a,
.param .u64 __cudaparm_reduce_kernel_stage1_b,
.param .u32 __cudaparm_reduce_kernel_stage1_seq_count,
.param .u32 __cudaparm_reduce_kernel_stage1_n)
{
.reg .u32 %r<18>;
.reg .u64 %rd<43>;
.reg .pred %p<10>;
.shared .align 8 .b8 __cuda___cuda_local_var_23980_44_sdata32[4096];
.loc 16 12 0
$LDWbegin_reduce_kernel_stage1:
.loc 16 18 0
ld.param.u32 %r1, [__cudaparm_reduce_kernel_stage1_seq_count];
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mul.lo.u32 %r4, %r1, %r2;
mul.lo.u32 %r5, %r4, 512;
add.u32 %r6, %r3, %r5;
mov.u32 %r7, 0;
setp.eq.u32 %p1, %r1, %r7;
@%p1 bra $Lt_0_13826;
mov.s32 %r8, %r1;
ld.param.u32 %r9, [__cudaparm_reduce_kernel_stage1_n];
mov.u32 %r10, 0;
mov.s64 %rd1, 0;
mov.s32 %r11, %r8;
$Lt_0_10754:
//<loop> Loop body line 18, nesting depth: 1, estimated iterations: unknown
.loc 16 24 0
setp.ge.u32 %p2, %r6, %r9;
@%p2 bra $Lt_0_10242;
//<loop> Part of loop body line 18, head labeled $Lt_0_10754
.loc 16 25 0
cvt.u64.u32 %rd2, %r6;
mul.wide.u32 %rd3, %r6, 8;
ld.param.u64 %rd4, [__cudaparm_reduce_kernel_stage1_a];
add.u64 %rd5, %rd4, %rd3;
ld.global.s64 %rd6, [%rd5+0];
ld.param.u64 %rd7, [__cudaparm_reduce_kernel_stage1_b];
add.u64 %rd8, %rd7, %rd3;
ld.global.s64 %rd9, [%rd8+0];
mul.lo.s64 %rd10, %rd6, %rd9;
add.s64 %rd1, %rd1, %rd10;
.loc 16 27 0
add.u32 %r6, %r6, 512;
.loc 16 21 0
add.u32 %r10, %r10, 1;
setp.ne.u32 %p3, %r10, %r1;
@%p3 bra $Lt_0_10754;
bra.uni $Lt_0_10242;
$Lt_0_13826:
mov.s64 %rd1, 0;
$Lt_0_10242:
$Lt_0_258:
.loc 16 30 0
mov.u64 %rd11, __cuda___cuda_local_var_23980_44_sdata32;
cvt.u64.u32 %rd12, %r3;
mul.wide.u32 %rd13, %r3, 8;
add.u64 %rd14, %rd11, %rd13;
st.shared.s64 [%rd14+0], %rd1;
.loc 16 32 0
bar.sync 0;
mov.u32 %r12, 255;
setp.gt.u32 %p4, %r3, %r12;
@%p4 bra $Lt_0_11266;
.loc 16 35 0
ld.shared.s64 %rd15, [%rd14+0];
ld.shared.s64 %rd16, [%rd14+2048];
add.s64 %rd17, %rd15, %rd16;
st.shared.s64 [%rd14+0], %rd17;
$Lt_0_11266:
.loc 16 36 0
bar.sync 0;
mov.u32 %r13, 127;
setp.gt.u32 %p5, %r3, %r13;
@%p5 bra $Lt_0_11778;
.loc 16 40 0
ld.shared.s64 %rd18, [%rd14+0];
ld.shared.s64 %rd19, [%rd14+1024];
add.s64 %rd20, %rd18, %rd19;
st.shared.s64 [%rd14+0], %rd20;
$Lt_0_11778:
.loc 16 41 0
bar.sync 0;
mov.u32 %r14, 63;
setp.gt.u32 %p6, %r3, %r14;
@%p6 bra $Lt_0_12290;
.loc 16 45 0
ld.shared.s64 %rd21, [%rd14+0];
ld.shared.s64 %rd22, [%rd14+512];
add.s64 %rd23, %rd21, %rd22;
st.shared.s64 [%rd14+0], %rd23;
$Lt_0_12290:
.loc 16 46 0
bar.sync 0;
mov.u32 %r15, 31;
setp.gt.u32 %p7, %r3, %r15;
@%p7 bra $Lt_0_12802;
.loc 16 51 0
ld.shared.s64 %rd24, [%rd14+0];
ld.shared.s64 %rd25, [%rd14+256];
add.s64 %rd26, %rd24, %rd25;
st.shared.s64 [%rd14+0], %rd26;
.loc 16 52 0
ld.shared.s64 %rd27, [%rd14+128];
add.s64 %rd28, %rd27, %rd26;
st.shared.s64 [%rd14+0], %rd28;
.loc 16 53 0
ld.shared.s64 %rd29, [%rd14+64];
add.s64 %rd30, %rd29, %rd28;
st.shared.s64 [%rd14+0], %rd30;
.loc 16 54 0
ld.shared.s64 %rd31, [%rd14+32];
add.s64 %rd32, %rd31, %rd30;
st.shared.s64 [%rd14+0], %rd32;
.loc 16 55 0
ld.shared.s64 %rd33, [%rd14+16];
add.s64 %rd34, %rd33, %rd32;
st.shared.s64 [%rd14+0], %rd34;
.loc 16 56 0
ld.shared.s64 %rd35, [%rd14+8];
add.s64 %rd36, %rd35, %rd34;
st.shared.s64 [%rd14+0], %rd36;
$Lt_0_12802:
mov.u32 %r16, 0;
setp.ne.u32 %p8, %r3, %r16;
@%p8 bra $Lt_0_13314;
.loc 16 59 0
ld.shared.s64 %rd37, [__cuda___cuda_local_var_23980_44_sdata32+0];
ld.param.u64 %rd38, [__cudaparm_reduce_kernel_stage1_out];
cvt.u64.u32 %rd39, %r2;
mul.wide.u32 %rd40, %r2, 8;
add.u64 %rd41, %rd38, %rd40;
st.global.s64 [%rd41+0], %rd37;
$Lt_0_13314:
.loc 16 60 0
exit;
$LDWend_reduce_kernel_stage1:
} // reduce_kernel_stage1
signature.asc
Description: This is a digitally signed message part
_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
