On Dienstag 09 Februar 2010, Bogdan Opanchuk wrote:
> Hello,
> 
> Yet another stupid question. Most probably, I missed something
> obvious, but anyway - can someone explain why I get some NaN's in
> output for the program (listed below)? Surprisingly, bug disappears if
> I send '1' instead of '-1' as a third parameter to function (or remove
> 'int' parameters completely and leave only two pointers). Same kernel
> in pure Cuda works fine. Looks like memory corruption, but I can't
> figure out where it happens...

This looks like a compiler bug to me. I've attached the PTX that the CUDA 3.0
compiler (nvopencc) generates. Apparently all your loops get unrolled, and
then something gets confused, though I wasn't able to track down what
exactly.

A couple more data points:
- Even in the first case (that you report as being ok), I get floating
  point garbage in the first 32 entries of b_gpu.
- Adding an index bounds check to the second for loop also appears to
  fix things.

Have you reported this to Nvidia? (If not, you should.)

Andreas

PS: Sorry for the long absence everybody. I was at a workshop, and then
had lots to do on my return. Plus I have a thesis coming up, so please
bear with me. :)

        .version 1.4
        .target sm_13
        // compiled with /home/andreas/pool/cuda-3.0/open64/lib//be
        // nvopencc 3.0 built on 2009-10-26

        //-----------------------------------------------------------
        // Compiling kernel.cpp3.i (/tmp/ccBI#.vZIo77)
        //-----------------------------------------------------------

        //-----------------------------------------------------------
        // Options:
        //-----------------------------------------------------------
        //  Target:ptx, ISA:sm_13, Endian:little, Pointer Size:64
        //  -O3 (Optimization level)
        //  -g0 (Debug level)
        //  -m2 (Report advisories)
        //-----------------------------------------------------------

        .file   1       "<command-line>"
        .file   2       "kernel.cudafe2.gpu"
        .file   3       "/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h"
        .file   4       
"/home/andreas/pool/cuda/bin/../include/crt/device_runtime.h"
        .file   5       "/home/andreas/pool/cuda/bin/../include/host_defines.h"
        .file   6       "/home/andreas/pool/cuda/bin/../include/builtin_types.h"
        .file   7       "/home/andreas/pool/cuda/bin/../include/device_types.h"
        .file   8       "/home/andreas/pool/cuda/bin/../include/driver_types.h"
        .file   9       "/home/andreas/pool/cuda/bin/../include/surface_types.h"
        .file   10      "/home/andreas/pool/cuda/bin/../include/texture_types.h"
        .file   11      "/home/andreas/pool/cuda/bin/../include/vector_types.h"
        .file   12      
"/home/andreas/pool/cuda/bin/../include/device_launch_parameters.h"
        .file   13      
"/home/andreas/pool/cuda/bin/../include/crt/storage_class.h"
        .file   14      "/usr/include/bits/types.h"
        .file   15      "/usr/include/time.h"
        .file   16      
"/home/andreas/pool/cuda/bin/../include/texture_fetch_functions.h"
        .file   17      
"/home/andreas/pool/cuda/bin/../include/common_functions.h"
        .file   18      
"/home/andreas/pool/cuda/bin/../include/crt/func_macro.h"
        .file   19      
"/home/andreas/pool/cuda/bin/../include/math_functions.h"
        .file   20      
"/home/andreas/pool/cuda/bin/../include/device_functions.h"
        .file   21      
"/home/andreas/pool/cuda/bin/../include/math_constants.h"
        .file   22      
"/home/andreas/pool/cuda/bin/../include/sm_11_atomic_functions.h"
        .file   23      
"/home/andreas/pool/cuda/bin/../include/sm_12_atomic_functions.h"
        .file   24      
"/home/andreas/pool/cuda/bin/../include/sm_13_double_functions.h"
        .file   25      
"/home/andreas/pool/cuda/bin/../include/sm_20_atomic_functions.h"
        .file   26      
"/home/andreas/pool/cuda/bin/../include/sm_20_intrinsics.h"
        .file   27      
"/home/andreas/pool/cuda/bin/../include/surface_functions.h"
        .file   28      
"/home/andreas/pool/cuda/bin/../include/math_functions_dbl_ptx3.h"
        .file   29      "kernel.cu"


        //-----------------------------------------------------------
        // Kernel entry point `test`.
        //
        // Parameters:
        //   __cudaparm_test_in  (.u64) - never read in this PTX
        //   __cudaparm_test_out (.u64) - base address of the global stores
        //   __cudaparm_test_dir (.s32) - never read in this PTX
        //   __cudaparm_test_S   (.s32) - never read in this PTX
        //
        // Per-thread flow as emitted (all loops appear fully unrolled):
        //   1. zero 16 f32 registers (%f2, %f4, ..., %f32)
        //   2. idx = (tid.x % 16) * 128 + tid.x / 16
        //   3. store 0.0 to 16 shared-memory slots starting at element
        //      idx, 8 elements (32 bytes) apart
        //   4. bar.sync
        //   5. load the same 16 shared slots back into the registers
        //   6. bar.sync
        //   7. store the 16 values to out[idx + 8*k], k = 0..15
        //
        // NOTE(review): the highest shared element touched is idx + 120,
        // and the buffer holds 8192 / 4 = 2048 f32 slots, so accesses stay
        // in bounds only for tid.x < 128. The launch configuration is not
        // visible here - confirm against the host code.
        //-----------------------------------------------------------
        .entry test (
                .param .u64 __cudaparm_test_in,
                .param .u64 __cudaparm_test_out,
                .param .s32 __cudaparm_test_dir,
                .param .s32 __cudaparm_test_S)
        {
        // Virtual register pools: 17 x 32-bit, 8 x 64-bit, 81 x f32.
        .reg .u32 %r<17>;
        .reg .u64 %rd<8>;
        .reg .f32 %f<81>;
        // 8192-byte shared buffer, addressed below as f32 elements.
        .shared .align 4 .b8 __cuda_sMem24[8192];
        .loc    29      3       0
$LBB1_test:
        .loc    29      8       0
        // Zero-initialize the 16 even-numbered registers %f2..%f32; each
        // goes through an odd-numbered temporary holding the constant 0.0.
        mov.f32         %f1, 0f00000000;        // 0
        mov.f32         %f2, %f1;
        mov.f32         %f3, 0f00000000;        // 0
        mov.f32         %f4, %f3;
        mov.f32         %f5, 0f00000000;        // 0
        mov.f32         %f6, %f5;
        mov.f32         %f7, 0f00000000;        // 0
        mov.f32         %f8, %f7;
        mov.f32         %f9, 0f00000000;        // 0
        mov.f32         %f10, %f9;
        mov.f32         %f11, 0f00000000;       // 0
        mov.f32         %f12, %f11;
        mov.f32         %f13, 0f00000000;       // 0
        mov.f32         %f14, %f13;
        mov.f32         %f15, 0f00000000;       // 0
        mov.f32         %f16, %f15;
        mov.f32         %f17, 0f00000000;       // 0
        mov.f32         %f18, %f17;
        mov.f32         %f19, 0f00000000;       // 0
        mov.f32         %f20, %f19;
        mov.f32         %f21, 0f00000000;       // 0
        mov.f32         %f22, %f21;
        mov.f32         %f23, 0f00000000;       // 0
        mov.f32         %f24, %f23;
        mov.f32         %f25, 0f00000000;       // 0
        mov.f32         %f26, %f25;
        mov.f32         %f27, 0f00000000;       // 0
        mov.f32         %f28, %f27;
        mov.f32         %f29, 0f00000000;       // 0
        mov.f32         %f30, %f29;
        mov.f32         %f31, 0f00000000;       // 0
        mov.f32         %f32, %f31;
        .loc    29      17      0
        // %r1 = tid.x, widened from u16 to s32 (therefore never negative).
        cvt.s32.u16     %r1, %tid.x;
        // Signed divide-by-16 with round-toward-zero: add a bias of 15
        // when the sign bit is set, then shift right by 4. The bias is
        // always 0 here because %r1 >= 0.  %r6 = tid.x / 16
        shr.s32         %r2, %r1, 31;
        mov.s32         %r3, 15;
        and.b32         %r4, %r2, %r3;
        add.s32         %r5, %r4, %r1;
        shr.s32         %r6, %r5, 4;
        // %r8 = tid.x - (tid.x / 16) * 16 = tid.x % 16
        mul.lo.s32      %r7, %r6, 16;
        sub.s32         %r8, %r1, %r7;
        // Same divide sequence recomputed: %r13 = tid.x / 16
        shr.s32         %r9, %r1, 31;
        mov.s32         %r10, 15;
        and.b32         %r11, %r9, %r10;
        add.s32         %r12, %r11, %r1;
        shr.s32         %r13, %r12, 4;
        // %r15 = idx = (tid.x % 16) * 128 + tid.x / 16
        mul.lo.s32      %r14, %r8, 128;
        add.s32         %r15, %r13, %r14;
        // %rd2 = idx * 4 (byte offset of an f32 element);
        // %rd4 = address of shared element idx.
        cvt.s64.s32     %rd1, %r15;
        mul.lo.u64      %rd2, %rd1, 4;
        mov.u64         %rd3, __cuda_sMem24;
        add.u64         %rd4, %rd2, %rd3;
        // Write 0.0 to shared elements idx + 8*k for k = 0..15
        // (byte offsets 0, 32, ..., 480).
        mov.f32         %f33, 0f00000000;       // 0
        st.shared.f32   [%rd4+0], %f33;
        mov.f32         %f34, 0f00000000;       // 0
        st.shared.f32   [%rd4+32], %f34;
        mov.f32         %f35, 0f00000000;       // 0
        st.shared.f32   [%rd4+64], %f35;
        mov.f32         %f36, 0f00000000;       // 0
        st.shared.f32   [%rd4+96], %f36;
        mov.f32         %f37, 0f00000000;       // 0
        st.shared.f32   [%rd4+128], %f37;
        mov.f32         %f38, 0f00000000;       // 0
        st.shared.f32   [%rd4+160], %f38;
        mov.f32         %f39, 0f00000000;       // 0
        st.shared.f32   [%rd4+192], %f39;
        mov.f32         %f40, 0f00000000;       // 0
        st.shared.f32   [%rd4+224], %f40;
        mov.f32         %f41, 0f00000000;       // 0
        st.shared.f32   [%rd4+256], %f41;
        mov.f32         %f42, 0f00000000;       // 0
        st.shared.f32   [%rd4+288], %f42;
        mov.f32         %f43, 0f00000000;       // 0
        st.shared.f32   [%rd4+320], %f43;
        mov.f32         %f44, 0f00000000;       // 0
        st.shared.f32   [%rd4+352], %f44;
        mov.f32         %f45, 0f00000000;       // 0
        st.shared.f32   [%rd4+384], %f45;
        mov.f32         %f46, 0f00000000;       // 0
        st.shared.f32   [%rd4+416], %f46;
        mov.f32         %f47, 0f00000000;       // 0
        st.shared.f32   [%rd4+448], %f47;
        mov.f32         %f48, 0f00000000;       // 0
        st.shared.f32   [%rd4+480], %f48;
        .loc    29      18      0
        // Barrier 0, placed between the shared stores and the loads.
        bar.sync        0;
        .loc    29      21      0
        // Read back the same 16 shared elements this thread just wrote
        // (same base %rd4, same offsets) into %f2, %f4, ..., %f32.
        ld.shared.f32   %f49, [%rd4+0];
        mov.f32         %f2, %f49;
        ld.shared.f32   %f50, [%rd4+32];
        mov.f32         %f4, %f50;
        ld.shared.f32   %f51, [%rd4+64];
        mov.f32         %f6, %f51;
        ld.shared.f32   %f52, [%rd4+96];
        mov.f32         %f8, %f52;
        ld.shared.f32   %f53, [%rd4+128];
        mov.f32         %f10, %f53;
        ld.shared.f32   %f54, [%rd4+160];
        mov.f32         %f12, %f54;
        ld.shared.f32   %f55, [%rd4+192];
        mov.f32         %f14, %f55;
        ld.shared.f32   %f56, [%rd4+224];
        mov.f32         %f16, %f56;
        ld.shared.f32   %f57, [%rd4+256];
        mov.f32         %f18, %f57;
        ld.shared.f32   %f58, [%rd4+288];
        mov.f32         %f20, %f58;
        ld.shared.f32   %f59, [%rd4+320];
        mov.f32         %f22, %f59;
        ld.shared.f32   %f60, [%rd4+352];
        mov.f32         %f24, %f60;
        ld.shared.f32   %f61, [%rd4+384];
        mov.f32         %f26, %f61;
        ld.shared.f32   %f62, [%rd4+416];
        mov.f32         %f28, %f62;
        ld.shared.f32   %f63, [%rd4+448];
        mov.f32         %f30, %f63;
        ld.shared.f32   %f64, [%rd4+480];
        mov.f32         %f32, %f64;
        .loc    29      22      0
        // Second barrier, between the shared loads and the global stores.
        bar.sync        0;
        .loc    29      25      0
        // %rd6 = out + idx * 4; the byte offset %rd2 computed for shared
        // memory is reused for the global buffer.
        ld.param.u64    %rd5, [__cudaparm_test_out];
        add.u64         %rd6, %rd5, %rd2;
        // Write the 16 register values to out[idx + 8*k], k = 0..15.
        mov.f32         %f65, %f2;
        st.global.f32   [%rd6+0], %f65;
        mov.f32         %f66, %f4;
        st.global.f32   [%rd6+32], %f66;
        mov.f32         %f67, %f6;
        st.global.f32   [%rd6+64], %f67;
        mov.f32         %f68, %f8;
        st.global.f32   [%rd6+96], %f68;
        mov.f32         %f69, %f10;
        st.global.f32   [%rd6+128], %f69;
        mov.f32         %f70, %f12;
        st.global.f32   [%rd6+160], %f70;
        mov.f32         %f71, %f14;
        st.global.f32   [%rd6+192], %f71;
        mov.f32         %f72, %f16;
        st.global.f32   [%rd6+224], %f72;
        mov.f32         %f73, %f18;
        st.global.f32   [%rd6+256], %f73;
        mov.f32         %f74, %f20;
        st.global.f32   [%rd6+288], %f74;
        mov.f32         %f75, %f22;
        st.global.f32   [%rd6+320], %f75;
        mov.f32         %f76, %f24;
        st.global.f32   [%rd6+352], %f76;
        mov.f32         %f77, %f26;
        st.global.f32   [%rd6+384], %f77;
        mov.f32         %f78, %f28;
        st.global.f32   [%rd6+416], %f78;
        mov.f32         %f79, %f30;
        st.global.f32   [%rd6+448], %f79;
        // The last store uses %f64 (the value loaded from [%rd4+480])
        // directly instead of a copy of %f32; both hold the same value.
        st.global.f32   [%rd6+480], %f64;
        .loc    29      26      0
        exit;
$LDWend_test:
        } // test

Attachment: signature.asc
Description: This is a digitally signed message part.

_______________________________________________
PyCUDA mailing list
pyc...@host304.hostmonster.com
http://host304.hostmonster.com/mailman/listinfo/pycuda_tiker.net

Reply via email to