On Dienstag 09 Februar 2010, Bogdan Opanchuk wrote:
> Hello,
>
> Yet another stupid question. Most probably, I missed something
> obvious, but anyway - can someone explain why I get some NaNs in the
> output of the program (listed below)? Surprisingly, the bug disappears if
> I send '1' instead of '-1' as the third parameter to the function (or remove
> the 'int' parameters completely and leave only the two pointers). The same kernel
> in pure CUDA works fine. It looks like memory corruption, but I can't
> figure out where it happens...
This looks like a compiler bug to me. I've attached the PTX that the 3.0 compiler generates. Apparently all your loops get unrolled, and then something gets confused, though I wasn't able to track down what exactly.

A couple more data points:

- Even in the first case (the one you report as being OK), I get floating-point garbage in the first 32 entries of b_gpu.
- Adding an index bounds check to the second for loop also appears to fix things.

Have you reported this to NVIDIA? (If not, you should.)

Andreas

PS: Sorry for the long absence, everybody. I was at a workshop, and then had lots to do on my return. Plus I have a thesis coming up, so please bear with me. :)
// PTX listing for the kernel `test` from kernel.cu, as emitted by nvopencc 3.0
// (see the compiler banner below). This is compiler output; the comments
// marked "NOTE" were added during review and describe only what the
// instructions below visibly do.
.version 1.4
.target sm_13
// compiled with /home/andreas/pool/cuda-3.0/open64/lib//be
// nvopencc 3.0 built on 2009-10-26

//-----------------------------------------------------------
// Compiling kernel.cpp3.i (/tmp/ccBI#.vZIo77)
//-----------------------------------------------------------

//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_13, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------

// NOTE: source-file table referenced by the `.loc <file> <line> <col>`
// directives; every `.loc` in this listing points at file 29 (kernel.cu).
.file 1 "<command-line>"
.file 2 "kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h"
.file 4 "/home/andreas/pool/cuda/bin/../include/crt/device_runtime.h"
.file 5 "/home/andreas/pool/cuda/bin/../include/host_defines.h"
.file 6 "/home/andreas/pool/cuda/bin/../include/builtin_types.h"
.file 7 "/home/andreas/pool/cuda/bin/../include/device_types.h"
.file 8 "/home/andreas/pool/cuda/bin/../include/driver_types.h"
.file 9 "/home/andreas/pool/cuda/bin/../include/surface_types.h"
.file 10 "/home/andreas/pool/cuda/bin/../include/texture_types.h"
.file 11 "/home/andreas/pool/cuda/bin/../include/vector_types.h"
.file 12 "/home/andreas/pool/cuda/bin/../include/device_launch_parameters.h"
.file 13 "/home/andreas/pool/cuda/bin/../include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "/home/andreas/pool/cuda/bin/../include/texture_fetch_functions.h"
.file 17 "/home/andreas/pool/cuda/bin/../include/common_functions.h"
.file 18 "/home/andreas/pool/cuda/bin/../include/crt/func_macro.h"
.file 19 "/home/andreas/pool/cuda/bin/../include/math_functions.h"
.file 20 "/home/andreas/pool/cuda/bin/../include/device_functions.h"
.file 21 "/home/andreas/pool/cuda/bin/../include/math_constants.h"
.file 22 "/home/andreas/pool/cuda/bin/../include/sm_11_atomic_functions.h"
.file 23 "/home/andreas/pool/cuda/bin/../include/sm_12_atomic_functions.h"
.file 24 "/home/andreas/pool/cuda/bin/../include/sm_13_double_functions.h"
.file 25 "/home/andreas/pool/cuda/bin/../include/sm_20_atomic_functions.h"
.file 26 "/home/andreas/pool/cuda/bin/../include/sm_20_intrinsics.h"
.file 27 "/home/andreas/pool/cuda/bin/../include/surface_functions.h"
.file 28 "/home/andreas/pool/cuda/bin/../include/math_functions_dbl_ptx3.h"
.file 29 "kernel.cu"

//-----------------------------------------------------------
// NOTE: kernel entry point `test`.
//
// Parameters, as declared:
//   __cudaparm_test_in   .u64   never loaded anywhere in this body
//   __cudaparm_test_out  .u64   base address of the final st.global stores
//   __cudaparm_test_dir  .s32   never loaded anywhere in this body
//   __cudaparm_test_S    .s32   never loaded anywhere in this body
//
// Visible behaviour: each thread derives an element index from %tid.x,
// writes 0.0 to 16 locations of the shared array __cuda_sMem24, passes a
// barrier, reads the same 16 locations back, passes a second barrier, and
// stores the 16 values to `out` at the same element offsets.
// There are no branches or loops in this body; the code is straight-line.
//-----------------------------------------------------------
.entry test (
    .param .u64 __cudaparm_test_in,
    .param .u64 __cudaparm_test_out,
    .param .s32 __cudaparm_test_dir,
    .param .s32 __cudaparm_test_S)
{
    .reg .u32 %r<17>;
    .reg .u64 %rd<8>;
    .reg .f32 %f<81>;
    // NOTE: 8192-byte shared array; accessed below only through %rd4.
    .shared .align 4 .b8 __cuda_sMem24[8192];
    .loc 29 3 0
$LBB1_test:
    .loc 29 8 0
    // NOTE: the 16 even-numbered registers %f2, %f4, ..., %f32 are each set
    // to 0.0 through an odd-numbered temporary.
    mov.f32 %f1, 0f00000000; // 0
    mov.f32 %f2, %f1;
    mov.f32 %f3, 0f00000000; // 0
    mov.f32 %f4, %f3;
    mov.f32 %f5, 0f00000000; // 0
    mov.f32 %f6, %f5;
    mov.f32 %f7, 0f00000000; // 0
    mov.f32 %f8, %f7;
    mov.f32 %f9, 0f00000000; // 0
    mov.f32 %f10, %f9;
    mov.f32 %f11, 0f00000000; // 0
    mov.f32 %f12, %f11;
    mov.f32 %f13, 0f00000000; // 0
    mov.f32 %f14, %f13;
    mov.f32 %f15, 0f00000000; // 0
    mov.f32 %f16, %f15;
    mov.f32 %f17, 0f00000000; // 0
    mov.f32 %f18, %f17;
    mov.f32 %f19, 0f00000000; // 0
    mov.f32 %f20, %f19;
    mov.f32 %f21, 0f00000000; // 0
    mov.f32 %f22, %f21;
    mov.f32 %f23, 0f00000000; // 0
    mov.f32 %f24, %f23;
    mov.f32 %f25, 0f00000000; // 0
    mov.f32 %f26, %f25;
    mov.f32 %f27, 0f00000000; // 0
    mov.f32 %f28, %f27;
    mov.f32 %f29, 0f00000000; // 0
    mov.f32 %f30, %f29;
    mov.f32 %f31, 0f00000000; // 0
    mov.f32 %f32, %f31;
    .loc 29 17 0
    // NOTE: index computation from the thread id.
    //   %r1  = tid.x (widened from u16, so never negative)
    //   %r6  = (%r1 + bias) >> 4, where bias = 15 if %r1 < 0 else 0;
    //          the bias is always 0 here because %r1 is never negative
    //   %r8  = %r1 - 16 * %r6          (tid.x modulo 16)
    //   %r13 = same quotient as %r6, recomputed
    //   %r15 = %r13 + 128 * %r8        (element index)
    cvt.s32.u16 %r1, %tid.x;
    shr.s32 %r2, %r1, 31;
    mov.s32 %r3, 15;
    and.b32 %r4, %r2, %r3;
    add.s32 %r5, %r4, %r1;
    shr.s32 %r6, %r5, 4;
    mul.lo.s32 %r7, %r6, 16;
    sub.s32 %r8, %r1, %r7;
    shr.s32 %r9, %r1, 31;
    mov.s32 %r10, 15;
    and.b32 %r11, %r9, %r10;
    add.s32 %r12, %r11, %r1;
    shr.s32 %r13, %r12, 4;
    mul.lo.s32 %r14, %r8, 128;
    add.s32 %r15, %r13, %r14;
    // NOTE: %rd2 = 4 * %r15 is the byte offset of the element; it is reused
    // below for the global stores. %rd4 = __cuda_sMem24 + %rd2.
    cvt.s64.s32 %rd1, %r15;
    mul.lo.u64 %rd2, %rd1, 4;
    mov.u64 %rd3, __cuda_sMem24;
    add.u64 %rd4, %rd2, %rd3;
    // NOTE: 16 stores of 0.0 to shared memory at byte offsets
    // 0, 32, ..., 480 from %rd4 (a stride of 8 f32 elements).
    mov.f32 %f33, 0f00000000; // 0
    st.shared.f32 [%rd4+0], %f33;
    mov.f32 %f34, 0f00000000; // 0
    st.shared.f32 [%rd4+32], %f34;
    mov.f32 %f35, 0f00000000; // 0
    st.shared.f32 [%rd4+64], %f35;
    mov.f32 %f36, 0f00000000; // 0
    st.shared.f32 [%rd4+96], %f36;
    mov.f32 %f37, 0f00000000; // 0
    st.shared.f32 [%rd4+128], %f37;
    mov.f32 %f38, 0f00000000; // 0
    st.shared.f32 [%rd4+160], %f38;
    mov.f32 %f39, 0f00000000; // 0
    st.shared.f32 [%rd4+192], %f39;
    mov.f32 %f40, 0f00000000; // 0
    st.shared.f32 [%rd4+224], %f40;
    mov.f32 %f41, 0f00000000; // 0
    st.shared.f32 [%rd4+256], %f41;
    mov.f32 %f42, 0f00000000; // 0
    st.shared.f32 [%rd4+288], %f42;
    mov.f32 %f43, 0f00000000; // 0
    st.shared.f32 [%rd4+320], %f43;
    mov.f32 %f44, 0f00000000; // 0
    st.shared.f32 [%rd4+352], %f44;
    mov.f32 %f45, 0f00000000; // 0
    st.shared.f32 [%rd4+384], %f45;
    mov.f32 %f46, 0f00000000; // 0
    st.shared.f32 [%rd4+416], %f46;
    mov.f32 %f47, 0f00000000; // 0
    st.shared.f32 [%rd4+448], %f47;
    mov.f32 %f48, 0f00000000; // 0
    st.shared.f32 [%rd4+480], %f48;
    .loc 29 18 0
    bar.sync 0;
    .loc 29 21 0
    // NOTE: reads back the same 16 shared locations this thread just wrote
    // (same base %rd4, same offsets) into %f2, %f4, ..., %f32.
    ld.shared.f32 %f49, [%rd4+0];
    mov.f32 %f2, %f49;
    ld.shared.f32 %f50, [%rd4+32];
    mov.f32 %f4, %f50;
    ld.shared.f32 %f51, [%rd4+64];
    mov.f32 %f6, %f51;
    ld.shared.f32 %f52, [%rd4+96];
    mov.f32 %f8, %f52;
    ld.shared.f32 %f53, [%rd4+128];
    mov.f32 %f10, %f53;
    ld.shared.f32 %f54, [%rd4+160];
    mov.f32 %f12, %f54;
    ld.shared.f32 %f55, [%rd4+192];
    mov.f32 %f14, %f55;
    ld.shared.f32 %f56, [%rd4+224];
    mov.f32 %f16, %f56;
    ld.shared.f32 %f57, [%rd4+256];
    mov.f32 %f18, %f57;
    ld.shared.f32 %f58, [%rd4+288];
    mov.f32 %f20, %f58;
    ld.shared.f32 %f59, [%rd4+320];
    mov.f32 %f22, %f59;
    ld.shared.f32 %f60, [%rd4+352];
    mov.f32 %f24, %f60;
    ld.shared.f32 %f61, [%rd4+384];
    mov.f32 %f26, %f61;
    ld.shared.f32 %f62, [%rd4+416];
    mov.f32 %f28, %f62;
    ld.shared.f32 %f63, [%rd4+448];
    mov.f32 %f30, %f63;
    ld.shared.f32 %f64, [%rd4+480];
    mov.f32 %f32, %f64;
    .loc 29 22 0
    bar.sync 0;
    .loc 29 25 0
    // NOTE: %rd6 = out + %rd2, i.e. the same element offset that was used for
    // the shared array; the 16 values are stored at byte offsets 0..480.
    ld.param.u64 %rd5, [__cudaparm_test_out];
    add.u64 %rd6, %rd5, %rd2;
    mov.f32 %f65, %f2;
    st.global.f32 [%rd6+0], %f65;
    mov.f32 %f66, %f4;
    st.global.f32 [%rd6+32], %f66;
    mov.f32 %f67, %f6;
    st.global.f32 [%rd6+64], %f67;
    mov.f32 %f68, %f8;
    st.global.f32 [%rd6+96], %f68;
    mov.f32 %f69, %f10;
    st.global.f32 [%rd6+128], %f69;
    mov.f32 %f70, %f12;
    st.global.f32 [%rd6+160], %f70;
    mov.f32 %f71, %f14;
    st.global.f32 [%rd6+192], %f71;
    mov.f32 %f72, %f16;
    st.global.f32 [%rd6+224], %f72;
    mov.f32 %f73, %f18;
    st.global.f32 [%rd6+256], %f73;
    mov.f32 %f74, %f20;
    st.global.f32 [%rd6+288], %f74;
    mov.f32 %f75, %f22;
    st.global.f32 [%rd6+320], %f75;
    mov.f32 %f76, %f24;
    st.global.f32 [%rd6+352], %f76;
    mov.f32 %f77, %f26;
    st.global.f32 [%rd6+384], %f77;
    mov.f32 %f78, %f28;
    st.global.f32 [%rd6+416], %f78;
    mov.f32 %f79, %f30;
    st.global.f32 [%rd6+448], %f79;
    // NOTE: the last store uses %f64 (the value loaded from [%rd4+480])
    // directly instead of a copy of %f32; %f32 was assigned from %f64 above.
    st.global.f32 [%rd6+480], %f64;
    .loc 29 26 0
    exit;
$LDWend_test:
} // test
signature.asc
Description: This is a digitally signed message part.
_______________________________________________ PyCUDA mailing list pyc...@host304.hostmonster.com http://host304.hostmonster.com/mailman/listinfo/pycuda_tiker.net