Re: Improving code generation in the nvptx back end
On Fri, 17 Feb 2017, Thomas Schwinge wrote: > On Fri, 17 Feb 2017 14:00:09 +0100, I wrote: > > [...] for "normal" functions there is no reason to use the > > ".param" space for passing arguments in and out of functions. We can > > then get rid of the boilerplate code to move ".param %in_ar*" into ".reg > > %ar*", and the other way round for "%value_out"/"%value". This will then > > also simplify the call sites, where all that code "evaporates". That's > > actually something I started to look into, many months ago, and I now > > just dug out those changes, and will post them later. > > > > (Very likely, the PTX "JIT" compiler will do the very same thing without > > difficulty, but why not directly generate code that is less verbose to > > read?) In general you cannot use this shorthand notation because PTX interop guidelines explicitly prescribe using the .param space for argument passing. See https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/ , section 3. So at best GCC can use it for calls where interop concerns are guaranteed to not arise: when the callee is not externally visible, and does not have its address taken. And there's a question of how well it's going to work after time passes, since other compilers always use the verbose form (and thus the .reg calling style is not frequently exercised). Alexander
Re: Improving code generation in the nvptx back end
On 02/17/2017 02:09 PM, Thomas Schwinge wrote: Hi! On Fri, 17 Feb 2017 14:00:09 +0100, I wrote: [...] for "normal" functions there is no reason to use the ".param" space for passing arguments in and out of functions. We can then get rid of the boilerplate code to move ".param %in_ar*" into ".reg %ar*", and the other way round for "%value_out"/"%value". This will then also simplify the call sites, where all that code "evaporates". That's actually something I started to look into, many months ago, and I now just dug out those changes, and will post them later. (Very likely, the PTX "JIT" compiler will do the very same thing without difficulty, but why not directly generate code that is less verbose to read?) Using my WIP patch, the generated PTX code changes/is simplified as follows: It's probably a good idea to run cuobjdump -sass to see whether this has any effect at all. The most important issue that needs solving is probably still the old issue that ptxas isn't reliable. Looks like the llvm folks ran into the same problem, as I discovered last week: https://bugs.llvm.org//show_bug.cgi?id=27738 Bernd
Re: Re: Improving code generation in the nvptx back end
On 02/17/2017 05:09 AM, Thomas Schwinge wrote: > On Fri, 17 Feb 2017 14:00:09 +0100, I wrote: >> [...] for "normal" functions there is no reason to use the >> ".param" space for passing arguments in and out of functions. We can >> then get rid of the boilerplate code to move ".param %in_ar*" into ".reg >> %ar*", and the other way round for "%value_out"/"%value". This will then >> also simplify the call sites, where all that code "evaporates". That's >> actually something I started to look into, many months ago, and I now >> just dug out those changes, and will post them later. >> >> (Very likely, the PTX "JIT" compiler will do the very same thing without >> difficulty, but why not directly generate code that is less verbose to >> read?) > > Using my WIP patch, the generated PTX code changes/is simplified as > follows: > > // BEGIN GLOBAL FUNCTION DECL: f > -.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 > %in_ar1); > +.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1); > > // BEGIN GLOBAL FUNCTION DEF: f > -.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 > %in_ar1) > +.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1) > { > .reg.f32 %value; > - .reg.u32 %ar0; > - ld.param.u32 %ar0, [%in_ar0]; > - .reg.u64 %ar1; > - ld.param.u64 %ar1, [%in_ar1]; > .reg.f64 %r23; > .reg.f32 %r24; > .reg.u32 %r25; > @@ -34,15 +30,15 @@ $L3: > mov.f32 %r24, 0f; > $L1: > mov.f32 %value, %r24; > - st.param.f32[%value_out], %value; > + mov.f32 %value_out, %value; > ret; > } > > // BEGIN GLOBAL FUNCTION DECL: main > -.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, > .param.u64 %in_ar1); > +.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1); > > // BEGIN GLOBAL FUNCTION DEF: main > -.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, > .param.u64 %in_ar1) > +.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1) > { > .reg.u32 %value; > .local .align 8 .b8 %frame_ar[32]; > @@ -70,13 +66,9 @@ $L1: > st.u64 [%frame+24], %r29; > add.u64 %r31, %frame, 16; > { > - .param.f32 %value_in; > - .param.u32 %out_arg1; > - st.param.u32 [%out_arg1], %r26; > - .param.u64 %out_arg2; > - st.param.u64 [%out_arg2], %r31; > - call (%value_in), f, (%out_arg1, %out_arg2); > - ld.param.f32%r32, [%value_in]; > + .reg.f32 %value_in; > + call (%value_in), f, (%r26, %r31); > + mov.f32 %r32, %value_in; > } > setp.eq.f32 %r33, %r32, 0f; > @%r33 bra $L5; > @@ -89,17 +81,13 @@ $L5: > st.u64 [%frame+24], %r36; > mov.u32 %r34, 1; > { > - .param.f32 %value_in; > - .param.u32 %out_arg1; > - st.param.u32 [%out_arg1], %r34; > - .param.u64 %out_arg2; > - st.param.u64 [%out_arg2], %r31; > - call (%value_in), f, (%out_arg1, %out_arg2); > - ld.param.f32%r39, [%value_in]; > + .reg.f32 %value_in; > + call (%value_in), f, (%r34, %r31); > + mov.f32 %r39, %value_in; > } > setp.neu.f32%r40, %r39, 0f3f80; > @%r40 bra $L6; > mov.u32 %value, 0; > - st.param.u32[%value_out], %value; > + mov.u32 %value_out, %value; > ret; > } > > (Not yet directly using "%value_out" instead of the intermediate "%value" > register.) > > Is such a patch something to pursue to completion? Are you trying to optimize acc routines in general? I'm not sure how frequently they are used at the moment. Also, while .param values may be overkill for routines, they are addressable. Looking at section 5.1.6.1 in the PTX reference manual, you can have something like this: .entry foo ( .param .b32 N, .param .align 8 .b8 buffer[64] ) { .reg .u32 %n; .reg .f64 %d; ld.param.u32 %n, [N]; ld.param.f64 ... Granted, this is an entry function to be called from the host, but the same usage is applicable inside routines. This gives me an idea. While working on the firstprivate changes, I noticed that GCC packs all of the offloaded function arguments into a structure, which the nvptx run time plugin uploads to a special data mapping prior to calling cuLaunchKernel. That's inefficient in application that launch a
Re: Improving code generation in the nvptx back end
Hi! On Fri, 17 Feb 2017 14:00:09 +0100, I wrote: > [...] for "normal" functions there is no reason to use the > ".param" space for passing arguments in and out of functions. We can > then get rid of the boilerplate code to move ".param %in_ar*" into ".reg > %ar*", and the other way round for "%value_out"/"%value". This will then > also simplify the call sites, where all that code "evaporates". That's > actually something I started to look into, many months ago, and I now > just dug out those changes, and will post them later. > > (Very likely, the PTX "JIT" compiler will do the very same thing without > difficulty, but why not directly generate code that is less verbose to > read?) Using my WIP patch, the generated PTX code changes/is simplified as follows: // BEGIN GLOBAL FUNCTION DECL: f -.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 %in_ar1); +.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1); // BEGIN GLOBAL FUNCTION DEF: f -.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 %in_ar1) +.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1) { .reg.f32 %value; - .reg.u32 %ar0; - ld.param.u32 %ar0, [%in_ar0]; - .reg.u64 %ar1; - ld.param.u64 %ar1, [%in_ar1]; .reg.f64 %r23; .reg.f32 %r24; .reg.u32 %r25; @@ -34,15 +30,15 @@ $L3: mov.f32 %r24, 0f; $L1: mov.f32 %value, %r24; - st.param.f32[%value_out], %value; + mov.f32 %value_out, %value; ret; } // BEGIN GLOBAL FUNCTION DECL: main -.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, .param.u64 %in_ar1); +.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1); // BEGIN GLOBAL FUNCTION DEF: main -.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, .param.u64 %in_ar1) +.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1) { .reg.u32 %value; .local .align 8 .b8 %frame_ar[32]; @@ -70,13 +66,9 @@ $L1: st.u64 [%frame+24], %r29; add.u64 %r31, %frame, 16; { - .param.f32 %value_in; - .param.u32 %out_arg1; - st.param.u32 [%out_arg1], %r26; - .param.u64 %out_arg2; - st.param.u64 [%out_arg2], %r31; - call (%value_in), f, (%out_arg1, %out_arg2); - ld.param.f32%r32, [%value_in]; + .reg.f32 %value_in; + call (%value_in), f, (%r26, %r31); + mov.f32 %r32, %value_in; } setp.eq.f32 %r33, %r32, 0f; @%r33 bra $L5; @@ -89,17 +81,13 @@ $L5: st.u64 [%frame+24], %r36; mov.u32 %r34, 1; { - .param.f32 %value_in; - .param.u32 %out_arg1; - st.param.u32 [%out_arg1], %r34; - .param.u64 %out_arg2; - st.param.u64 [%out_arg2], %r31; - call (%value_in), f, (%out_arg1, %out_arg2); - ld.param.f32%r39, [%value_in]; + .reg.f32 %value_in; + call (%value_in), f, (%r34, %r31); + mov.f32 %r39, %value_in; } setp.neu.f32%r40, %r39, 0f3f80; @%r40 bra $L6; mov.u32 %value, 0; - st.param.u32[%value_out], %value; + mov.u32 %value_out, %value; ret; } (Not yet directly using "%value_out" instead of the intermediate "%value" register.) Is such a patch something to pursue to completion? --- gcc/config/nvptx/nvptx.c +++ gcc/config/nvptx/nvptx.c @@ -603,19 +603,32 @@ nvptx_promote_function_mode (const_tree type, machine_mode mode, to an argument register and it is greater than zero if we're copying to a specific hard register. */ +static bool write_as_kernel (tree attrs); static int write_arg_mode (std::stringstream &s, int for_reg, int argno, - machine_mode mode) + machine_mode mode, const_tree decl) { + bool kernel = (decl != NULL_TREE) && write_as_kernel (DECL_ATTRIBUTES (decl)); const char *ptx_type = nvptx_ptx_type_from_mode (mode, false); if (for_reg < 0) { /* Writing PTX prototype. */ s << (argno ? ", " : " ("); - s << ".param" << ptx_type << " %in_ar" << argno; + if (kernel) + s << ".param" << ptx_type << " %in_ar" << argno; + else +#if 0 + s << ".reg" << ptx_type << " %in_ar" << argno; +#else + s << ".reg" << ptx_type << " %ar" << argno; +#endif } +#if 0 else +#else + else if (kernel || for_reg) +#endif