This patch implements dynamically-sized stack allocation (alloca/VLA) on NVPTX when -msoft-stack is enabled. In addition to moving the stack pointer, we need to copy the updated pointer into __nvptx_stacks[tid.y].
* config/nvptx/nvptx.c (nvptx_declare_function_name): Emit %outargs using .local %outargs_ar only if not TARGET_SOFT_STACK. Emit %outargs under TARGET_SOFT_STACK by offsetting from %frame. (nvptx_get_drap_rtx): Return %argp as the DRAP if needed. * config/nvptx/nvptx.md (nvptx_register_operand): Allow %outargs under TARGET_SOFT_STACK. (nvptx_nonimmediate_operand): Ditto. (allocate_stack): Implement for TARGET_SOFT_STACK. Remove unused code. (allocate_stack_<mode>): Remove unused pattern. (set_softstack_insn): New pattern. (restore_stack_block): Handle for TARGET_SOFT_STACK. --- I have committed this patch to the gomp-nvptx branch. Bernd, Nathan, I would appreciate it if you could comment on the 'define_predicate' changes in nvptx.md. There are three predicates that start like this: if (REG_P (op)) return !HARD_REGISTER_P (op); if (GET_CODE (op) == SUBREG && MEM_P (SUBREG_REG (op))) return false; if (GET_CODE (op) == SUBREG) return false; For stack adjustments I need to allow operations on the stack pointer. For now I've implemented that as a fairly straightforward shortcut, but I guess it doesn't look very nice. What is the reason to reject "hard registers" there in the first place? In any case, I'd like your input if you see a better way to handle it. Also, note that there's either a bug or a cleanup opportunity: the third "if" statement is clearly more general than the second, which makes the second one redundant as written. No regressions on the check-c testsuite (with the 'alloca' effective-target enabled). Thanks.
Alexander diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index b12a7a8..599e460 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -883,7 +883,7 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) HOST_WIDE_INT sz = crtl->outgoing_args_size; if (sz == 0) sz = 1; - if (cfun->machine->has_call_with_varargs) + if (!TARGET_SOFT_STACK && cfun->machine->has_call_with_varargs) { fprintf (file, "\t.reg.u%d %%outargs;\n" "\t.local.align 8 .b8 %%outargs_ar[" @@ -897,7 +897,8 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) sz = get_frame_size (); if (sz == 0 && cfun->machine->has_call_with_sc) sz = 1; - if (sz > 0) + bool need_sp = cfun->calls_alloca || cfun->machine->has_call_with_varargs; + if (sz > 0 || TARGET_SOFT_STACK && need_sp) { int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT; @@ -923,10 +924,15 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) if (alignment > keep_align) fprintf (file, "\tand.b%d %%frame, %%frame, %d;\n", bits, -alignment); + fprintf (file, "\t.reg.u%d %%outargs;\n", bits); + sz = crtl->outgoing_args_size; + gcc_assert (sz % keep_align == 0); + fprintf (file, "\tsub.u%d %%outargs, %%frame, " + HOST_WIDE_INT_PRINT_DEC ";\n", bits, sz); /* crtl->is_leaf is not initialized because RA is not run. 
*/ if (!leaf_function_p ()) { - fprintf (file, "\tst.shared.u%d [%%fstmp2], %%frame;\n", bits); + fprintf (file, "\tst.shared.u%d [%%fstmp2], %%outargs;\n", bits); cfun->machine->using_softstack = true; } need_softstack_decl = true; @@ -996,6 +1002,8 @@ nvptx_function_ok_for_sibcall (tree, tree) static rtx nvptx_get_drap_rtx (void) { + if (TARGET_SOFT_STACK && stack_realign_drap) + return arg_pointer_rtx; return NULL_RTX; } diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index ae1909d..130c809 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -69,6 +69,8 @@ (define_attr "divergent" "false,true" (define_predicate "nvptx_register_operand" (match_code "reg,subreg") { + if (TARGET_SOFT_STACK && op == stack_pointer_rtx) + return true; if (REG_P (op)) return !HARD_REGISTER_P (op); if (GET_CODE (op) == SUBREG && MEM_P (SUBREG_REG (op))) @@ -123,6 +125,8 @@ (define_predicate "nvptx_general_operand" (define_predicate "nvptx_nonimmediate_operand" (match_code "reg,subreg,mem") { + if (TARGET_SOFT_STACK && op == stack_pointer_rtx) + return true; if (REG_P (op)) return (op != frame_pointer_rtx && op != arg_pointer_rtx @@ -1061,31 +1065,41 @@ (define_expand "allocate_stack" (match_operand 1 "nvptx_register_operand")] "" { + if (TARGET_SOFT_STACK) + { + emit_move_insn (stack_pointer_rtx, + gen_rtx_MINUS (Pmode, stack_pointer_rtx, operands[1])); + emit_insn (gen_set_softstack_insn (stack_pointer_rtx)); + emit_move_insn (operands[0], virtual_stack_dynamic_rtx); + DONE; + } /* The ptx documentation specifies an alloca intrinsic (for 32 bit only) but notes it is not implemented. The assembler emits a confused error message. Issue a blunt one now instead. 
*/ sorry ("target cannot support alloca."); emit_insn (gen_nop ()); DONE; - if (TARGET_ABI64) - emit_insn (gen_allocate_stack_di (operands[0], operands[1])); - else - emit_insn (gen_allocate_stack_si (operands[0], operands[1])); - DONE; }) -(define_insn "allocate_stack_<mode>" - [(set (match_operand:P 0 "nvptx_register_operand" "=R") - (unspec:P [(match_operand:P 1 "nvptx_register_operand" "R")] - UNSPEC_ALLOCA))] - "" - "%.\\tcall (%0), %%alloca, (%1);") +(define_insn "set_softstack_insn" + [(unspec [(match_operand 0 "nvptx_register_operand" "R")] UNSPEC_ALLOCA)] + "TARGET_SOFT_STACK" +{ + return (cfun->machine->using_softstack + ? "%.\\tst.shared%t0\\t[%%fstmp2], %0;" + : ""); +}) (define_expand "restore_stack_block" [(match_operand 0 "register_operand" "") (match_operand 1 "register_operand" "")] "" { + if (TARGET_SOFT_STACK) + { + emit_move_insn (operands[0], operands[1]); + emit_insn (gen_set_softstack_insn (operands[0])); + } DONE; }) -- 1.8.3.1