On 07/10/2016 05:28, Kyrill Tkachov wrote: > Hi Adhemerval, > > CC'ing the aarch64 maintainers/reviewers. > I have some comments inline, mostly about the GCC coding conventions.
Thanks for the review. >> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c >> index df6514d..c689440 100644 >> --- a/gcc/config/aarch64/aarch64.c >> +++ b/gcc/config/aarch64/aarch64.c >> @@ -3149,6 +3149,49 @@ aarch64_restore_callee_saves (machine_mode mode, >> } >> } >> +static bool >> +split_stack_arg_pointer_used_p (void) >> +{ > > New function needs a function comment. > Ack, I will add one. >> + bool using_split_stack = (flag_split_stack >> + && (lookup_attribute ("no_split_stack", >> + DECL_ATTRIBUTES (cfun->decl)) >> + == NULL)); >> + if (using_split_stack == false) >> + return false; >> + >> + /* If the pseudo holding the arg pointer is no longer a pseudo, >> + then the arg pointer is used. */ >> + if (cfun->machine->frame.split_stack_arg_pointer != NULL_RTX >> + && (!REG_P (cfun->machine->frame.split_stack_arg_pointer) >> + || (REGNO (cfun->machine->frame.split_stack_arg_pointer) >> + < FIRST_PSEUDO_REGISTER))) >> + return true; >> + >> + /* Unfortunately we also need to do some code scanning, since >> + x10 may have been substituted for the pseudo. 
*/ >> + rtx_insn *insn; >> + basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; >> + FOR_BB_INSNS (bb, insn) >> + if (NONDEBUG_INSN_P (insn)) >> + { >> + df_ref use; >> + FOR_EACH_INSN_USE (use, insn) >> + { >> + rtx x = DF_REF_REG (use); >> + if (REG_P (x) && REGNO (x) == R10_REGNUM) >> + return true; >> + } >> + df_ref def; >> + FOR_EACH_INSN_DEF (def, insn) >> + { >> + rtx x = DF_REF_REG (def); >> + if (REG_P (x) && REGNO (x) == R10_REGNUM) >> + return false; >> + } >> + } >> + return bitmap_bit_p (DF_LR_OUT (bb), R10_REGNUM); >> +} >> + >> /* AArch64 stack frames generated by this compiler look like: >> +-------------------------------+ >> @@ -3204,6 +3247,7 @@ aarch64_expand_prologue (void) >> unsigned reg1 = cfun->machine->frame.wb_candidate1; >> unsigned reg2 = cfun->machine->frame.wb_candidate2; >> rtx_insn *insn; >> + bool split_stack_arg_pointer_used = split_stack_arg_pointer_used_p (); >> if (flag_stack_usage_info) >> current_function_static_stack_size = frame_size; >> @@ -3220,6 +3264,10 @@ aarch64_expand_prologue (void) >> aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size); >> } >> + /* Save split-stack argument pointer before stack adjustment. */ >> + if (split_stack_arg_pointer_used) >> + emit_move_insn (gen_rtx_REG (Pmode, R10_REGNUM), stack_pointer_rtx); >> + >> aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -initial_adjust, >> true); >> if (callee_adjust != 0) >> @@ -3243,6 +3291,30 @@ aarch64_expand_prologue (void) >> callee_adjust != 0 || frame_pointer_needed); >> aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust, >> !frame_pointer_needed); >> + >> + if (split_stack_arg_pointer_used_p ()) >> + { >> + /* Setup the argument pointer (x10) for -fsplit-stack code. If >> + __morestack was called, it will left the arg pointer to the >> + old stack in x28. Otherwise, the argument pointer is the top >> + of current frame. 
*/ >> + rtx x10 = gen_rtx_REG (Pmode, R10_REGNUM); >> + rtx x28 = gen_rtx_REG (Pmode, R28_REGNUM); >> + rtx not_more = gen_label_rtx (); >> + rtx cc_reg = gen_rtx_REG (CCmode, CC_REGNUM); >> + rtx jump; >> + >> + jump = gen_rtx_IF_THEN_ELSE (VOIDmode, >> + gen_rtx_GEU (VOIDmode, cc_reg, >> + const0_rtx), >> + gen_rtx_LABEL_REF (VOIDmode, not_more), >> + pc_rtx); >> + jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump)); > > Are you trying to emit a conditional jump here? > If so it would be cleaner to use the pattern name you have in mind > directly like so (barring typos ;): > rtx cmp = gen_rtx_fmt_ee (GEU, VOIDmode, cc_reg, const0_rtx); > jump = emit_jump_insn (gen_condjump (cmp, cc_reg, not_more)); > This will avoid constructing the SET and IF_THEN_ELSE rtxes manually. Yes, the idea is to emit a conditional jump. I have no preference here, I used the IF_THEN_ELSE functions mainly because the other arches are using it, but I think your suggestion should be simpler. > >> + JUMP_LABEL (jump) = not_more; >> + LABEL_NUSES (not_more) += 1; >> + emit_move_insn (x10, x28); >> + emit_label (not_more); >> + } >> } >> /* Return TRUE if we can use a simple_return insn. >> @@ -9677,7 +9749,7 @@ aarch64_expand_builtin_va_start (tree valist, rtx >> nextarg ATTRIBUTE_UNUSED) >> /* Emit code to initialize STACK, which points to the next varargs stack >> argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used >> by named arguments. STACK is 8-byte aligned. */ >> - t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx); >> + t = make_tree (TREE_TYPE (stack), crtl->args.internal_arg_pointer); >> if (cum->aapcs_stack_size > 0) >> t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * >> UNITS_PER_WORD); >> t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t); >> @@ -14054,6 +14126,189 @@ aarch64_optab_supported_p (int op, machine_mode >> mode1, machine_mode, >> } >> } >> +/* -fsplit-stack support. */ >> + >> +/* A SYMBOL_REF for __morestack. 
*/ >> +static GTY(()) rtx morestack_ref; >> + >> +/* Emit -fsplit-stack prologue, which goes before the regular function >> + prologue. */ >> +void >> +aarch64_expand_split_stack_prologue (void) >> +{ > > Nit: new line between function comment and its type. Ack. > >> + HOST_WIDE_INT frame_size; >> + rtx_code_label *ok_label = NULL; >> + rtx mem, ssvalue, cc, jump, insn, call_fusage; >> + rtx reg30, temp; >> + rtx new_cfa, cfi_ops = NULL; >> + /* Offset from thread pointer to __private_ss. */ >> + int psso = 0x10; >> + int ninsn; >> + >> + gcc_assert (flag_split_stack && reload_completed); >> + >> + /* A minimal stack frame would be created for __morestack call. */ >> + frame_size = cfun->machine->frame.frame_size + 16; >> + >> + /* It limits total maximum stack allocation on 2G so its value can be >> + materialized with two instruction at most (movn/movk). It might be > > s/with two instruction/using two instructions/ > Ack. >> + used by the linker to add some extra space for split calling non split >> + stack functions. */ >> + if (frame_size > ((HOST_WIDE_INT) 1 << 31)) >> + { >> + sorry ("Stack frame larger than 2G is not supported for >> -fsplit-stack"); >> + return; >> + } >> + >> + if (morestack_ref == NULL_RTX) >> + { >> + morestack_ref = gen_rtx_SYMBOL_REF (Pmode, "__morestack"); >> + SYMBOL_REF_FLAGS (morestack_ref) |= (SYMBOL_FLAG_LOCAL >> + | SYMBOL_FLAG_FUNCTION); >> + } >> + >> + /* Load __private_ss from TCB. */ >> + ssvalue = gen_rtx_REG (Pmode, R9_REGNUM); >> + emit_insn (gen_aarch64_load_tp_hard (ssvalue)); >> + mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso)); >> + emit_move_insn (ssvalue, mem); >> + >> + temp = gen_rtx_REG (Pmode, R10_REGNUM); >> + >> + /* Always emit two insns to calculate the requested stack, so the linker >> + can edit them when adjusting size for calling non-split-stack code. 
*/ >> + ninsn = aarch64_internal_mov_immediate (temp, GEN_INT (-frame_size), true, >> + Pmode); >> + gcc_assert (ninsn == 1 || ninsn == 2); >> + if (ninsn == 1) >> + emit_insn (gen_nop ()); >> + emit_insn (gen_add3_insn (temp, stack_pointer_rtx, temp)); >> + >> + /* Jump to __morestack call if current __private_ss is not suffice. */ >> + ok_label = gen_label_rtx (); >> + cc = aarch64_gen_compare_reg (LT, temp, ssvalue); >> + jump = gen_rtx_IF_THEN_ELSE (VOIDmode, >> + gen_rtx_GEU (VOIDmode, cc, const0_rtx), >> + gen_rtx_LABEL_REF (Pmode, ok_label), >> + pc_rtx); >> + jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump)); > > similar to the above about emitting the conditional jump. Ack. > >> + /* Mark the jump as very likely to be taken. */ >> + add_int_reg_note (jump, REG_BR_PROB, >> + REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100); >> + JUMP_LABEL (jump) = ok_label; >> + LABEL_NUSES (ok_label) = 1; >> + >> + /* Call __morestack with a non-standard call procedure: x10 will hold >> + the requested stack pointer. */ >> + call_fusage = NULL_RTX; >> + >> + /* Set up a minimum frame pointer to call __morestack. The SP is not >> + save on x29 prior so in __morestack x29 points to the called SP. 
*/ >> + reg30 = gen_rtx_REG (Pmode, R30_REGNUM); >> + aarch64_pushwb_single_reg (Pmode, R30_REGNUM, 16); >> + >> + insn = emit_call_insn (gen_call (gen_rtx_MEM (DImode, morestack_ref), >> + const0_rtx, const0_rtx)); >> + add_function_usage_to (insn, call_fusage); >> + >> + cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg30, cfi_ops); >> + mem = plus_constant (Pmode, stack_pointer_rtx, 16); >> + cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops); >> + >> + mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem); >> + mem = gen_rtx_MEM (DImode, mem); >> + insn = emit_move_insn (reg30, mem); >> + >> + new_cfa = stack_pointer_rtx; >> + new_cfa = plus_constant (Pmode, new_cfa, 16); >> + cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops); >> + REG_NOTES (insn) = cfi_ops; >> + RTX_FRAME_RELATED_P (insn) = 1; >> + >> + emit_insn (gen_split_stack_return ()); >> + >> + /* __morestack will call us here. */ >> + emit_label (ok_label); >> +} >> + >> +/* Implement TARGET_ASM_FILE_END. */ >> +static void >> +aarch64_file_end (void) >> +{ > > New line between comment and function type. Ack. > >> + file_end_indicate_exec_stack (); >> + >> + if (flag_split_stack) >> + file_end_indicate_split_stack (); >> +} >> + >> +/* Return the internal arg pointer used for function incoming arguments. */ >> +static rtx >> +aarch64_internal_arg_pointer (void) >> +{ > > Likewise. Ack. > >> + if (flag_split_stack >> + && (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl)) >> + == NULL)) >> + { >> + if (cfun->machine->frame.split_stack_arg_pointer == NULL_RTX) >> + { >> + rtx pat; >> + >> + cfun->machine->frame.split_stack_arg_pointer = gen_reg_rtx (Pmode); >> + REG_POINTER (cfun->machine->frame.split_stack_arg_pointer) = 1; >> + >> + /* Put the pseudo initialization right after the note at the >> + beginning of the function. 
*/ >> + pat = gen_rtx_SET (cfun->machine->frame.split_stack_arg_pointer, >> + gen_rtx_REG (Pmode, R10_REGNUM)); >> + push_topmost_sequence (); >> + emit_insn_after (pat, get_insns ()); >> + pop_topmost_sequence (); >> + } >> + return plus_constant (Pmode, >> cfun->machine->frame.split_stack_arg_pointer, >> + FIRST_PARM_OFFSET (current_function_decl)); >> + } >> + return virtual_incoming_args_rtx; >> +} >> + >> +static void >> +aarch64_live_on_entry (bitmap regs) >> +{ > > Needs function comment. Ack, I will add one. > >> + if (flag_split_stack) >> + bitmap_set_bit (regs, R10_REGNUM); >> +} >> + >> +/* Emit -fsplit-stack dynamic stack allocation space check. */ >> + >> +void >> +aarch64_split_stack_space_check (rtx size, rtx label) >> +{ >> + rtx mem, ssvalue, cc, jump, temp; >> + rtx requested = gen_reg_rtx (Pmode); >> + /* Offset from thread pointer to __private_ss. */ >> + int psso = 0x10; >> + >> + /* Load __private_ss from TCB. */ >> + ssvalue = gen_rtx_REG (Pmode, R9_REGNUM); >> + emit_insn (gen_aarch64_load_tp_hard (ssvalue)); >> + mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso)); >> + emit_move_insn (ssvalue, mem); >> + >> + temp = gen_rtx_REG (Pmode, R10_REGNUM); >> + >> + /* And compare it with frame pointer plus required stack. */ >> + size = force_reg (Pmode, size); >> + emit_move_insn (requested, gen_rtx_MINUS (Pmode, stack_pointer_rtx, >> size)); >> + >> + /* Jump to __morestack call if current __private_ss is not suffice. */ >> + cc = aarch64_gen_compare_reg (LT, temp, ssvalue); >> + jump = gen_rtx_IF_THEN_ELSE (VOIDmode, >> + gen_rtx_GEU (VOIDmode, cc, const0_rtx), >> + gen_rtx_LABEL_REF (VOIDmode, label), >> + pc_rtx); >> + jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump)); > > Likewise about emitting conditional jumps. Ack. 
> > <snip> > >> >> diff --git a/gcc/testsuite/gcc.dg/split-3.c b/gcc/testsuite/gcc.dg/split-3.c >> index 64bbb8c..5ba7616 100644 >> --- a/gcc/testsuite/gcc.dg/split-3.c >> +++ b/gcc/testsuite/gcc.dg/split-3.c >> @@ -40,6 +40,7 @@ down (int i, ...) >> || va_arg (ap, int) != 9 >> || va_arg (ap, int) != 10) >> abort (); >> + va_end (ap); >> if (i > 0) >> { >> diff --git a/gcc/testsuite/gcc.dg/split-6.c b/gcc/testsuite/gcc.dg/split-6.c >> index b32cf8d..b3016ba 100644 >> --- a/gcc/testsuite/gcc.dg/split-6.c >> +++ b/gcc/testsuite/gcc.dg/split-6.c >> @@ -37,6 +37,7 @@ down (int i, ...) >> || va_arg (ap, int) != 9 >> || va_arg (ap, int) != 10) >> abort (); >> + va_end (ap); >> if (i > 0) >> { > > Is this just undefined behaviour in existing tests? > If so, I recommend you split these testsuite fixes into a separate patch > and submit that separately. It can probably go in much sooner independently > of split-stack support. > > Also, I think some tests for split-stack support are appropriate. > At least something that checks that gcc generates the jumps to the appropriate > helper functions? In theory yes, although it does not really invalidate the test, since the idea is to check for the variadic arguments. Based on your comment, I think it would be better to split this patch in two, with these testsuite modifications submitted first. Regarding the tests, the Go testsuite itself covers pretty much all of the morestack call usage (it indeed showed a lot of issues with some initial drafts of these patches). So I am not sure how much more extensive a testsuite we would want to add for this. > > Thanks, > Kyrill