Thanks for the feedback, Richard.  We've addressed the bulk of it, and
added explanatory comments in the few cases where the current
implementation makes sense but for less-than-obvious reasons.  We will
submit a v2 patch set reflecting these changes in the next couple of
days.

A few responses are in-line below.

On Fri, Jan 20, 2017 at 6:32 PM, Richard Henderson <r...@redhat.com> wrote:
>
> On 01/11/2017 06:30 PM, Palmer Dabbelt wrote:
>>
>> +/* The largest number of operations needed to load an integer constant.
>> +   The worst case is LUI, ADDI, SLLI, ADDI, SLLI, ADDI, SLLI, ADDI,
>> +   but we may attempt and reject even worse sequences.  */
>> +#define RISCV_MAX_INTEGER_OPS 32
>
>
> Why would you?  Surely after you exhaust 8 you'd just abandon that search as 
> unprofitable.
>
>> +  if (cost > 2 && (value & 1) == 0)
>> +    {
>> +      int shift = 0;
>> +      while ((value & 1) == 0)
>> +       shift++, value >>= 1;
>
>
>   shift = ctz_hwi (value);
>
> You also may want to test for
>
>   value | (HOST_WIDE_INT_M1U << (HOST_BITS_PER_WIDE_INT - shift))
>
> i.e. shifting out leading 1's.  As well as checking with shift - 12.  The 
> latter is interesting for shifting a 20-bit value up into the high word.
>
> I once proposed a generic framework for this that cached results for 
> computation of sequences and costs.  Unfortunately it never gained traction. 
> Perhaps it's time to try again.
>
>> +      /* Try filling trailing bits with 1s.  */
>> +      while ((value << shift) >= 0)
>> +       shift++;
>
>
> clz_hwi.
>
>> +  return GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0;
>
>
> SYMBOL_REF_P.
>
>> +riscv_symbol_binds_local_p (const_rtx x)
>> +{
>> +  return (SYMBOL_REF_DECL (x)
>> +         ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
>> +         : SYMBOL_REF_LOCAL_P (x));
>
>
> Missing SYMBOL_REF_P?
>
> Surely encode_section_info will already have set SYMBOL_FLAG_LOCAL, and thus 
> you need not invoke targetm.binds_local_p again.
>
>> +    case LABEL_REF:
>> +      if (LABEL_REF_NONLOCAL_P (x))
>> +       return SYMBOL_GOT_DISP;
>> +      break;
>
>
> Non-local labels are not local to the current function, but they are still 
> local to the translation unit (they'll be local to one of the outer functions 
> of a nested function).
>
>> +  switch (*symbol_type)
>> +    {
>> +    case SYMBOL_ABSOLUTE:
>> +    case SYMBOL_PCREL:
>> +    case SYMBOL_TLS_LE:
>> +      return (int32_t) INTVAL (offset) == INTVAL (offset);
>
>
> Why?
>
>
>> +    case MINUS:
>> +      if (float_mode_p
>> +         && !HONOR_NANS (mode)
>> +         && !HONOR_SIGNED_ZEROS (mode))
>> +       {
>> +         /* See if we can use NMADD or NMSUB.  See riscv.md for the
>> +            associated patterns.  */
>> +         rtx op0 = XEXP (x, 0);
>> +         rtx op1 = XEXP (x, 1);
>> +         if (GET_CODE (op0) == MULT && GET_CODE (XEXP (op0, 0)) == NEG)
>> +           {
>> +             *total = (tune_info->fp_mul[mode == DFmode]
>> +                       + set_src_cost (XEXP (XEXP (op0, 0), 0), mode, speed)
>> +                       + set_src_cost (XEXP (op0, 1), mode, speed)
>> +                       + set_src_cost (op1, mode, speed));
>> +             return true;
>> +           }
>> +         if (GET_CODE (op1) == MULT)
>> +           {
>> +             *total = (tune_info->fp_mul[mode == DFmode]
>> +                       + set_src_cost (op0, mode, speed)
>> +                       + set_src_cost (XEXP (op1, 0), mode, speed)
>> +                       + set_src_cost (XEXP (op1, 1), mode, speed));
>> +             return true;
>> +           }
>> +       }
>
>
> Do we not fold these to FMA + NEG?  If not, that's surprising and maybe 
> should be fixed.  Also, you appear to be missing costs for FMA in 
> riscv_rtx_costs.
>
>> +       case UNORDERED:
>> +         *code = EQ;
>> +         /* Fall through.  */
>> +
>> +       case ORDERED:
>> +         /* a == a && b == b */
>> +         tmp0 = gen_reg_rtx (SImode);
>> +         riscv_emit_binary (EQ, tmp0, cmp_op0, cmp_op0);
>> +         tmp1 = gen_reg_rtx (SImode);
>> +         riscv_emit_binary (EQ, tmp1, cmp_op1, cmp_op1);
>> +         *op0 = gen_reg_rtx (SImode);
>> +         riscv_emit_binary (AND, *op0, tmp0, tmp1);
>> +         break;
>
>
> Better with FCLASS + AND?  At least for a branch?

The code path is slightly shorter using FEQ instead.  If FEQ were two
or more cycles slower than FCLASS, then FCLASS would be a win for
latency, but that is not the case for any known implementation.
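
To make the comparison concrete, here is a rough sketch (not from the
patch; the instruction sequence in the comment is what we expect a
typical implementation to emit, not verified output):

/* Sketch only: ORDERED is synthesized as (a == a) && (b == b), i.e. two
   FEQ instructions plus an AND:

       feq.d  t0, fa0, fa0
       feq.d  t1, fa1, fa1
       and    a0, t0, t1

   An FCLASS-based variant would have to mask out the two NaN bits of
   each class result and combine them, so it is not shorter.  */
static int
fp_ordered (double a, double b)
{
  return (a == a) & (b == b);
}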

>
>> +static int
>> +riscv_flatten_aggregate_field (const_tree type,
>> +                              riscv_aggregate_field fields[2],
>> +                              int n, HOST_WIDE_INT offset)
>
>
> I don't see any code within to bound N to 2, so as not to overflow FIELDS.  
> Am I missing something?
>
> Are you missing code for COMPLEX_TYPE?  In the default case I only see 
> SCALAR_FLOAT_TYPE_P.
>
>> +  memset (info, 0, sizeof (*info));
>> +  info->gpr_offset = cum->num_gprs;
>> +  info->fpr_offset = cum->num_fprs;
>
>
> Since GPRs and FPRs are allocated sequentially, and indices that are used 
> for GPRs are unused in FPRs and vice versa, why store both gpr_offset and 
> fpr_offset?

The ISA spec describes an out-of-date calling convention, which will be
removed in the next revision to orthogonalize the ABI from the ISA.  We
are in the process of drafting a RISC-V ELF psABI spec, which the GCC
port targets:
https://github.com/riscv/riscv-elf-psabi-doc/blob/master/riscv-elf.md
In that calling convention, GPRs and FPRs are allocated separately,
which performs better, especially when XLEN and FLEN are not equal.
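
For illustration, a hypothetical prototype showing the separate
allocation (register assignments assume the hard-float variant of the
draft psABI):

/* Hypothetical example, not from the patch: integer and floating-point
   arguments draw from their own register files independently, so an
   argument of one class never consumes a register of the other.  */
extern void callee (double a, int b, double c, int d);
/* Expected assignment:  a -> fa0, b -> a0, c -> fa1, d -> a1.  */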

>
>> +/* Emit straight-line code to move LENGTH bytes from SRC to DEST.
>> +   Assume that the areas do not overlap.  */
>> +
>> +static void
>> +riscv_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length)
>> +{
>> +  HOST_WIDE_INT offset, delta;
>> +  unsigned HOST_WIDE_INT bits;
>> +  int i;
>> +  enum machine_mode mode;
>> +  rtx *regs;
>> +
>> +  bits = MAX (BITS_PER_UNIT,
>> +             MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest))));
>> +
>> +  mode = mode_for_size (bits, MODE_INT, 0);
>> +  delta = bits / BITS_PER_UNIT;
>> +
>> +  /* Allocate a buffer for the temporary registers.  */
>> +  regs = XALLOCAVEC (rtx, length / delta);
>> +
>> +  /* Load as many BITS-sized chunks as possible.  Use a normal load if
>> +     the source has enough alignment, otherwise use left/right pairs.  */
>
>
> The comment sounds like it was copied from MIPS.
>
> Do you actually need to implement anything related to block moves at all? 
> There's nothing processor specific in RISCV wrt moves...
>
>> +   'z' Print $0 if OP is zero, otherwise print OP normally.  */
>
>
> Global replace $0 with x0?  I assume that's a copy from elsewhere.
>
>> +static bool
>> +riscv_size_ok_for_small_data_p (int size)
>> +{
>> +  return g_switch_value && IN_RANGE (size, 1, g_switch_value);
>> +}
>
>
> Huh.  With only a 4k displacement from gp, it's still worthwhile to manage 
> small data?


It is worthwhile for small (embedded) programs, where sometimes the
entire static data section fits within 4 KiB.  For larger programs
there is still an incremental benefit, since linker relaxation uses gp
opportunistically.  The savings usually outweigh the benefit of having
gp free as another temporary register.
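
A small, hypothetical example of the kind of saving we see (the
sequences described in the comment are what we expect after linker
relaxation, not verified output):

/* Hypothetical example, not from the patch: assuming a nonzero
   small-data threshold (g_switch_value above), COUNTER lands in the
   gp-addressed small-data section.  */
int counter;

int
get_counter (void)
{
  /* Without small data: LUI + LW to form the address and load.  With
     small data and linker relaxation: a single LW relative to gp,
     since COUNTER sits within the 4 KiB window around gp.  */
  return counter;
}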

>
>> +  /* Move past any dynamic stack allocations.  */
>> +  if (cfun->calls_alloca)
>> +    {
>> +      rtx adjust = GEN_INT (-frame->hard_frame_pointer_offset);
>> +      if (!SMALL_OPERAND (INTVAL (adjust)))
>> +       {
>> +         riscv_emit_move (RISCV_PROLOGUE_TEMP (Pmode), adjust);
>> +         adjust = RISCV_PROLOGUE_TEMP (Pmode);
>> +       }
>> +
>> +      emit_insn (gen_add3_insn (stack_pointer_rtx, hard_frame_pointer_rtx,
>> +                               adjust));
>> +    }
>
>
> Normally we require some sort of FP/SP tie, and memory barrier, when 
> deallocating portions of the stack frame.  Otherwise scheduling can produce 
> accesses to addresses below the stack pointer.
>
> If you've got alloca, then you can free up lots of stack all at once; you 
> needn't simply reset SP to where it ought to be.  You can free up everything 
> up to HFP itself, since that's below the register saves.
>
>> +static bool
>> +riscv_scalar_mode_supported_p (enum machine_mode mode)
>> +{
>> +  if (ALL_FIXED_POINT_MODE_P (mode)
>> +      && GET_MODE_PRECISION (mode) <= 2 * BITS_PER_WORD)
>> +    return true;
>> +
>> +  return default_scalar_mode_supported_p (mode);
>> +}
>
>
> You really support fixed-point modes?  Not in hardware, certainly, so... why?
>
>> +/* Implement TARGET_TRAMPOLINE_INIT.  */
>> +
>> +static void
>> +riscv_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
>> +{
>> +  rtx addr, end_addr, mem;
>> +  uint32_t trampoline[4];
>> +  unsigned int i;
>> +  HOST_WIDE_INT static_chain_offset, target_function_offset;
>> +
>> +  /* Work out the offsets of the pointers from the start of the
>> +     trampoline code.  */
>> +  gcc_assert (ARRAY_SIZE (trampoline) * 4 == TRAMPOLINE_CODE_SIZE);
>> +  static_chain_offset = TRAMPOLINE_CODE_SIZE;
>> +  target_function_offset = static_chain_offset + GET_MODE_SIZE (ptr_mode);
>> +
>> +  /* Get pointers to the beginning and end of the code block.  */
>> +  addr = force_reg (Pmode, XEXP (m_tramp, 0));
>> +  end_addr = riscv_force_binary (Pmode, PLUS, addr, GEN_INT (TRAMPOLINE_CODE_SIZE));
>> +
>> +  /* auipc   t0, 0
>> +     l[wd]   t1, target_function_offset(t0)
>> +     l[wd]   t0, static_chain_offset(t0)
>> +     jr      t1
>> +  */
>
>
> For rv32 (or any ilp32), surely
>
>         lui     t0, hi(chain)
>         lui     t1, hi(func)
>         addi    t0, t0, lo(chain)
>         jr      r1, lo(func)
>
> is better.
>
>> +#ifdef HAVE_AS_TLS
>> +#undef TARGET_HAVE_TLS
>> +#define TARGET_HAVE_TLS true
>> +#endif
>
>
> When would you not have TLS?  Nor DTPRELWORD?
>
>
> r~
