https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112935

            Bug ID: 112935
           Summary: [14 Regression] Performance regression in Coremarks
                    crcu8 function
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: xry111 at gcc dot gnu.org
  Target Milestone: ---

/* Fixed-width unsigned types built from the compiler's predefined
   __UINT8_TYPE__ / __UINT16_TYPE__ macros, so this reproducer needs no
   #include. */
typedef __UINT8_TYPE__ ee_u8;
typedef __UINT16_TYPE__ ee_u16;

/* Fold one byte into a running 16-bit CRC, one bit per iteration, starting
   with the least significant bit of DATA.

   data: the byte to fold in (consumed by right shifts).
   crc:  the CRC value accumulated so far.

   Returns the updated CRC.

   Per bit, the net effect is crc = (crc >> 1) ^ (x16 ? 0xA001 : 0):
   0x4002 shifted right by one is 0x2001, and OR-ing 0x8000 into the
   just-shifted value supplies the top bit.

   NOTE(review): this per-bit conditional XOR appears to be what the
   assembly listings in this report lower to either a mask
   (sub.w + and) or a multiply (mul.w) -- confirm against the tree dumps. */
ee_u16 crcu8(ee_u8 data, ee_u16 crc) {
  ee_u8 i = 0, x16 = 0, carry = 0;

  for (i = 0; i < 8; i++) {
    /* x16 is 1 when the low bit of data and the low bit of crc differ.  */
    x16 = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1));
    data >>= 1;

    /* When the bits differ, pre-XOR the taps (they land on bits 13 and 0
       after the shift below) and remember to set the top bit afterwards.  */
    if (x16 == 1) {
      crc ^= 0x4002;
      carry = 1;
    } else
      carry = 0;
    crc >>= 1;
    /* After the shift bit 15 is already 0, so the else branch leaves crc
       unchanged; only the carry case alters the value here.  */
    if (carry)
      crc |= 0x8000;
    else
      crc &= 0x7fff;
  }
  return crc;
}

With GCC 13.2.0 -O2, on LoongArch we get:

.L2:
        xor     $r12,$r4,$r14
        andi    $r12,$r12,1
        sub.w   $r12,$r0,$r12
        srli.w  $r4,$r4,1
        and     $r12,$r12,$r15
        addi.w  $r13,$r13,-1
        xor     $r12,$r12,$r4
        bstrpick.w      $r13,$r13,7,0
        srli.d  $r14,$r14,1
        bstrpick.w      $r4,$r12,15,0
        bnez    $r13,.L2

With GCC 14.0.0 -O2:

.L2:
        xor     $r12,$r4,$r14
        andi    $r12,$r12,1
        mul.w   $r12,$r12,$r15
        srli.w  $r4,$r4,1
        addi.w  $r13,$r13,-1
        bstrpick.w      $r13,$r13,7,0
        srli.d  $r14,$r14,1
        xor     $r12,$r12,$r4
        bstrpick.w      $r4,$r12,15,0
        bnez    $r13,.L2

The single mul.w is slower than the "sub.w" + "and" pair it replaces.

For now I'm setting the component to tree-optimization because the difference
already exists in 254t.optimized vs 263t.optimized.  But maybe the tree
optimizer is doing things correctly and we should just add a target-specific
optimization.

Reply via email to