https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124020

            Bug ID: 124020
           Summary: Improve bitmanip recognition for riscv
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: law at gcc dot gnu.org
  Target Milestone: ---

This is from 500.perlbench.  Consider this code on rv64gcbv_zicond:

/*
 * Reduced SHA state used by the test case: a byte buffer and a bit
 * position into that buffer.
 */
typedef struct SHA {
        unsigned char block[128];       /* buffer addressed bit-wise via SETBIT/CLRBIT */
        unsigned int blockcnt;          /* bit position within block[] that shabits() writes */

} SHA;

/*
 * Bit-addressing helpers for a byte array `s`.  Bit `pos` lives in byte
 * s[pos >> 3]; within a byte the bits are numbered MSB-first, i.e.
 * pos % 8 == 0 selects mask 0x80 and pos % 8 == 7 selects mask 0x01.
 *
 * Each macro evaluates `pos` twice, and neither `s` nor the expansion as a
 * whole is parenthesized, so they are only safe with simple arguments and
 * in contexts like the ones used below.
 */
/* Nonzero iff bit `pos` of `s` is set. */
#define BITSET(s, pos)    s[(pos) >> 3] &  (char)  (0x01 << (7 - (pos) % 8))
/* Set bit `pos` of `s`. */
#define SETBIT(s, pos)    s[(pos) >> 3] |= (char)  (0x01 << (7 - (pos) % 8))
/* Clear bit `pos` of `s`. */
#define CLRBIT(s, pos)    s[(pos) >> 3] &= (char) ~(0x01 << (7 - (pos) % 8))

/* Shorthand for unsigned long. */
#define ULNG        unsigned long

/*
 * For each of the first `bitcnt` bits of `bitstr`, set the bit at position
 * s->blockcnt in s->block if the source bit is set, and clear it otherwise.
 *
 * bitstr: source bit string, read MSB-first within each byte (see BITSET).
 * bitcnt: number of source bits to examine.
 * s:      state whose block[] is modified in place.
 *
 * Nothing in this function changes s->blockcnt, so every iteration targets
 * the same destination bit.
 */
void shabits(char *bitstr, long bitcnt, SHA *s)
{
        ULNG i;
        /* `i` is unsigned long and `bitcnt` is long, so the comparison is
           done in unsigned long after converting bitcnt. */
        for (i = 0UL; i < bitcnt; i++) {
                if (BITSET(bitstr, i))
                        SETBIT(s->block, s->blockcnt);
                else
                        CLRBIT(s->block, s->blockcnt);
        }
}

We get this code:

        beq     a1,zero,.L1     # 9     [c=16 l=4]  *branchdi
        lw      a7,128(a2)              # 12    [c=28 l=4]  *extendsidi2_internal/1
        li      a4,7            # 119   [c=4 l=4]  *movdi_64bit/1
        li      t1,1            # 21    [c=4 l=4]  *movsi_internal/1
        andn    a4,a4,a7        # 120   [c=4 l=4]  and_notdi3
        srliw   a5,a7,3 # 13    [c=4 l=4]  *lshrsi3_zero_extend_1
        bset    a7,x0,a4        # 121   [c=8 l=4]  *bsetdi_2
        li      a3,0            # 6     [c=4 l=4]  *movdi_64bit/1
        add     a2,a2,a5        # 75    [c=4 l=4]  *adddi3/0
        andi    a7,a7,0xff      # 24    [c=4 l=4]  *zero_extendqidi2_internal/0
.L5:
        srli    a5,a3,3 # 39    [c=4 l=4]  lshrdi3
        add     a5,a0,a5        # 40    [c=4 l=4]  *adddi3/0
        lbu     a6,0(a5)        # 42    [c=28 l=4]  *zero_extendqidi2_internal/1
        not     a4,a3   # 32    [c=4 l=4]  one_cmpldi2
        lbu     a5,0(a2)        # 30    [c=28 l=4]  *zero_extendqidi2_internal/1
        andi    a4,a4,7 # 34    [c=4 l=4]  *anddi3/1
        sllw    a4,t1,a4        # 37    [c=8 l=4]  ashlsi3_extend
        and     a4,a4,a6        # 46    [c=4 l=4]  *anddi3/0
        or      a6,a5,a7        # 101   [c=4 l=4]  *iordi3/0
        andn    a5,a5,a7        # 56    [c=4 l=4]  and_notdi3
        czero.eqz       a6,a6,a4        # 102   [c=4 l=4]  *czero.eqz.didi
        czero.nez       a5,a5,a4        # 103   [c=4 l=4]  *czero.nez.didi
        add     a5,a5,a6        # 104   [c=4 l=4]  *adddi3/0
        addi    a3,a3,1 # 62    [c=4 l=4]  *adddi3/1
        sb      a5,0(a2)        # 61    [c=4 l=4]  *movqi_internal/3
        bne     a3,a1,.L5       # 64    [c=16 l=4]  *branchdi

So on a positive note, at some point we started if-converting the branch inside
the loop.  But there's all kinds of goodies still in here.

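First, the andi at insn 24 is redundant: the bset at insn 121 sets a single
bit whose index (a4) was already masked to 0..7 by insn 120, so the result
always fits in 8 bits.  We can't really clean it up in combine because of how
we're representing andn as a define_insn_and_split.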

In the loop, note the OR with a7 as a source; that's a bset.  Likewise, the
ANDN with a7 should be a bclr, at which point we would no longer need the bset
before the loop.

Note the sllw+and at 37+46.  Given how it's used, it really should just be a
bext.

Anyway, cleaning these things up in RTL may be painful.  They don't look
particularly easy to tackle in gimple either.  Mostly getting this one
recorded for the long term, as I don't see a good path forward.

Reply via email to