https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123848

            Bug ID: 123848
           Summary: gcc excessive unrolling and unnecessary codegen vs.
                    llvm
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vineet.gupta at linux dot dev
  Target Milestone: ---

Created attachment 63506
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=63506&action=edit
test case

The test case has been reduced from the BPF kernel selftests. I don't know
whether it veers into undefined behavior, but the original problem was seen
with the gcc BPF backend and seems to manifest with riscv too; the issue
seems to start in the first gimple pass itself.

gcc generates literal code for checking a struct member and then doing the
recursive call, whereas llvm elides much of it and generates just the
recursive call.

https://godbolt.org/z/639sYT9jT

I don't know whether llvm eliding the conditions is itself correct, but
unrolling the basic block 9 times seems weird.

```
/*
 * Type declarations for the reduced test case.  The code is kept exactly as
 * reduced; identifiers are leftovers of the reduction and are not a reliable
 * guide to what a member is used for.
 *
 * Several struct bodies contain tag-only lines with no member name (for
 * example `struct in6_addr;`).
 * NOTE(review): whether such a line contributes a member depends on the
 * compiler extensions in effect (e.g. -fms-extensions) -- confirm against the
 * flags used to reproduce.
 */

/* atomic_t is a typedef for an unnamed enum whose only enumerator is
   TCP_CA_Recovery. */
enum { TCP_CA_Recovery } typedef atomic_t;
struct in6_addr {
  char u6_addr8[16];
};
struct hlist_node {
  struct hlist_node *next;
  struct hlist_node *pprev;
};
struct timer_list {
  struct hlist_node flags_r_c[14];
};
struct sock_common {
  volatile int in6_addr;
};
struct sock {
  struct sock_common;
  void *sk_policy2;
  void *sk_backlog_rcv;
  void *sk_rcuxarray;
};
struct fastopen_queue {
  char nhc_flags;
  int *nhc_lwtstate;
  struct in6_addr;
  atomic_t nhc_upper_bound;
  int *__attribute__nhc_pcpu_rth_output;
  int __attribute__nhc_rth_input;
  int *nhc_exceptions;
};
struct flowi_tunnel {
  long tun_id;
};
struct flowi_common {
  int flowic_oif;
  int flowic_iif;
  int flowic_l3mdev;
  struct flowi_tunnel;
};
struct flowi_uli {
  char type;
};
struct flowi6 {
  struct flowi_common;
  struct in6_addr;
  struct in6_addr saddr;
  int flowlabel;
  struct flowi_uli;
  int mp_hash;
};
struct flowi {
  struct flowi6;
  int : 1;
};
struct inet_cork {
  int addr;
  int *opt;
  int length;
  int *dst;
  int priority;
  short gso_size;
  int ts_opt_id;
  long transmit_time;
  int mark;
};
struct inet_cork_full {
  struct inet_cork;
  struct flowi;
};
struct inet_sock {
  struct sock;
  int pinet6;
  int *ipv6_fl_list;
  long inet_flags;
  short inet_sport;
  int *inet_opt;
  atomic_t inet_id;
  char min_ttl;
  char mc_ttl;
  char pmtudisc;
  char rcv_tos;
  char convert_csum;
  int uc_index;
  int mc_index;
  int mc_addr;
  int local_port_range;
  int *mc_list;
  struct inet_cork_full;
};
struct request_sock_queue {
  char synflood_warned;
  atomic_t qlen;
  atomic_t young;
  int *rskq_accept_head;
  int rskq_accept_tail;
  struct fastopen_queue;
};
/*
 * inet_csk() is declared to return a pointer to the unnamed struct defined
 * inline below.  The only member the function body reads is the 5-bit
 * bit-field icsk_ca_state.
 */
struct {
  struct inet_sock icsk_inet;
  struct request_sock_queue;
  int icsk_bind_hash;
  int *icsk_bind2_hash;
  struct timer_list icsk_delack_timer;
  struct timer_list;
  int icsk_rto;
  int icsk_rto_min;
  int icsk_rto_max;
  int icsk_delack_max;
  int icsk_pmtu_cookie;
  int *icsk_ca_ops;
  int icsk_af_ops;
  int *icsk_ulp_ops;
  void *icsk_ulp_data;
  int *icsk_sync_mss;
  char icsk_ca_state : 5;
  int lrcvtime;
  short rcv_mss;
  int : 20;
  int retry;
  int search_high;
  int search_low;
  int : 1;
  int probe_timestamp;
  int icsk_probes_tstamp;
  int icsk_user_timeout;
  long icsk_ca_priv[13];
} * inet_csk() {
  /*
   * Evaluating the condition requires a call to inet_csk() itself, so every
   * invocation recurses before the bit-field is loaded.  A second recursive
   * call is made when the loaded icsk_ca_state is non-zero.
   *
   * The function has no return statement on any path even though it is
   * declared to return a pointer, and the result of the call in the
   * condition is dereferenced.
   * NOTE(review): whether this puts the test case into undefined behavior is
   * left open by the report -- confirm before relying on either compiler's
   * output as "correct".
   */
  if (inet_csk()->icsk_ca_state)
    inet_csk();
}

```

clang -O2

inet_csk:
        addi    sp, sp, -16
        sd      ra, 8(sp)
        call    inet_csk

gcc -O2 (BPF)

inet_csk:
        call    inet_csk
        r6 = *(u8 *) (r0+384)
        r6 &= 31
        if r6 != 0 goto .L185
        r0 = *(u8 *) (r6+384)
        w0 &= 31
        if w0 != 0 goto .L186
        r1 = 0
        r2 = *(u8 *) (r1+384)
        w2 &= 31
        if w2 != 0 goto .L9
        r3 = 0
        r4 = *(u8 *) (r3+384)
        w4 &= 31
        if w4 != 0 goto .L13
        r5 = 0
        r9 = *(u8 *) (r5+384)
        w9 &= 31
        if w9 == 0 goto .L15
.L16:
        r9 = 0
        r0 = *(u8 *) (r9+384)
        w0 &= 31
        if w0 != 0 goto .L187
.L17:
        r1 = 0
        r2 = *(u8 *) (r1+384)
        w2 &= 31
        if w2 != 0 goto .L188
.L18:
        r3 = 0
        r4 = *(u8 *) (r3+384)
        w4 &= 31
        if w4 == 0 goto .L15
        call    inet_csk

gcc -O2 (RISC-V)

inet_csk:
        addi    sp,sp,-16
        sd      ra,8(sp)
.L2:
        call    inet_csk
        lbu     a5,384(a0)
        andi    a5,a5,31
        bne     a5,zero,.L2
        lbu     a5,384(zero)
        andi    a5,a5,31
        bne     a5,zero,.L2
        lbu     a5,384(zero)
        andi    a5,a5,31
        bne     a5,zero,.L2
        lbu     a5,384(zero)
        andi    a5,a5,31
        bne     a5,zero,.L2
        lbu     a5,384(zero)
        andi    a5,a5,31
        bne     a5,zero,.L2
        lbu     a5,384(zero)
        andi    a5,a5,31
        bne     a5,zero,.L2
        lbu     a5,384(zero)
        andi    a5,a5,31
        bne     a5,zero,.L2
        lbu     a5,384(zero)
        andi    a5,a5,31
        bne     a5,zero,.L2
        lbu     a5,384(zero)
        andi    a5,a5,31
        bne     a5,zero,.L2
        ld      ra,8(sp)
        addi    sp,sp,16
        jr      ra

The gimple seems to be OK:

struct 
{
  struct inet_sock icsk_inet;
  struct request_sock_queue D.2024;
...
...
} * inet_csk ()
{
  _1 = inet_csk ();
  _2 = BIT_FIELD_REF <*_1, 8, 7040>;
  _3 = _2 & 31;
  if (_3 != 0) goto <D.2057>; else goto <D.2058>;
  <D.2057>:
  inet_csk ();
  <D.2058>:
}

The tree SSA pass duplicates the struct, and fixup_cfg3 generates 119
instances on BPF, while on riscv it generates 26.

Reply via email to