Issue 76049
Summary [EarlyCSE] Can not merge consecutive bitfield accesses in case of pointers
Labels new issue
Assignees
Reporter adamszilagyi
    When consecutive bitfields are set, the `EarlyCSE` pass is able to identify and then remove the load/store pairs which access the individual bitfields one by one, then merge the `and` instructions which set the bitfield values. However, if the bitfields are accessed through a pointer, the optimization fails to remove the redundant load/store pairs.

Take a look at the following example:

```.C
struct bits {
  unsigned b0 : 1;
  unsigned b1 : 1;
  unsigned b2 : 2;
  unsigned b4 : 2;
  unsigned b6 : 1;
  unsigned b7 : 1;
};

struct bits b;

void clear_bits(void) { 
  b.b0 = 
  b.b1 = 
  b.b2 = 
  b.b4 = 
  b.b6 = 
  b.b7 = 0;
}
```

For this code the IR before the `EarlyCSE` pass is the following:

```
define dso_local arm_aapcscc void @clear_bits() #0 {
entry:
  %bf.load = load i8, ptr @b, align 4
  %bf.clear = and i8 %bf.load, 127
  %bf.set = or i8 %bf.clear, 0
  store i8 %bf.set, ptr @b, align 4
  %bf.load1 = load i8, ptr @b, align 4
  %bf.clear2 = and i8 %bf.load1, -65
  %bf.set3 = or i8 %bf.clear2, 0
  store i8 %bf.set3, ptr @b, align 4
  %bf.load4 = load i8, ptr @b, align 4
  %bf.clear5 = and i8 %bf.load4, -49
  %bf.set6 = or i8 %bf.clear5, 0
  store i8 %bf.set6, ptr @b, align 4
  %bf.load7 = load i8, ptr @b, align 4
  %bf.clear8 = and i8 %bf.load7, -13
  %bf.set9 = or i8 %bf.clear8, 0
  store i8 %bf.set9, ptr @b, align 4
  %bf.load10 = load i8, ptr @b, align 4
  %bf.clear11 = and i8 %bf.load10, -3
  %bf.set12 = or i8 %bf.clear11, 0
  store i8 %bf.set12, ptr @b, align 4
  %bf.load13 = load i8, ptr @b, align 4
  %bf.clear14 = and i8 %bf.load13, -2
  %bf.set15 = or i8 %bf.clear14, 0
  store i8 %bf.set15, ptr @b, align 4
  ret void
}
```

Here, the `load` always gets the value from `b`, then the `and`/`or` sets an individual bitfield, and the `store` saves the result back to `b`. But if I do the same thing, just using a pointer to the `bits` struct, the IR will be the following (with the C code):

```.C
struct bits {
  unsigned b0 : 1;
  unsigned b1 : 1;
  unsigned b2 : 2;
  unsigned b4 : 2;
  unsigned b6 : 1;
  unsigned b7 : 1;
};

struct bits *b;

void clear_bits(void) { 
  b->b0 = 
  b->b1 = 
  b->b2 = 
  b->b4 = 
  b->b6 = 
  b->b7 = 0;
}
```

```
define dso_local arm_aapcscc void @clear_bits() #0 {
entry:
  %0 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load = load i8, ptr %0, align 4
  %bf.clear = and i8 %bf.load, 127
  %bf.set = or i8 %bf.clear, 0
  store i8 %bf.set, ptr %0, align 4
  %1 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load1 = load i8, ptr %1, align 4
  %bf.clear2 = and i8 %bf.load1, -65
  %bf.set3 = or i8 %bf.clear2, 0
  store i8 %bf.set3, ptr %1, align 4
  %2 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load4 = load i8, ptr %2, align 4
  %bf.clear5 = and i8 %bf.load4, -49
  %bf.set6 = or i8 %bf.clear5, 0
  store i8 %bf.set6, ptr %2, align 4
  %3 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load7 = load i8, ptr %3, align 4
  %bf.clear8 = and i8 %bf.load7, -13
  %bf.set9 = or i8 %bf.clear8, 0
  store i8 %bf.set9, ptr %3, align 4
  %4 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load10 = load i8, ptr %4, align 4
  %bf.clear11 = and i8 %bf.load10, -3
  %bf.set12 = or i8 %bf.clear11, 0
  store i8 %bf.set12, ptr %4, align 4
  %5 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load13 = load i8, ptr %5, align 4
  %bf.clear14 = and i8 %bf.load13, -2
  %bf.set15 = or i8 %bf.clear14, 0
  store i8 %bf.set15, ptr %5, align 4
  ret void
}
```

Here, each first `load` gets the address of `b`, then the second `load` reads the value from that address; the rest is the same. In these cases the `EarlyCSE` pass fails to recognize that the intermediate value `load`s can be removed along with the `store`s, so the result is this quite big machine code (I tested it for different architectures; this example is ARM):

```
clear_bits: // @clear_bits
        adrp    x8, b
        ldr     x9, [x8, :lo12:b]
        ldrb    w10, [x9]
        and     w10, w10, #0xffffffdf
        strb    w10, [x9]
        ldr     x9, [x8, :lo12:b]
        ldrb    w10, [x9]
        and     w10, w10, #0xffffffef
        strb    w10, [x9]
        ldr     x9, [x8, :lo12:b]
        ldrb    w10, [x9]
        and     w10, w10, #0xfffffff7
        strb    w10, [x9]
        ldr     x9, [x8, :lo12:b]
        ldrb    w10, [x9]
        and     w10, w10, #0xfffffffb
        strb    w10, [x9]
        ldr     x9, [x8, :lo12:b]
        ldrb    w10, [x9]
        and     w10, w10, #0xfffffffd
        strb    w10, [x9]
        ldr     x8, [x8, :lo12:b]
        ldrb    w9, [x8]
        and     w9, w9, #0xfe
        strb    w9, [x8]
        ret
```

Instead of what GCC does, merging the consecutive bitfield sets:

```
clear_bits:
        adrp    x0, .LANCHOR0
        ldr     x1, [x0, #:lo12:.LANCHOR0]
        ldrb    w0, [x1]
        and     w0, w0, -64
        strb    w0, [x1]
        ret
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to