| Issue |
76049
|
| Summary |
[EarlyCSE] Can not merge consecutive bitfield accesses in case of pointers
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
adamszilagyi
|
When consecutive bitfields are set, the `EarlyCSE` pass is able to identify and then remove the load/store pairs which access the individual bitfields one by one, then merge the `and` instructions which set the bitfield values. However, if the bitfields are accessed through a pointer, the optimization fails to remove the redundant load/store pairs.
Take a look at the following example:
```.C
struct bits {
unsigned b0 : 1;
unsigned b1 : 1;
unsigned b2 : 2;
unsigned b4 : 2;
unsigned b6 : 1;
unsigned b7 : 1;
};
struct bits b;
void clear_bits(void) {
b.b0 =
b.b1 =
b.b2 =
b.b4 =
b.b6 =
b.b7 = 0;
}
```
For this code the IR before the `EarlyCSE` pass is the following:
```
define dso_local arm_aapcscc void @clear_bits() #0 {
entry:
%bf.load = load i8, ptr @b, align 4
%bf.clear = and i8 %bf.load, 127
%bf.set = or i8 %bf.clear, 0
store i8 %bf.set, ptr @b, align 4
%bf.load1 = load i8, ptr @b, align 4
%bf.clear2 = and i8 %bf.load1, -65
%bf.set3 = or i8 %bf.clear2, 0
store i8 %bf.set3, ptr @b, align 4
%bf.load4 = load i8, ptr @b, align 4
%bf.clear5 = and i8 %bf.load4, -49
%bf.set6 = or i8 %bf.clear5, 0
store i8 %bf.set6, ptr @b, align 4
%bf.load7 = load i8, ptr @b, align 4
%bf.clear8 = and i8 %bf.load7, -13
%bf.set9 = or i8 %bf.clear8, 0
store i8 %bf.set9, ptr @b, align 4
%bf.load10 = load i8, ptr @b, align 4
%bf.clear11 = and i8 %bf.load10, -3
%bf.set12 = or i8 %bf.clear11, 0
store i8 %bf.set12, ptr @b, align 4
%bf.load13 = load i8, ptr @b, align 4
%bf.clear14 = and i8 %bf.load13, -2
%bf.set15 = or i8 %bf.clear14, 0
store i8 %bf.set15, ptr @b, align 4
ret void
}
```
Here, the `load` always gets the value from `b`, then the `and`/`or` sets an individual bitfield, and the `store` saves the result back to `b`. But if I do the same thing, just using a pointer to the `bits` struct, the IR will be the following (with the C code):
```.C
struct bits {
unsigned b0 : 1;
unsigned b1 : 1;
unsigned b2 : 2;
unsigned b4 : 2;
unsigned b6 : 1;
unsigned b7 : 1;
};
struct bits *b;
void clear_bits(void) {
b->b0 =
b->b1 =
b->b2 =
b->b4 =
b->b6 =
b->b7 = 0;
}
```
```
define dso_local arm_aapcscc void @clear_bits() #0 {
entry:
%0 = load ptr, ptr @b, align 4, !tbaa !5
%bf.load = load i8, ptr %0, align 4
%bf.clear = and i8 %bf.load, 127
%bf.set = or i8 %bf.clear, 0
store i8 %bf.set, ptr %0, align 4
%1 = load ptr, ptr @b, align 4, !tbaa !5
%bf.load1 = load i8, ptr %1, align 4
%bf.clear2 = and i8 %bf.load1, -65
%bf.set3 = or i8 %bf.clear2, 0
store i8 %bf.set3, ptr %1, align 4
%2 = load ptr, ptr @b, align 4, !tbaa !5
%bf.load4 = load i8, ptr %2, align 4
%bf.clear5 = and i8 %bf.load4, -49
%bf.set6 = or i8 %bf.clear5, 0
store i8 %bf.set6, ptr %2, align 4
%3 = load ptr, ptr @b, align 4, !tbaa !5
%bf.load7 = load i8, ptr %3, align 4
%bf.clear8 = and i8 %bf.load7, -13
%bf.set9 = or i8 %bf.clear8, 0
store i8 %bf.set9, ptr %3, align 4
%4 = load ptr, ptr @b, align 4, !tbaa !5
%bf.load10 = load i8, ptr %4, align 4
%bf.clear11 = and i8 %bf.load10, -3
%bf.set12 = or i8 %bf.clear11, 0
store i8 %bf.set12, ptr %4, align 4
%5 = load ptr, ptr @b, align 4, !tbaa !5
%bf.load13 = load i8, ptr %5, align 4
%bf.clear14 = and i8 %bf.load13, -2
%bf.set15 = or i8 %bf.clear14, 0
store i8 %bf.set15, ptr %5, align 4
ret void
}
```
Here each first `load` gets the address stored in `b`, then the second `load` reads the value from that address; the rest is the same. In these cases the `EarlyCSE` pass fails to recognize that the intermediate value `load`s can be removed along with the `store`s, so the result is this quite large machine code (I tested it for different architectures; this example is AArch64):
```
clear_bits: // @clear_bits
adrp x8, b
ldr x9, [x8, :lo12:b]
ldrb w10, [x9]
and w10, w10, #0xffffffdf
strb w10, [x9]
ldr x9, [x8, :lo12:b]
ldrb w10, [x9]
and w10, w10, #0xffffffef
strb w10, [x9]
ldr x9, [x8, :lo12:b]
ldrb w10, [x9]
and w10, w10, #0xfffffff7
strb w10, [x9]
ldr x9, [x8, :lo12:b]
ldrb w10, [x9]
and w10, w10, #0xfffffffb
strb w10, [x9]
ldr x9, [x8, :lo12:b]
ldrb w10, [x9]
and w10, w10, #0xfffffffd
strb w10, [x9]
ldr x8, [x8, :lo12:b]
ldrb w9, [x8]
and w9, w9, #0xfe
strb w9, [x8]
ret
```
Instead of what GCC does, merging the consecutive bitfield sets:
```
clear_bits:
adrp x0, .LANCHOR0
ldr x1, [x0, #:lo12:.LANCHOR0]
ldrb w0, [x1]
and w0, w0, -64
strb w0, [x1]
ret
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs